linux-reiser4.patch

   1 diff -rupN linux-2.6.8-rc3/fs/reiser4/Makefile linux-2.6.8-rc3-a/fs/reiser4/Makefile
   2 --- linux-2.6.8-rc3/fs/reiser4/Makefile 1970-01-01 03:00:00.000000000 +0300
   3 +++ linux-2.6.8-rc3-a/fs/reiser4/Makefile       2004-08-05 21:20:53.186638708 +0400
   4 @@ -0,0 +1,166 @@
   5 +#
   6 +# reiser4/Makefile
   7 +#
   8 +
   9 +obj-$(CONFIG_REISER4_FS) += reiser4.o
  10 +
  11 +EXTRA_CFLAGS += \
  12 +           -Wformat \
  13 +              -Wundef \
  14 +           -Wunused \
  15 +              -Wcomment \
  16 +           \
  17 +              -Wno-nested-externs \
  18 +              -Wno-write-strings \
  19 +              -Wno-sign-compare
  20 +
  21 +#             -Wpointer-arith \
  22 +#             -Wlarger-than-16384 \
  23 +#             -Winline \
  24 +
   25 +ifeq ($(CONFIG_REISER4_NOOPT),y)
   26 +       EXTRA_CFLAGS += -O0 -fno-inline
   27 +else
   28 +# -Wuninitialized is only supported when optimization is on, so it is not added to the -O0 build above.
   29 +       EXTRA_CFLAGS += \
   30 +           -Wuninitialized
   31 +endif
  32 +
  33 +ifeq ($(CONFIG_REISER4_ALL_IN_ONE),y)
  34 +
  35 +reiser4-objs := all-reiser4.o
  36 +
  37 +else
  38 +
  39 +reiser4-objs := \
  40 +                  debug.o \
  41 +                  stats.o \
  42 +                  jnode.o \
  43 +                  znode.o \
  44 +                  key.o \
  45 +                  pool.o \
  46 +                  tree_mod.o \
  47 +                  estimate.o \
  48 +                  carry.o \
  49 +                  carry_ops.o \
  50 +                  lock.o \
  51 +                  tree.o \
  52 +                  context.o \
  53 +                  tap.o \
  54 +                  coord.o \
  55 +                  block_alloc.o \
  56 +                  txnmgr.o \
  57 +                  kassign.o \
  58 +                  flush.o \
  59 +                  wander.o \
  60 +                  eottl.o \
  61 +                  search.o \
  62 +                  page_cache.o \
  63 +                  lnode.o \
  64 +                  kcond.o \
  65 +                  seal.o \
  66 +                  dscale.o \
  67 +                  log.o \
  68 +                  flush_queue.o \
  69 +                  ktxnmgrd.o \
  70 +                  kattr.o \
  71 +                  blocknrset.o \
  72 +                  super.o \
  73 +                  oid.o \
  74 +                  tree_walk.o \
  75 +                  inode.o \
  76 +                  vfs_ops.o \
  77 +                  inode_ops.o \
  78 +                  file_ops.o \
  79 +                  as_ops.o \
  80 +                  emergency_flush.o \
  81 +                  spinprof.o\
  82 +                  entd.o\
  83 +                  readahead.o \
  84 +                  crypt.o \
  85 +                  diskmap.o \
  86 +                  prof.o \
  87 +                  repacker.o \
  88 +                  status_flags.o \
  89 +                  init_super.o \
  90 +                  safe_link.o \
  91 +           \
  92 +                  plugin/plugin.o \
  93 +                  plugin/plugin_set.o \
  94 +                  plugin/node/node.o \
  95 +                  plugin/object.o \
  96 +                  plugin/symlink.o \
  97 +                  plugin/cryptcompress.o \
  98 +                  plugin/digest.o \
  99 +                  plugin/node/node40.o \
 100 +           \
 101 +                  plugin/compress/minilzo.o \
 102 +                  plugin/compress/compress.o \
 103 +           \
 104 +                  plugin/item/static_stat.o \
 105 +                  plugin/item/sde.o \
 106 +                  plugin/item/cde.o \
 107 +                  plugin/item/blackbox.o \
 108 +                  plugin/item/internal.o \
 109 +                  plugin/item/tail.o \
 110 +                  plugin/item/ctail.o \
 111 +                  plugin/item/extent.o \
 112 +                  plugin/item/extent_item_ops.o \
 113 +                  plugin/item/extent_file_ops.o \
 114 +                  plugin/item/extent_flush_ops.o \
 115 +                  plugin/item/extent_repack_ops.o \
 116 +           \
 117 +                  plugin/hash.o \
 118 +                  plugin/fibration.o \
 119 +                  plugin/tail_policy.o \
 120 +                  plugin/item/item.o \
 121 +           \
 122 +                  plugin/dir/hashed_dir.o \
 123 +                  plugin/dir/pseudo_dir.o \
 124 +                  plugin/dir/dir.o \
 125 +           \
 126 +                  plugin/security/perm.o \
 127 +           \
 128 +                  plugin/pseudo/pseudo.o \
 129 +           \
 130 +                  plugin/space/bitmap.o \
 131 +           \
 132 +                  plugin/disk_format/disk_format40.o \
 133 +                  plugin/disk_format/disk_format.o \
 134 +           \
 135 +                  plugin/file/pseudo.o \
 136 +                  plugin/file/file.o \
 137 +                  plugin/file/tail_conversion.o
 138 +
 139 +reiser4-objs += sys_reiser4.o
 140 +
 141 +ifeq ($(CONFIG_REISER4_FS_SYSCALL),y)
 142 +
 143 +  ifeq ($(CONFIG_REISER4_FS_SYSCALL_YACC),y)
 144 +
 145 +      YFLAGS= -d -v -r -b $(obj)/parser/parser
 146 +
 147 +   $(obj)/parser/parser.code.c: $(obj)/parser/parser.y
 148 +
 149 +       $(YACC) $(YFLAGS) $(obj)/parser/parser.y
 150 +
 151 +  endif
 152 +
 153 +  sys_reiser4.o: $/sys_reiser4.c       \
 154 +                 $/parser/parser.code.c \
 155 +                 $/parser/parser.tab.c \
 156 +                 $/parser/parser.tab.h \
 157 +                 $/parser/lib.c        \
 158 +                 $/parser/pars.cls.h   \
 159 +                 $/parser/pars.yacc.h  \
 160 +                 $/parser/parser.h
 161 +
 162 +
 163 +#      $(MAKE)  $(obj)/parser/parser
 164 +#clean-files := parser/parser.code.c
 165 +##clean-rule =@$(MAKE) -C $/parser clean
 166 +#clean-rule =@$(MAKE) $(obj)/parser/parser.code.c
 167 +endif
 168 +
 169 +endif
 170 +
 171 diff -rupN linux-2.6.8-rc3/fs/reiser4/README linux-2.6.8-rc3-a/fs/reiser4/README
 172 --- linux-2.6.8-rc3/fs/reiser4/README   1970-01-01 03:00:00.000000000 +0300
 173 +++ linux-2.6.8-rc3-a/fs/reiser4/README 2004-08-05 21:20:52.763727911 +0400
 174 @@ -0,0 +1,125 @@
 175 +[LICENSING]
 176 +
 177 +Reiser4 is hereby licensed under the GNU General
 178 +Public License version 2.
 179 +
 180 +Source code files that contain the phrase "licensing governed by
 181 +reiser4/README" are "governed files" throughout this file.  Governed
 182 +files are licensed under the GPL.  The portions of them owned by Hans
 183 +Reiser, or authorized to be licensed by him, have been in the past,
 184 +and likely will be in the future, licensed to other parties under
 185 +other licenses.  If you add your code to governed files, and don't
 186 +want it to be owned by Hans Reiser, put your copyright label on that
 187 +code so the poor blight and his customers can keep things straight.
 188 +All portions of governed files not labeled otherwise are owned by Hans
 189 +Reiser, and by adding your code to it, widely distributing it to
 190 +others or sending us a patch, and leaving the sentence in stating that
 191 +licensing is governed by the statement in this file, you accept this.
 192 +It will be a kindness if you identify whether Hans Reiser is allowed
 193 +to license code labeled as owned by you on your behalf other than
 194 +under the GPL, because he wants to know if it is okay to do so and put
 195 +a check in the mail to you (for non-trivial improvements) when he
 196 +makes his next sale.  He makes no guarantees as to the amount if any,
 197 +though he feels motivated to motivate contributors, and you can surely
 198 +discuss this with him before or after contributing.  You have the
 199 +right to decline to allow him to license your code contribution other
 200 +than under the GPL.
 201 +
 202 +Further licensing options are available for commercial and/or other
 203 +interests directly from Hans Reiser: reiser@namesys.com.  If you interpret
 204 +the GPL as not allowing those additional licensing options, you read
 205 +it wrongly, and Richard Stallman agrees with me, when carefully read
 206 +you can see that those restrictions on additional terms do not apply
 207 +to the owner of the copyright, and my interpretation of this shall
 208 +govern for this license.
 209 +
 210 +[END LICENSING]
 211 +
 212 +Reiser4 is a file system based on dancing tree algorithms, and is
 213 +described at http://www.namesys.com
 214 +
 215 +mkfs.reiser4 and other utilities are on our webpage or wherever your
 216 +Linux provider put them.  You really want to be running the latest
 217 +version off the website if you use fsck.
 218 +
 219 +Yes, if you update your reiser4 kernel module you do have to
 220 +recompile your kernel, most of the time.  The errors you get will be
  221 +quite cryptic if you forget to do so.
 222 +
 223 +Hideous Commercial Pitch: Spread your development costs across other OS
 224 +vendors.  Select from the best in the world, not the best in your
 225 +building, by buying from third party OS component suppliers.  Leverage
 226 +the software component development power of the internet.  Be the most
 227 +aggressive in taking advantage of the commercial possibilities of
 228 +decentralized internet development, and add value through your branded
 229 +integration that you sell as an operating system.  Let your competitors
 230 +be the ones to compete against the entire internet by themselves.  Be
 231 +hip, get with the new economic trend, before your competitors do.  Send
 232 +email to reiser@namesys.com
 233 +
 234 +Hans Reiser was the primary architect of Reiser4, but a whole team
 235 +chipped their ideas in.  He invested everything he had into Namesys
 236 +for 5.5 dark years of no money before Reiser3 finally started to work well
 237 +enough to bring in money.  He owns the copyright.
 238 +
 239 +DARPA was the primary sponsor of Reiser4.  DARPA does not endorse
 240 +Reiser4, it merely sponsors it.  DARPA is, in solely Hans's personal
 241 +opinion, unique in its willingness to invest into things more
 242 +theoretical than the VC community can readily understand, and more
 243 +longterm than allows them to be sure that they will be the ones to
 244 +extract the economic benefits from.  DARPA also integrated us into a
 245 +security community that transformed our security worldview.
 246 +
 247 +Vladimir Saveliev is our lead programmer, with us from the beginning,
 248 +and he worked long hours writing the cleanest code.  This is why he is
 249 +now the lead programmer after years of commitment to our work.  He
 250 +always made the effort to be the best he could be, and to make his
 251 +code the best that it could be.  What resulted was quite remarkable. I
 252 +don't think that money can ever motivate someone to work the way he
 253 +did, he is one of the most selfless men I know.
 254 +
 255 +Alexander Lyamin was our sysadmin, and helped to educate us in
 256 +security issues.  Moscow State University and IMT were very generous
 257 +in the internet access they provided us, and in lots of other little
 258 +ways that a generous institution can be.
 259 +
 260 +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
 261 +locking code, the block allocator, and finished the flushing code.
 262 +His code is always crystal clean and well structured.
 263 +
 264 +Nikita Danilov wrote the core of the balancing code, the core of the
 265 +plugins code, and the directory code.  He worked a steady pace of long
 266 +hours that produced a whole lot of well abstracted code.  He is our
 267 +senior computer scientist.
 268 +
 269 +Vladimir Demidov wrote the parser.  Writing an in kernel parser is
 270 +something very few persons have the skills for, and it is thanks to
 271 +him that we can say that the parser is really not so big compared to
 272 +various bits of our other code, and making a parser work in the kernel
 273 +was not so complicated as everyone would imagine mainly because it was
 274 +him doing it...
 275 +
 276 +Joshua McDonald wrote the transaction manager, and the flush code.
  277 +The flush code unexpectedly turned out to be extremely hairy for reasons
 278 +you can read about on our web page, and he did a great job on an
 279 +extremely difficult task.
 280 +
 281 +Nina Reiser handled our accounting, government relations, and much
 282 +more.
 283 +
 284 +Ramon Reiser developed our website.
 285 +
 286 +Beverly Palmer drew our graphics.
 287 +
 288 +Vitaly Fertman developed librepair, userspace plugins repair code, fsck
 289 +and worked with Umka on developing libreiser4 and userspace plugins.
 290 +
 291 +Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
 292 +userspace tools (reiser4progs).
 293 +
 294 +Oleg Drokin (aka Green) is the release manager who fixes everything.
 295 +It is so nice to have someone like that on the team.  He (plus Chris
 296 +and Jeff) make it possible for the entire rest of the Namesys team to
 297 +focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also.  It
 298 +is just amazing to watch his talent for spotting bugs in action.
 299 +
 300 diff -rupN linux-2.6.8-rc3/fs/reiser4/as_ops.c linux-2.6.8-rc3-a/fs/reiser4/as_ops.c
 301 --- linux-2.6.8-rc3/fs/reiser4/as_ops.c 1970-01-01 03:00:00.000000000 +0300
 302 +++ linux-2.6.8-rc3-a/fs/reiser4/as_ops.c       2004-08-05 21:20:53.424588519 +0400
 303 @@ -0,0 +1,688 @@
 304 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
 305 +
 306 +/* Interface to VFS. Reiser4 address_space_operations are defined here. */
 307 +
 308 +#include "forward.h"
 309 +#include "debug.h"
 310 +#include "dformat.h"
 311 +#include "coord.h"
 312 +#include "plugin/item/item.h"
 313 +#include "plugin/file/file.h"
 314 +#include "plugin/security/perm.h"
 315 +#include "plugin/disk_format/disk_format.h"
 316 +#include "plugin/plugin.h"
 317 +#include "plugin/plugin_set.h"
 318 +#include "plugin/object.h"
 319 +#include "txnmgr.h"
 320 +#include "jnode.h"
 321 +#include "znode.h"
 322 +#include "block_alloc.h"
 323 +#include "tree.h"
 324 +#include "log.h"
 325 +#include "vfs_ops.h"
 326 +#include "inode.h"
 327 +#include "page_cache.h"
 328 +#include "ktxnmgrd.h"
 329 +#include "super.h"
 330 +#include "reiser4.h"
 331 +#include "kattr.h"
 332 +#include "entd.h"
 333 +#include "emergency_flush.h"
 334 +
 335 +#include <linux/profile.h>
 336 +#include <linux/types.h>
 337 +#include <linux/mount.h>
 338 +#include <linux/vfs.h>
 339 +#include <linux/mm.h>
 340 +#include <linux/buffer_head.h>
 341 +#include <linux/dcache.h>
 342 +#include <linux/list.h>
 343 +#include <linux/pagemap.h>
 344 +#include <linux/slab.h>
 345 +#include <linux/seq_file.h>
 346 +#include <linux/init.h>
 347 +#include <linux/module.h>
 348 +#include <linux/writeback.h>
 349 +#include <linux/backing-dev.h>
 350 +#include <linux/quotaops.h>
 351 +#include <linux/security.h>
 352 +
 353 +/* address space operations */
 354 +
 355 +static int reiser4_readpage(struct file *, struct page *);
 356 +
 357 +static int reiser4_prepare_write(struct file *,
 358 +                                struct page *, unsigned, unsigned);
 359 +
 360 +static int reiser4_commit_write(struct file *,
 361 +                               struct page *, unsigned, unsigned);
 362 +
 363 +static int reiser4_set_page_dirty (struct page *);
 364 +static sector_t reiser4_bmap(struct address_space *, sector_t);
 365 +/* static int reiser4_direct_IO(int, struct inode *,
 366 +                            struct kiobuf *, unsigned long, int); */
 367 +
 368 +/* address space operations */
 369 +
  370 +/* Clear the dirty bit of @page and, if it was set, the page's PAGECACHE_TAG_DIRTY radix tree tag. This is used in
  371 +   uncapture_page. It resembles test_clear_page_dirty, except that page's mapping must exist and REISER4_MOVED tag is cleared too */
  372 +reiser4_internal void
  373 +reiser4_clear_page_dirty(struct page *page)
  374 +{
  375 +       struct address_space *mapping;
  376 +       unsigned long flags;
  377 +
  378 +       mapping = page->mapping;
  379 +       BUG_ON(mapping == NULL);
  380 +
  381 +       read_lock_irqsave(&mapping->tree_lock, flags); /* NOTE(review): tags are modified below under a read lock - confirm */
  382 +       if (TestClearPageDirty(page)) {
  383 +               /* page was dirty: clear its dirty tag in the address space radix tree */
  384 +               radix_tree_tag_clear(&mapping->page_tree, page->index,
  385 +                                    PAGECACHE_TAG_DIRTY);
  386 +               /* FIXME: remove this when reiser4_set_page_dirty will skip setting this tag for captured pages */
  387 +               radix_tree_tag_clear(&mapping->page_tree, page->index,
  388 +                                    PAGECACHE_TAG_REISER4_MOVED);
  389 +
  390 +               read_unlock_irqrestore(&mapping->tree_lock, flags);
  391 +               if (!mapping->backing_dev_info->memory_backed)
  392 +                       dec_page_state(nr_dirty);
  393 +               return;
  394 +       }
  395 +       read_unlock_irqrestore(&mapping->tree_lock, flags);
  396 +}
 397 +
  398 +/* as_ops->set_page_dirty() VFS method in reiser4_address_space_operations.
  399 +
  400 +   It is used by code other than reiser4 to set reiser4 pages dirty. Reiser4
  401 +   itself uses set_page_dirty_internal().
  402 +
  403 +   The difference is that reiser4_set_page_dirty sets MOVED tag on the page and clears DIRTY tag. Pages tagged as MOVED
  404 +   get processed by reiser4_writepages() to do reiser4 specific work over dirty pages (jnode allocation, capturing, atom
  405 +   creation) which cannot be done in the contexts where reiser4_set_page_dirty is called.
  406 +   set_page_dirty_internal sets DIRTY tag and clears MOVED tag. This function always returns 0.
  407 +*/
  408 +static int reiser4_set_page_dirty(struct page *page /* page to mark dirty */)
  409 +{
  410 +       if (!TestSetPageDirty(page)) { /* nothing to do if the page was dirty already */
  411 +               struct address_space *mapping = page->mapping;
  412 +
  413 +               if (mapping) {
  414 +                       read_lock_irq(&mapping->tree_lock);
  415 +                       /* check for race with truncate */
  416 +                       if (page->mapping) {
  417 +                               assert("vs-1652", page->mapping == mapping);
  418 +                               if (!mapping->backing_dev_info->memory_backed)
  419 +                                       inc_page_state(nr_dirty);
  420 +                               radix_tree_tag_clear(&mapping->page_tree,
  421 +                                                  page->index, PAGECACHE_TAG_DIRTY);
  422 +                               /* FIXME: it would be nice to not set this tag on pages which are captured already */
  423 +                               radix_tree_tag_set(&mapping->page_tree,
  424 +                                                  page->index, PAGECACHE_TAG_REISER4_MOVED);
  425 +                       }
  426 +                       read_unlock_irq(&mapping->tree_lock);
  427 +                       __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
  428 +               }
  429 +       }
  430 +       return 0;
  431 +}
 432 +
  433 +/* ->readpage() VFS method in reiser4 address_space_operations, the method serving file mmapping.
  434 +   Calls ->readpage() of the inode's file plugin; if that fails (or is missing) the page gets PageError set and is unlocked.
  435 +*/
  436 +static int
  437 +reiser4_readpage(struct file *f /* file to read from */ ,
  438 +                struct page *page      /* page where to read data
  439 +                                        * into */ )
  440 +{
  441 +       struct inode *inode;
  442 +       file_plugin *fplug;
  443 +       int result;
  444 +       reiser4_context ctx;
  445 +
  446 +       /*
  447 +        * basically calls ->readpage method of the file plugin and handles
  448 +        * errors.
  449 +        */
  450 +
  451 +       assert("umka-078", f != NULL);
  452 +       assert("umka-079", page != NULL);
  453 +       assert("nikita-2280", PageLocked(page));
  454 +       assert("vs-976", !PageUptodate(page));
  455 +
  456 +       assert("vs-318", page->mapping && page->mapping->host);
  457 +       assert("nikita-1352", (f == NULL) || (f->f_dentry->d_inode == page->mapping->host));
  458 +
  459 +       /* ->readpage can be called from page fault service routine */
  460 +       assert("nikita-3174", schedulable());
  461 +
  462 +       inode = page->mapping->host;
  463 +       init_context(&ctx, inode->i_sb);
  464 +       fplug = inode_file_plugin(inode);
  465 +       if (fplug->readpage != NULL)
  466 +               result = fplug->readpage(f, page);
  467 +       else
  468 +               result = RETERR(-EINVAL);
  469 +       if (result != 0) {
  470 +               SetPageError(page);
  471 +               unlock_page(page);
  472 +       }
  473 +
  474 +       reiser4_exit_context(&ctx);
  475 +       return 0; /* @result is not propagated: failure is reported through PageError only */
  476 +}
 477 +
  478 +/* ->readpages() VFS method in reiser4 address_space_operations
  479 +   method serving page cache readahead
  480 +
  481 +   reiser4_readpages works in the following way: on input it has coord which is set on extent that addresses first of
  482 +   pages for which read requests are to be issued. So, reiser4_readpages just walks forward through extent unit, finds
  483 +   which blocks are to be read and starts reads for them.
  484 +
  485 +reiser4_readpages can be called from two places: from
  486 +sys_read->reiser4_read->read_unix_file->read_extent->page_cache_readahead and
  487 +from
  488 +handling page fault:
  489 +handle_mm_fault->do_no_page->filemap_nopage->page_cache_readaround
  490 +
  491 +In the first case coord is set by reiser4 read code. This case is detected by the check if
  492 +(is_in_reiser4_context()).
  493 +
  494 +In the second case, coord is not set and, currently, reiser4_readpages issues
  495 +no reads. In both cases pages left on the list are released and 0 is returned.
  496 +*/
  497 +static int
  498 +reiser4_readpages(struct file *file, struct address_space *mapping,
  499 +                 struct list_head *pages, unsigned nr_pages) /* @nr_pages is not used here */
  500 +{
  501 +       file_plugin *fplug;
  502 +
  503 +       if (is_in_reiser4_context()) {
  504 +               /* we are called from reiser4 context, typically from method
  505 +                  which implements read into page cache. From read_extent,
  506 +                  for example */
  507 +               fplug = inode_file_plugin(mapping->host);
  508 +               if (fplug->readpages)
  509 +                       fplug->readpages(file, mapping, pages);
  510 +       } else {
  511 +               /* we are called from page fault. Currently, we do not
  512 +                * readahead in this case. */;
  513 +       }
  514 +
  515 +       /* __do_page_cache_readahead expects filesystem's readpages method to
  516 +        * process every page on this list, so release whatever is still on it */
  517 +       while (!list_empty(pages)) {
  518 +               struct page *page = list_entry(pages->prev, struct page, lru);
  519 +               list_del(&page->lru);
  520 +               page_cache_release(page);
  521 +       }
  522 +       return 0;
  523 +}
 524 +
  525 +/* prepares @page to be written. This means that if we want to modify only some
  526 +   part of the page, the page should be read first and then modified. Actually this function
  527 +   is almost the same as reiser4_readpage(). The difference is that it calls ->prepare_write() of the
  528 +   file plugin, returns its result, and does not unlock the page in the case of error. The latter is
  529 +   needed because loop back device driver expects it locked. */
  530 +static int reiser4_prepare_write(struct file *file, struct page *page,
  531 +                                unsigned from, unsigned to)
  532 +{
  533 +       int result;
  534 +       file_plugin * fplug;
  535 +       struct inode * inode;
  536 +       reiser4_context ctx;
  537 +
  538 +       inode = page->mapping->host;
  539 +       init_context(&ctx, inode->i_sb);
  540 +       fplug = inode_file_plugin(inode);
  541 +
  542 +       if (fplug->prepare_write != NULL)
  543 +               result = fplug->prepare_write(file, page, from, to);
  544 +       else
  545 +               result = RETERR(-EINVAL);
  546 +
  547 +       /* don't commit transaction under inode semaphore */
  548 +       context_set_commit_async(&ctx);
  549 +       reiser4_exit_context(&ctx);
  550 +
  551 +       return result;
  552 +}
 553 +
  554 +/* marks @page up-to-date and captures jnode of @page to current atom via ->capturepage() of the file plugin. */
  555 +static int reiser4_commit_write(struct file *file, struct page *page,
  556 +                               unsigned from, unsigned to)
  557 +{
  558 +       int result;
  559 +       file_plugin *fplug;
  560 +       struct inode *inode;
  561 +       reiser4_context ctx;
  562 +
  563 +       assert("umka-3101", file != NULL);
  564 +       assert("umka-3102", page != NULL);
  565 +       assert("umka-3093", PageLocked(page));
  566 +
  567 +       SetPageUptodate(page);
  568 +
  569 +       inode = page->mapping->host;
  570 +       init_context(&ctx, inode->i_sb);
  571 +       fplug = inode_file_plugin(inode);
  572 +
  573 +       if (fplug->capturepage)
  574 +               result = fplug->capturepage(page);
  575 +       else
  576 +               result = RETERR(-EINVAL);
  577 +
  578 +       /* here the page is returned locked. */
  579 +       assert("umka-3103", PageLocked(page));
  580 +
  581 +       /* don't commit transaction under inode semaphore */
  582 +       context_set_commit_async(&ctx);
  583 +       reiser4_exit_context(&ctx);
  584 +       return result;
  585 +}
 586 +
 587 +/* ->writepages()
 588 +   ->vm_writeback()
 589 +   ->set_page_dirty()
 590 +   ->prepare_write()
 591 +   ->commit_write()
 592 +*/
 593 +
  594 +/* helper for ->bmap(): stores in *@blocknr the value generic_block_bmap() returns for @lblock with the file plugin's ->get_block(); result is 0, or RETERR(-EINVAL) without ->get_block() */
  595 +reiser4_internal int
  596 +reiser4_lblock_to_blocknr(struct address_space *mapping,
  597 +                         sector_t lblock, reiser4_block_nr *blocknr)
  598 +{
  599 +       file_plugin *fplug;
  600 +       int result;
  601 +       reiser4_context ctx;
  602 +
  603 +       init_context(&ctx, mapping->host->i_sb);
  604 +       reiser4_stat_inc(vfs_calls.bmap);
  605 +
  606 +       fplug = inode_file_plugin(mapping->host);
  607 +       if (fplug && fplug->get_block) {
  608 +               *blocknr = generic_block_bmap(mapping, lblock, fplug->get_block);
  609 +               result = 0;
  610 +       } else
  611 +               result = RETERR(-EINVAL); /* *@blocknr is left untouched in this case */
  612 +       reiser4_exit_context(&ctx);
  613 +       return result;
  614 +}
 615 +
  616 +/* ->bmap() VFS method in reiser4 address_space_operations. Returns 0 for a fake block number when reiser4_block_nr and sector_t differ in size */
  617 +static sector_t
  618 +reiser4_bmap(struct address_space *mapping, sector_t lblock)
  619 +{
  620 +       reiser4_block_nr blocknr;
  621 +       int result;
  622 +
  623 +       result = reiser4_lblock_to_blocknr(mapping, lblock, &blocknr);
  624 +       if (result == 0)
  625 +               if (sizeof blocknr == sizeof(sector_t) ||
  626 +                   !blocknr_is_fake(&blocknr))
  627 +                       return blocknr;
  628 +               else
  629 +                       return 0;
  630 +       else
  631 +               return result; /* NOTE(review): int error code is returned as sector_t - confirm callers expect this rather than 0 */
  632 +}
 633 +
 634 +/* ->invalidatepage method for reiser4 */
 635 +
 636 +/*
 637 + * this is called for each truncated page from
 638 + * truncate_inode_pages()->truncate_{complete,partial}_page().
 639 + *
 640 + * At the moment of call, page is under lock, and outstanding io (if any) has
 641 + * completed.
 642 + */
 643 +
  644 +reiser4_internal int
  645 +reiser4_invalidatepage(struct page *page /* page to invalidate */,
  646 +                      unsigned long offset /* starting offset for partial
  647 +                                            * invalidation */)
  648 +{
  649 +       int ret = 0;
  650 +       reiser4_context ctx;
  651 +       struct inode *inode;
  652 +
  653 +       /*
  654 +        * This is called to truncate file's page.
  655 +        *
  656 +        * Originally, reiser4 implemented truncate in a standard way
  657 +        * (vmtruncate() calls ->invalidatepage() on all truncated pages
  658 +        * first, then file system ->truncate() call-back is invoked).
  659 +        *
  660 +        * This led to the problem when ->invalidatepage() was called on a
  661 +        * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
  662 +        * process. That is, truncate was bypassing transactions. To avoid
  663 +        * this, try_capture_page_to_invalidate() call was added here.
  664 +        *
  665 +        * After many troubles with vmtruncate() based truncate (including
  666 +        * races with flush, tail conversion, etc.) it was re-written in the
  667 +        * top-to-bottom style: items are killed in cut_tree_object() and
  668 +        * pages belonging to extent are invalidated in kill_hook_extent(). So
  669 +        * probably now additional call to capture is not needed here.
  670 +        *
  671 +        */
  672 +
  673 +       assert("nikita-3137", PageLocked(page));
  674 +       assert("nikita-3138", !PageWriteback(page));
  675 +       inode = page->mapping->host;
  676 +
  677 +       /*
  678 +        * ->invalidatepage() should only be called for the unformatted
  679 +        * jnodes. Destruction of all other types of jnodes is performed
  680 +        * separately. But, during some corner cases (like handling errors
  681 +        * during mount) it is simpler to let ->invalidatepage to be called on
  682 +        * them. Check for this, and do nothing.
  683 +        */
  684 +       if (get_super_fake(inode->i_sb) == inode)
  685 +               return 0;
  686 +       if (get_cc_fake(inode->i_sb) == inode)
  687 +               return 0;
  688 +       if (get_super_private(inode->i_sb)->bitmap == inode)
  689 +               return 0;
  690 +
  691 +       assert("vs-1426", PagePrivate(page));
  692 +       assert("vs-1427", page->mapping == jnode_get_mapping(jnode_by_page(page)));
  693 +
  694 +       init_context(&ctx, inode->i_sb);
  695 +       /* capture page being truncated. A failure is only logged; processing continues. */
  696 +       ret = try_capture_page_to_invalidate(page);
  697 +       if (ret != 0) {
  698 +               warning("nikita-3141", "Cannot capture: %i", ret);
  699 +               print_page("page", page);
  700 +       }
  701 +
  702 +       if (offset == 0) {
  703 +               jnode *node;
  704 +
  705 +               /* remove jnode from transaction and detach it from page. */
  706 +               node = jnode_by_page(page);
  707 +               if (node != NULL) {
  708 +                       assert("vs-1435", !JF_ISSET(node, JNODE_CC));
  709 +                       jref(node);
  710 +                       JF_SET(node, JNODE_HEARD_BANSHEE);
  711 +                       /* page cannot be detached from jnode concurrently,
  712 +                        * because it is locked */
  713 +                       uncapture_page(page);
  714 +
  715 +                       /* this detaches page from jnode, so that jdelete will not try to lock page which is already locked */
  716 +                       UNDER_SPIN_VOID(jnode,
  717 +                                       node,
  718 +                                       page_clear_jnode(page, node));
  719 +                       unhash_unformatted_jnode(node);
  720 +
  721 +                       jput(node);
  722 +               }
  723 +       }
  724 +       reiser4_exit_context(&ctx);
  725 +       return ret; /* 0, or the error returned by try_capture_page_to_invalidate() */
  726 +}
 727 +
  728 +#define INC_STAT(page, node, counter)                                          \
  729 +       reiser4_stat_inc_at(page->mapping->host->i_sb,                          \
  730 +                           level[jnode_get_level(node)].counter); /* note: the expansion already ends with ';' */
  731 +
  732 +#define INC_NSTAT(node, counter) INC_STAT(jnode_page(node), node, counter) /* INC_STAT using the page from jnode_page(node) */
 733 +
 734 +int is_cced(const jnode *node);
 735 +
 736 +/* helper for reiser4_releasepage(): returns 1 if @node can be detached from
 737 + * its page and the page released, 0 otherwise. Asserts jnode spin lock held. */
 738 +static int
 739 +releasable(const jnode *node /* node to check */)
 740 +{
 741 +       assert("nikita-2781", node != NULL);
 742 +       assert("nikita-2783", spin_jnode_is_locked(node));
 743 +
 744 +       /* if some thread is currently using jnode page, the latter cannot be
 745 +        * detached */
 746 +       if (atomic_read(&node->d_count) != 0) {
 747 +               INC_NSTAT(node, vm.release.loaded);
 748 +               return 0;
 749 +       }
 750 +
 751 +       assert("vs-1214", !jnode_is_loaded(node));
 752 +
 753 +       /* this jnode is just a copy. Its page cannot be released, because
 754 +        * otherwise next jload() would load obsolete data from disk
 755 +        * (up-to-date version may still be in memory). */
 756 +       if (is_cced(node)) {
 757 +               INC_NSTAT(node, vm.release.copy);
 758 +               return 0;
 759 +       }
 760 +
 761 +       /* emergency flushed page can be released. This is what emergency
 762 +        * flush is all about after all. */
 763 +       if (JF_ISSET(node, JNODE_EFLUSH)) {
 764 +               INC_NSTAT(node, vm.release.eflushed);
 765 +               return 1; /* yeah! */
 766 +       }
 767 +
 768 +       /* can only release page if real block number is assigned to
 769 +          it. Simple check for ->atom wouldn't do, because it is possible for
 770 +          node to be clean, not in atom yet, and still having fake block
 771 +          number. For example, node just created in jinit_new(). */
 772 +       if (blocknr_is_fake(jnode_get_block(node))) {
 773 +               INC_NSTAT(node, vm.release.fake);
 774 +               return 0;
 775 +       }
 776 +       /* dirty jnode cannot be released. It can however be submitted to disk
 777 +        * as part of early flushing, but only after getting flush-prepped. */
 778 +       if (jnode_is_dirty(node)) {
 779 +               INC_NSTAT(node, vm.release.dirty);
 780 +               return 0;
 781 +       }
 782 +       /* overwrite set is only written by log writer. */
 783 +       if (JF_ISSET(node, JNODE_OVRWR)) {
 784 +               INC_NSTAT(node, vm.release.ovrwr);
 785 +               return 0;
 786 +       }
 787 +       /* jnode is already under writeback */
 788 +       if (JF_ISSET(node, JNODE_WRITEBACK)) {
 789 +               INC_NSTAT(node, vm.release.writeback);
 790 +               return 0;
 791 +       }
 792 +       /* page was modified through mmap, but its jnode is not yet
 793 +        * captured. Don't discard modified data. */
 794 +       if (jnode_is_unformatted(node) && JF_ISSET(node, JNODE_KEEPME)) {
 795 +               INC_NSTAT(node, vm.release.keepme);
 796 +               return 0;
 797 +       }
 798 +       /* only znodes and unformatted jnodes pass: don't release bitmaps or
 799 +        * journal records */
 800 +       if (!jnode_is_znode(node) && !jnode_is_unformatted(node)) {
 801 +               INC_NSTAT(node, vm.release.bitmap);
 802 +               return 0;
 803 +       }
 804 +       return 1; }
 805 +
 806 +#if REISER4_DEBUG /* NOTE(review): releasable() asserts jnode spin lock, "jload" is named below - confirm */
 807 +int jnode_is_releasable(jnode *node)
 808 +{
 809 +       return UNDER_SPIN(jload, node, releasable(node));
 810 +}
 811 +#endif /* REISER4_DEBUG */
 812 +
 813 +/*
 814 + * ->releasepage method for reiser4
 815 + *
 816 + * This is called by VM scanner when it comes across clean page.  What we have
 817 + * to do here is to check whether page can really be released (freed that is)
 818 + * and if so, detach jnode from it and remove page from the page cache.
 819 + * Check for releasability is done by releasable() function. Returns 1 if
 820 + * jnode was detached from the page, 0 if the page cannot be released.
 821 + */
 822 +reiser4_internal int
 823 +reiser4_releasepage(struct page *page, int gfp UNUSED_ARG)
 824 +{
 825 +       jnode *node;
 826 +       void *oid;
 827 +
 828 +       assert("nikita-2257", PagePrivate(page));
 829 +       assert("nikita-2259", PageLocked(page));
 830 +       assert("nikita-2892", !PageWriteback(page));
 831 +       assert("nikita-3019", schedulable());
 832 +
 833 +       /* NOTE-NIKITA: this can be called in the context of reiser4 call. It
 834 +          is not clear what to do in this case. A lot of deadlocks seem to be
 835 +          possible. */
 836 +
 837 +       node = jnode_by_page(page);
 838 +       assert("nikita-2258", node != NULL);
 839 +       assert("reiser4-4", page->mapping != NULL);
 840 +       assert("reiser4-5", page->mapping->host != NULL);
 841 +
 842 +       INC_STAT(page, node, vm.release.try);
 843 +
 844 +       oid = (void *)(unsigned long)get_inode_oid(page->mapping->host);
 845 +       /* NOTE(review): oid is assigned but not read again in this function */
 846 +       /* is_page_cache_freeable() check
 847 +          (mapping + private + page_cache_get() by shrink_cache()) */
 848 +       if (page_count(page) > 3)
 849 +               return 0;
 850 +
 851 +       if (PageDirty(page))
 852 +               return 0;
 853 +
 854 +       /* releasable() needs jnode lock, because it looks at the jnode fields
 855 +        * and we need jload_lock here to avoid races with jload(). */
 856 +       LOCK_JNODE(node);
 857 +       LOCK_JLOAD(node);
 858 +       if (releasable(node)) {
 859 +               struct address_space *mapping;
 860 +
 861 +               mapping = page->mapping;
 862 +               INC_STAT(page, node, vm.release.ok);
 863 +               jref(node);
 864 +               if (jnode_is_znode(node))
 865 +                       ON_STATS(znode_at_read(JZNODE(node)));
 866 +               /* there is no need to synchronize against
 867 +                * jnode_extent_write() here, because pages seen by
 868 +                * jnode_extent_write() are !releasable(). */
 869 +               page_clear_jnode(page, node);
 870 +               UNLOCK_JLOAD(node);
 871 +               UNLOCK_JNODE(node);
 872 +
 873 +               /* we are under memory pressure so release jnode also. */
 874 +               jput(node);
 875 +
 876 +               write_lock_irq(&mapping->tree_lock);
 877 +               /* shrink_list() + radix-tree */
 878 +               if (page_count(page) == 2) {
 879 +                       __remove_from_page_cache(page);
 880 +                       __put_page(page);
 881 +               }
 882 +               write_unlock_irq(&mapping->tree_lock);
 883 +
 884 +               return 1;
 885 +       } else {
 886 +               UNLOCK_JLOAD(node);
 887 +               UNLOCK_JNODE(node);
 888 +               assert("nikita-3020", schedulable());
 889 +               return 0;
 890 +       }
 891 +}
 892 +
 893 +#undef INC_NSTAT
 894 +#undef INC_STAT
 895 +
 896 +reiser4_internal void
 897 +move_inode_out_from_sync_inodes_loop(struct address_space * mapping)
 898 +{
 899 +       /* work around infinite loop in pdflush->sync_sb_inodes: refresh
 900 +        * ->dirtied_when of the host inode and move it to the ->s_dirty list
 901 +        * of its super block (list manipulation is done under inode_lock).
 902 +        * Problem: ->writepages() is supposed to submit io for the pages from
 903 +        * ->io_pages list and to clean this list. */
 904 +       mapping->host->dirtied_when = jiffies;
 905 +       spin_lock(&inode_lock);
 906 +       list_move(&mapping->host->i_list, &mapping->host->i_sb->s_dirty);
 907 +       spin_unlock(&inode_lock); }
 908 +
 909 +/* ->writepages() address space operation for reiser4. It captures
 910 +   anonymous pages (pages dirtied via mmapping) and anonymous jnodes (ones
 911 +   created by reiser4_writepage) through the ->capture() method of the
 912 +   file plugin, when the plugin has one. Returns the result of ->capture(),
 913 +   or 0 if there was nothing to call.
 914 + */
 915 +reiser4_internal int
 916 +reiser4_writepages(struct address_space *mapping,
 917 +                  struct writeback_control *wbc)
 918 +{
 919 +       struct inode *host = mapping->host;
 920 +       file_plugin *plug = inode_file_plugin(host);
 921 +       long nr_captured = 0;
 922 +       int result = 0;
 923 +
 924 +       /* let the file plugin capture anonymous pages and anonymous
 925 +        * jnodes; plugins without ->capture() are skipped */
 926 +       if (plug != NULL && plug->capture != NULL)
 927 +               result = plug->capture(host, wbc, &nr_captured);
 928 +
 929 +       /* done unconditionally, whether or not anything was captured */
 930 +       move_inode_out_from_sync_inodes_loop(mapping);
 931 +
 932 +       return result;
 933 +}
 934 +
 935 +/* ->sync_page method: start actual IO on @page via block_sync_page(); always returns 0 */
 936 +reiser4_internal int reiser4_start_up_io(struct page *page)
 937 +{
 938 +       block_sync_page(page);
 939 +       return 0;
 940 +}
 941 +
 942 +/*
 943 + * reiser4 methods for VM (address space operations table)
 944 + */
 945 +struct address_space_operations reiser4_as_operations = {
 946 +       /* called during memory pressure by kswapd */
 947 +       .writepage = reiser4_writepage,
 948 +       /* called to read page from the storage when page is added into page
 949 +          cache. This is done by page-fault handler. */
 950 +       .readpage = reiser4_readpage,
 951 +       /* Start IO on page. This is called from wait_on_page_bit() and
 952 +          lock_page() and its purpose is to actually start io by jabbing
 953 +          device drivers. */
 954 +       .sync_page = reiser4_start_up_io,
 955 +       /* called from
 956 +        * reiser4_sync_inodes()->generic_sync_sb_inodes()->...->do_writepages()
 957 +        *
 958 +        * captures anonymous pages for given inode
 959 +        */
 960 +       .writepages = reiser4_writepages,
 961 +       /* marks page dirty. Note that this is never called by reiser4
 962 +        * directly. Reiser4 uses set_page_dirty_internal(). Reiser4 set page
 963 +        * dirty is called for pages dirtied through mmap and moves dirty page
 964 +        * to the special ->moved_list in its mapping. */
 965 +       .set_page_dirty = reiser4_set_page_dirty,
 966 +       /* called during read-ahead */
 967 +       .readpages = reiser4_readpages,
 968 +       .prepare_write = reiser4_prepare_write, /* loop back device driver and generic_file_write() call-back */
 969 +       .commit_write = reiser4_commit_write,  /* loop back device driver and generic_file_write() call-back */
 970 +       /* map logical block number to disk block number. Used by FIBMAP ioctl
 971 +        * and ..bmap pseudo file. */
 972 +       .bmap = reiser4_bmap,
 973 +       /* called just before page is taken out from address space (on
 974 +          truncate, umount, or similar).  */
 975 +       .invalidatepage = reiser4_invalidatepage,
 976 +       /* called when VM is about to take page from address space (due to
 977 +          memory pressure). */
 978 +       .releasepage = reiser4_releasepage,
 979 +       /* not yet implemented */
 980 +       .direct_IO = NULL
 981 +};
 982 +
 983 +/* Make Linus happy.
 984 +   Local variables:
 985 +   c-indentation-style: "K&R"
 986 +   mode-name: "LC"
 987 +   c-basic-offset: 8
 988 +   tab-width: 8
 989 +   fill-column: 120
 990 +   End:
 991 +*/
 992 diff -rupN linux-2.6.8-rc3/fs/reiser4/block_alloc.c linux-2.6.8-rc3-a/fs/reiser4/block_alloc.c
 993 --- linux-2.6.8-rc3/fs/reiser4/block_alloc.c    1970-01-01 03:00:00.000000000 +0300
 994 +++ linux-2.6.8-rc3-a/fs/reiser4/block_alloc.c  2004-08-05 21:20:53.206634490 +0400
 995 @@ -0,0 +1,1196 @@
 996 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
 997 +
 998 +#include "debug.h"
 999 +#include "dformat.h"
1000 +#include "plugin/plugin.h"
1001 +#include "txnmgr.h"
1002 +#include "znode.h"
1003 +#include "block_alloc.h"
1004 +#include "tree.h"
1005 +#include "super.h"
1006 +#include "lib.h"
1007 +
1008 +#include <linux/types.h>       /* for __u??  */
1009 +#include <linux/fs.h>          /* for struct super_block  */
1010 +#include <linux/spinlock.h>
1011 +
1012 +/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
1013 +
1014 +/* We need to be able to reserve enough disk space to ensure that an atomic
1015 +   operation will have enough disk space to flush (see flush.c and
1016 +   http://namesys.com/v4/v4.html) and commit it once it is started.
1017 +
1018 +   In our design a call for reserving disk space may fail but not an actual
1019 +   block allocation.
1020 +
1021 +   All free blocks, already allocated blocks, and all kinds of reserved blocks
1022 +   are counted in different per-fs block counters.
1023 +
1024 +   A reiser4 super block's set of block counters currently is:
1025 +
1026 +   free -- free blocks,
1027 +   used -- already allocated blocks,
1028 +
1029 +   grabbed -- initially reserved for performing an fs operation, those blocks
1030 +          are taken from free blocks, then grabbed disk space leaks from grabbed
1031 +          blocks counter to other counters like "fake allocated", "flush
1032 +          reserved", "used", the rest of not used grabbed space is returned to
1033 +          free space at the end of fs operation;
1034 +
1035 +   fake allocated -- counts all nodes without real disk block numbers assigned,
1036 +                     we have separate accounting for formatted and unformatted
1037 +                     nodes (for easier debugging);
1038 +
1039 +   flush reserved -- disk space needed for flushing and committing an atom.
1040 +                     Each dirty already allocated block could be written as a
1041 +                     part of atom's overwrite set or as a part of atom's
1042 +                     relocate set.  In both cases one additional block is needed,
1043 +                     it is used as a wandered block if we do overwrite or as a
1044 +                    new location for a relocated block.
1045 +
1046 +   In addition, blocks in some states are counted on per-thread and per-atom
1047 +   basis.  A reiser4 context has a counter of blocks grabbed by this transaction
1048 +   and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
1049 +   of each reiser4 context.  Each reiser4 atom has a counter of "flush reserved"
1050 +   blocks, which are reserved for flush processing and atom commit. */
1051 +
1052 +/* AN EXAMPLE: suppose we insert new item to the reiser4 tree.  We estimate
1053 +   number of blocks to grab for most expensive case of balancing when the leaf
1054 +   node we insert new item to gets split and new leaf node is allocated.
1055 +
1056 +   So, we need to grab blocks for
1057 +
1058 +   1) one block for possible dirtying the node we insert an item to. That block
1059 +      would be used for node relocation at flush time or for allocating of a
1060 +      wandered one, it depends what will be a result (what set, relocate or
1061 +      overwrite the node gets assigned to) of the node processing by the flush
1062 +      algorithm.
1063 +
1064 +   2) one block for either allocating a new node, or dirtying of right or left
1065 +      clean neighbor, only one case may happen.
1066 +
1067 +   VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current
1068 +   node, and creation of new node.  have I forgotten something?  email me.
1069 +
1070 +   These grabbed blocks are counted in both reiser4 context "grabbed blocks"
1071 +   counter and in the fs-wide one (both ctx->grabbed_blocks and
1072 +   sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
1073 +   decremented by 2.
1074 +
1075 +   Suppose both two blocks were spent for dirtying of an already allocated clean
1076 +   node (one block went from "grabbed" to "flush reserved") and for new block
1077 +   allocating (one block went from "grabbed" to "fake allocated formatted").
1078 +
1079 +   Inserting of a child pointer to the parent node caused parent node to be
1080 +   split, the balancing code takes care about this grabbing necessary space
1081 +   immediately by calling reiser4_grab with BA_RESERVED flag set which means
1082 +   "can use the 5% reserved disk space".
1083 +
1084 +   At this moment insertion completes and grabbed blocks (if they were not used)
1085 +   should be returned to the free space counter.
1086 +
1087 +   However the atom life-cycle is not completed.  The atom had one "flush
1088 +   reserved" block added by our insertion and the new fake allocated node is
1089 +   counted as a "fake allocated formatted" one.  The atom has to be fully
1090 +   processed by flush before commit.  Suppose that the flush moved the first,
1091 +   already allocated node to the atom's overwrite list, the new fake allocated
1092 +   node, obviously, went into the atom relocate set.  The reiser4 flush
1093 +   allocates the new node using one unit from "fake allocated formatted"
1094 +   counter, the log writer uses one from "flush reserved" for wandered block
1095 +   allocation.
1096 +
1097 +   And, it is not the end.  When the wandered block is deallocated after the
1098 +   atom gets fully played (see wander.c for term description), the disk space
1099 +   occupied for it is returned to free blocks. */
1100 +
1101 +/* BLOCK NUMBERS */
1102 +
1103 +/* Any reiser4 node has a block number assigned to it.  We use these numbers for
1104 +   indexing in hash tables, so if a block has not yet been assigned a location
1105 +   on disk we need to give it a temporary fake block number.
1106 +
1107 +   Current implementation of reiser4 uses 64-bit integers for block numbers. We
1108 +   use highest bit in 64-bit block number to distinguish fake and real block
1109 +   numbers. So, only 63 bits may be used for addressing real device
1110 +   blocks. The "fake" block number space is divided into subspaces of fake
1111 +   block numbers for data blocks and for shadow (working) bitmap blocks.
1112 +
1113 +   Fake block numbers for data blocks are generated by a cyclic counter, which
1114 +   gets incremented after each real block allocation. We assume that it is
1115 +   impossible for this counter to wrap around during the life of one transaction. */
1116 +
1117 +/* Bring @hint into its initial state: every byte of it is set to zero. */
1118 +reiser4_internal void
1119 +blocknr_hint_init(reiser4_blocknr_hint * hint)
1120 +{
1121 +       xmemset(hint, 0, sizeof *hint);
1122 +}
1123 +
1124 +/* Release any resources of a blocknr hint. Currently a no-op; @hint is not touched. */
1125 +reiser4_internal void
1126 +blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
1127 +{
1128 +       /* No resources should be freed in current blocknr_hint implementation.*/
1129 +}
1130 +
1131 +/* returns 1 if *@da is a fake block number (see above), 0 otherwise.  */
1132 +/* Audited by: green(2002.06.11) */
1133 +reiser4_internal int
1134 +blocknr_is_fake(const reiser4_block_nr * da)
1135 +{
1136 +       /* Compare against zero instead of returning the masked value: the
1137 +          return type is a (possibly 32bit) int while reiser4_block_nr is at
1138 +          least 64 bits long, so the high bit (the only bit that can survive
1139 +          the masking) would be stripped off by the conversion */
1140 +       return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) != 0;
1141 +}
1142 +
1143 +/* Static functions for <reiser4 super block>/<reiser4 context> block counters
1144 +   arithmetic. Mostly, they are isolated so that the same assertions are not
1145 +   repeated in several places. This one takes @count off ctx->grabbed_blocks. */
1146 +static void
1147 +sub_from_ctx_grabbed(reiser4_context *ctx, __u64 count)
1148 +{
1149 +       if (ctx->grabbed_blocks < count)
1150 +               print_clog();
1151 +       BUG_ON(ctx->grabbed_blocks < count);
1152 +       assert("zam-527", ctx->grabbed_blocks >= count);
1153 +       ctx->grabbed_blocks -= count;
1154 +}
1155 +
1156 +/* Decrease the counter of grabbed blocks in super block by @count. */
1157 +static void
1158 +sub_from_sb_grabbed(reiser4_super_info_data *sbinfo, __u64 count)
1159 +{
1160 +       assert("zam-525", sbinfo->blocks_grabbed >= count);
1161 +       sbinfo->blocks_grabbed -= count;
1162 +}
1163 +
1164 +/* Decrease the counter of blocks reserved for flush in super block by @count. */
1165 +static void
1166 +sub_from_sb_flush_reserved (reiser4_super_info_data *sbinfo, __u64 count)
1167 +{
1168 +       assert ("vpf-291", sbinfo->blocks_flush_reserved >= count);
1169 +       sbinfo->blocks_flush_reserved -= count;
1170 +}
1171 +/* Decrease one of the two "fake allocated" counters in super block by @count. */
1172 +static void
1173 +sub_from_sb_fake_allocated(reiser4_super_info_data *sbinfo, __u64 count, reiser4_ba_flags_t flags)
1174 +{
1175 +       /* BA_FORMATTED in @flags selects the counter of formatted nodes */
1176 +       if (!(flags & BA_FORMATTED)) {
1177 +               assert("zam-528", sbinfo->blocks_fake_allocated_unformatted >= count);
1178 +               sbinfo->blocks_fake_allocated_unformatted -= count;
1179 +               return;
1180 +       }
1181 +       assert("zam-806", sbinfo->blocks_fake_allocated >= count);
1182 +       sbinfo->blocks_fake_allocated -= count; }
1183 +/* Decrease "used" counter by @count; asserts it stays >= min_blocks_used. */
1184 +static void
1185 +sub_from_sb_used(reiser4_super_info_data *sbinfo, __u64 count)
1186 +{
1187 +       assert("zam-530", sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
1188 +       sbinfo->blocks_used -= count;
1189 +}
1190 +/* Decrease the counter of "clustered" blocks in super block by @count. */
1191 +static void
1192 +sub_from_cluster_reserved(reiser4_super_info_data *sbinfo, __u64 count)
1193 +{
1194 +       assert("edward-501", sbinfo->blocks_clustered >= count);
1195 +       sbinfo->blocks_clustered -= count;
1196 +}
1197 +
1198 +/* Increase the counter of blocks reserved for flush in atom; asserts that @atom is spin-locked. */
1199 +static void
1200 +add_to_atom_flush_reserved_nolock (txn_atom * atom, __u32 count)
1201 +{
1202 +       assert ("zam-772", atom != NULL);
1203 +       assert ("zam-773", spin_atom_is_locked (atom));
1204 +       atom->flush_reserved += count;
1205 +}
1206 +
1207 +/* Decrease the counter of blocks reserved for flush in atom; asserts that @atom is spin-locked. */
1208 +static void
1209 +sub_from_atom_flush_reserved_nolock (txn_atom * atom, __u32 count)
1210 +{
1211 +       assert ("zam-774", atom != NULL);
1212 +       assert ("zam-775", spin_atom_is_locked (atom));
1213 +       assert ("nikita-2790", atom->flush_reserved >= count);
1214 +       atom->flush_reserved -= count;
1215 +}
1216 +
1217 +/* super block has 7 counters: free, used, grabbed, fake allocated
1218 +   (formatted and unformatted), flush reserved and clustered. Their sum must
1219 +   be number of blocks on a device. Returns 1 if so, 0 (after printk) if not */
1220 +reiser4_internal int
1221 +check_block_counters(const struct super_block *super)
1222 +{
1223 +       __u64 sum;
1224 +
1225 +       sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
1226 +               reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
1227 +               reiser4_fake_allocated_unformatted(super) + flush_reserved(super) +
1228 +               reiser4_clustered_blocks(super);
1229 +       if (reiser4_block_count(super) != sum) {
1230 +               printk("super block counters: "
1231 +                      "used %llu, free %llu, "
1232 +                      "grabbed %llu, fake allocated (formatted %llu, unformatted %llu), "
1233 +                      "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
1234 +                      reiser4_data_blocks(super),
1235 +                      reiser4_free_blocks(super),
1236 +                      reiser4_grabbed_blocks(super),
1237 +                      reiser4_fake_allocated(super),
1238 +                      reiser4_fake_allocated_unformatted(super),
1239 +                      flush_reserved(super),
1240 +                      reiser4_clustered_blocks(super),
1241 +                      sum, reiser4_block_count(super));
1242 +               return 0;
1243 +       }
1244 +       return 1;
1245 +}
1246 +
1247 +#if REISER4_DEBUG_OUTPUT /* printk block counters of @super (current sb if NULL), context and atom */
1248 +reiser4_internal void
1249 +print_block_counters(const char *prefix,
1250 +                    const struct super_block *super, txn_atom *atom)
1251 +{
1252 +       if (super == NULL)
1253 +               super = reiser4_get_current_sb();
1254 +       printk("%s:\tsuper: G: %llu, F: %llu, D: %llu, U: %llu + %llu, R: %llu, C: %llu, T: %llu\n",
1255 +              prefix,
1256 +              reiser4_grabbed_blocks(super),
1257 +              reiser4_free_blocks(super),
1258 +              reiser4_data_blocks(super),
1259 +              reiser4_fake_allocated(super),
1260 +              reiser4_fake_allocated_unformatted(super),
1261 +              flush_reserved(super),
1262 +              reiser4_clustered_blocks(super),
1263 +              reiser4_block_count(super));
1264 +       printk("\tcontext: G: %llu",
1265 +              get_current_context()->grabbed_blocks);
1266 +       if (atom == NULL)
1267 +               atom = get_current_atom_locked_nocheck();
1268 +       if (atom != NULL) {
1269 +               printk("\tatom: R: %llu", atom->flush_reserved);
1270 +               UNLOCK_ATOM(atom); /* NOTE(review): a caller-supplied @atom is unlocked too - confirm */
1271 +       }
1272 +       printk("\n");
1273 +}
1274 +#endif /* REISER4_DEBUG_OUTPUT */
1275 +
1276 +/* Adjust "working" free blocks counter for number of blocks we are going to
1277 +   allocate.  Record number of grabbed blocks in fs-wide and per-thread
1278 +   counters.  This function should be called before bitmap scanning or
1279 +   allocating fake block numbers
1280 +
1281 +   @ctx             -- reiser4 context of the current thread (asserted);
1282 +   @count           -- number of blocks we reserve;
1283 +   @flags           -- with BA_RESERVED the reserved area may be used too;
1284 +
1285 +   @return          -- 0 if success,  -ENOSPC, if all
1286 +                       free blocks are preserved or already allocated.
1287 +*/
1288 +static int
1289 +reiser4_grab(reiser4_context *ctx, __u64 count, reiser4_ba_flags_t flags)
1290 +{
1291 +       __u64 free_blocks;
1292 +       int ret = 0, use_reserved = flags & BA_RESERVED;
1293 +       reiser4_super_info_data *sbinfo;
1294 +
1295 +       assert("vs-1276", ctx == get_current_context());
1296 +
1297 +       sbinfo = get_super_private(ctx->super);
1298 +
1299 +       reiser4_spin_lock_sb(sbinfo);
1300 +
1301 +       free_blocks = sbinfo->blocks_free;
1302 +
1303 +       ON_TRACE(TRACE_ALLOC, "reiser4_grab: free_blocks %llu\n", free_blocks);
1304 +
1305 +       if ((use_reserved && free_blocks < count) ||
1306 +           (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
1307 +               ret = RETERR(-ENOSPC);
1308 +
1309 +               ON_TRACE(TRACE_ALLOC, "reiser4_grab: ENOSPC: count %llu\n", count);
1310 +
1311 +               goto unlock_and_ret;
1312 +       }
1313 +
1314 +       ctx->grabbed_blocks += count;
1315 +
1316 +       sbinfo->blocks_grabbed += count;
1317 +       sbinfo->blocks_free -= count;
1318 +
1319 +#if REISER4_DEBUG
1320 +       ctx->grabbed_initially = count;
1321 +       fill_backtrace(&ctx->grabbed_at, REISER4_BACKTRACE_DEPTH, 0);
1322 +#endif
1323 +
1324 +       assert("nikita-2986", check_block_counters(ctx->super));
1325 +
1326 +       ON_TRACE(TRACE_ALLOC, "%s: grabbed %llu, free blocks left %llu\n",
1327 +                __FUNCTION__, count, reiser4_free_blocks (ctx->super));
1328 +
1329 +       /* disable grab space in current context */
1330 +       ctx->grab_enabled = 0;
1331 +
1332 +unlock_and_ret:
1333 +       reiser4_spin_unlock_sb(sbinfo);
1334 +
1335 +       return ret;
1336 +}
1337 +
1338 +/* Grab @count blocks for the current context. Returns 0 without grabbing if
1339 +   grabbing is disabled in the context and BA_FORCE is not set. On -ENOSPC
1340 +   with BA_CAN_COMMIT set, forces a commit and retries the grab once. */
1341 +reiser4_internal int
1342 +reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
1343 +{
1344 +       int ret;
1345 +       reiser4_context *ctx;
1346 +
1347 +       assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
1348 +                                  lock_stack_isclean(get_current_lock_stack())));
1349 +       ON_TRACE(TRACE_RESERVE, "grab_space: %llu block(s).", count);
1350 +
1351 +       ctx = get_current_context();
1352 +       if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
1353 +               ON_TRACE(TRACE_RESERVE, "grab disabled and not forced!\n");
1354 +               return 0;
1355 +       }
1356 +
1357 +       ret = reiser4_grab(ctx, count, flags);
1358 +       if (ret == -ENOSPC) {
1359 +               /* Try to commit all transactions if BA_CAN_COMMIT flag is present */
1360 +               if (flags & BA_CAN_COMMIT) {
1361 +                       ON_TRACE(TRACE_RESERVE, "force commit!..");
1362 +                       txnmgr_force_commit_all(ctx->super, 0);
1363 +
1364 +                       ctx->grab_enabled = 1;
1365 +                       ret = reiser4_grab(ctx, count, flags);
1366 +               }
1367 +       }
1368 +       ON_TRACE(TRACE_RESERVE, "%s(%d)\n", (ret == 0) ? "ok" : "failed", ret);
1369 +       /*
1370 +        * allocation from reserved pool cannot fail. This is severe error.
1371 +        */
1372 +       assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
1373 +       return ret;
1374 +}
1375 +
1376 +/*
1377 + * SPACE RESERVED FOR UNLINK/TRUNCATE
1378 + *
1379 + * Unlink and truncate require space in transaction (to update stat data, at
1380 + * least). But we don't want rm(1) to fail with "No space on device" error.
1381 + *
1382 + * Solution is to reserve 5% of disk space for truncates and
1383 + * unlinks. Specifically, normal space grabbing requests don't grab space from
1384 + * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
1385 + * drain it. Per super block delete_sema semaphore is used to allow only one
1386 + * thread at a time to grab from reserved area.
1387 + *
1388 + * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
1389 + * flag.
1390 + *
1391 + */
1392 +
1393 +reiser4_internal int reiser4_grab_reserved(struct super_block *super,
1394 +                                          __u64 count, reiser4_ba_flags_t flags)
1395 +{
1396 +       reiser4_super_info_data *sbinfo = get_super_private(super);
1397 +
1398 +       assert("nikita-3175", flags & BA_CAN_COMMIT);
1399 +       /* Check whether the delete semaphore is already taken by us; we
1400 +        * assume that reading of machine word is atomic. */
1401 +       if (sbinfo->delete_sema_owner == current) {
1402 +               if (reiser4_grab_space(count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
1403 +                       warning("zam-1003", "nested call of grab_reserved fails count=(%llu)",
1404 +                               (unsigned long long)count);
1405 +                       reiser4_release_reserved(super);
1406 +                       return RETERR(-ENOSPC);
1407 +               }
1408 +               return 0;
1409 +       }
1410 +
1411 +       /* if the reserved area had to be used, delete_sema stays held on success */
1412 +       if (reiser4_grab_space(count, flags)) {
1413 +               down(&sbinfo->delete_sema);
1414 +               assert("nikita-2929", sbinfo->delete_sema_owner == NULL);
1415 +               sbinfo->delete_sema_owner = current;
1416 +
1417 +               if (reiser4_grab_space(count, flags | BA_RESERVED)) {
1418 +                       warning("zam-833",
1419 +                               "reserved space is not enough (%llu)", (unsigned long long)count);
1420 +                       reiser4_release_reserved(super);
1421 +                       return RETERR(-ENOSPC);
1422 +               }
1423 +       }
1424 +       return 0;
1425 +}
1426 +
1427 +reiser4_internal void
1428 +reiser4_release_reserved(struct super_block *super)
1429 +{
1430 +       reiser4_super_info_data *sbinfo = get_super_private(super);
1431 +
1432 +       /* nothing to do unless the current task owns delete_sema */
1433 +       if (sbinfo->delete_sema_owner != current)
1434 +               return;
1435 +       sbinfo->delete_sema_owner = NULL;
1436 +       up(&sbinfo->delete_sema);
1437 +}
1438 +
1439 +/* common part of grabbed2fake_allocated_*(): takes one block off "grabbed" counters */
1440 +static reiser4_super_info_data *
1441 +grabbed2fake_allocated_head(void)
1442 +{
1443 +       reiser4_context *ctx;
1444 +       reiser4_super_info_data *sbinfo;
1445 +
1446 +       ctx = get_current_context();
1447 +       sub_from_ctx_grabbed(ctx, 1);
1448 +
1449 +       sbinfo = get_super_private(ctx->super);
1450 +       reiser4_spin_lock_sb(sbinfo);
1451 +       sub_from_sb_grabbed(sbinfo, 1);
1452 +       /* return with sbinfo spin lock held; it is not released here */
1453 +       return sbinfo;
1454 +}
1455 +
1456 +/* is called after a fake block number is allocated for a formatted node:
1457 +   moves one block from "grabbed" to "fake allocated" (formatted) counter. */
1458 +static void
1459 +grabbed2fake_allocated_formatted(void)
1460 +{
1461 +       reiser4_super_info_data *sbinfo;
1462 +
1463 +       sbinfo = grabbed2fake_allocated_head();
1464 +       sbinfo->blocks_fake_allocated ++;
1465 +
1466 +       assert("vs-922", check_block_counters(reiser4_get_current_sb()));
1467 +
1468 +       reiser4_spin_unlock_sb(sbinfo);
1469 +}
1470 +
1471 +/* moves one block from "grabbed" to "fake allocated unformatted" counter */
1472 +static void
1473 +grabbed2fake_allocated_unformatted(void)
1474 +{
1475 +       reiser4_super_info_data *sbinfo;
1476 +
1477 +       sbinfo = grabbed2fake_allocated_head();
1478 +       sbinfo->blocks_fake_allocated_unformatted ++;
1479 +       assert("vs-9221", check_block_counters(reiser4_get_current_sb()));
1480 +
1481 +       reiser4_spin_unlock_sb(sbinfo);
1482 +}
1483 +
1484 +/* moves @count blocks from "grabbed" (context and super block) to "clustered" */
1485 +reiser4_internal void
1486 +grabbed2cluster_reserved(int count)
1487 +{
1488 +       reiser4_context *ctx;
1489 +       reiser4_super_info_data *sbinfo;
1490 +
1491 +       ctx = get_current_context();
1492 +       sub_from_ctx_grabbed(ctx, count);
1493 +
1494 +       sbinfo = get_super_private(ctx->super);
1495 +       reiser4_spin_lock_sb(sbinfo);
1496 +
1497 +       sub_from_sb_grabbed(sbinfo, count);
1498 +       sbinfo->blocks_clustered += count;
1499 +
1500 +       assert("edward-504", check_block_counters(ctx->super));
1501 +       reiser4_spin_unlock_sb(sbinfo);
1502 +}
1503 +
1504 +reiser4_internal void
1505 +cluster_reserved2grabbed(int count)
1506 +{
1507 +       reiser4_context *ctx = get_current_context();
1508 +       reiser4_super_info_data *info = get_super_private(ctx->super);
1509 +
1510 +       /* per-fs counters: cluster reserved -> grabbed, with the sb locked */
1511 +       reiser4_spin_lock_sb(info);
1512 +
1513 +       sub_from_cluster_reserved(info, count);
1514 +       info->blocks_grabbed += count;
1515 +
1516 +       assert("edward-505", check_block_counters(ctx->super));
1517 +
1518 +       reiser4_spin_unlock_sb(info);
1519 +
1520 +       /* the context's own tally is bumped after the sb lock is dropped */
1521 +       ctx->grabbed_blocks += count;
1522 +}
1523 +
1524 +reiser4_internal void
1525 +cluster_reserved2free(int count)
1526 +{
1527 +       reiser4_context *ctx;
1528 +       reiser4_super_info_data *sbinfo;
1529 +       /* the current context is asserted to hold no grabbed blocks here */
1530 +       assert("edward-503", get_current_context()->grabbed_blocks == 0);
1531 +
1532 +       ctx = get_current_context();
1533 +       sbinfo = get_super_private(ctx->super);
1534 +       reiser4_spin_lock_sb(sbinfo);
1535 +       /* sb counters: @count blocks go from cluster reserved back to free */
1536 +       sub_from_cluster_reserved(sbinfo, count);
1537 +       sbinfo->blocks_free += count;
1538 +
1539 +       assert("edward-502", check_block_counters(ctx->super));
1540 +
1541 +       reiser4_spin_unlock_sb(sbinfo);
1542 +}
1543 +
1544 +static spinlock_t fake_lock = SPIN_LOCK_UNLOCKED; /* taken around every fake_gen update */
1545 +static reiser4_block_nr fake_gen = 0; /* next raw value to hand out as a fake block number */
1546 +
1547 +/* obtain a fake block number which will be used to refer to a newly allocated
1548 +   node (formatted or unformatted) until real allocation is done */
1549 +static inline void assign_fake_blocknr(reiser4_block_nr *blocknr)
1550 +{
1551 +       spin_lock(&fake_lock);
1552 +       *blocknr = fake_gen++;
1553 +       spin_unlock(&fake_lock);
1554 +       /* replace the status bits with the "unallocated" status value */
1555 +       *blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;
1556 +       *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
1557 +       assert("zam-394", zlook(current_tree, blocknr) == NULL);
1558 +}
1559 +
1560 +reiser4_internal int
1561 +assign_fake_blocknr_formatted(reiser4_block_nr *blocknr)
1562 +{
1563 +       ON_TRACE(TRACE_RESERVE, "assign_fake_blocknr_formatted: moving 1 grabbed block to fake allocated formatted\n");
1564 +       /* store a fresh fake number in *@blocknr, then account one grabbed block as fake allocated */
1565 +       assign_fake_blocknr(blocknr);
1566 +       grabbed2fake_allocated_formatted();
1567 +       /* there is no failure path: the result is always 0 */
1568 +       return 0;
1569 +}
1570 +
1571 +/* hand out a fake block number to be used for an unformatted node */
1572 +reiser4_internal reiser4_block_nr
1573 +fake_blocknr_unformatted(void)
1574 +{
1575 +       reiser4_block_nr fake;
1576 +
1577 +       ON_TRACE(TRACE_RESERVE, "fake_blocknr_unformatted: moving 1 grabbed block to fake allocated unformatted\n");
1578 +
1579 +       assign_fake_blocknr(&fake);
1580 +       grabbed2fake_allocated_unformatted();
1581 +       /*XXXXX*/inc_unalloc_unfm_ptr();
1582 +
1583 +       return fake;
1584 +}
1585 +
1586 +
1587 +/* Take @count blocks out of the grabbed counters of @ctx and of the sb and add
1588 +   them to sb blocks_used: real (on-disk) allocation immediately follows grabbing. */
1589 +static void
1590 +grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo, __u64 count)
1591 +{
1592 +       sub_from_ctx_grabbed(ctx, count);
1593 +       /* the per-fs counters are changed between sb lock and unlock */
1594 +       reiser4_spin_lock_sb(sbinfo);
1595 +
1596 +       sub_from_sb_grabbed(sbinfo, count);
1597 +       sbinfo->blocks_used += count;
1598 +
1599 +       assert("nikita-2679", check_block_counters(ctx->super));
1600 +
1601 +       reiser4_spin_unlock_sb(sbinfo);
1602 +}
1603 +
1604 +/* adjust sb block counters when @count fake allocated blocks get mapped to disk */
1605 +static void
1606 +fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count, reiser4_ba_flags_t flags)
1607 +{
1608 +       reiser4_spin_lock_sb(sbinfo);
1609 +       /* @flags is handed to sub_from_sb_fake_allocated() unmodified */
1610 +       sub_from_sb_fake_allocated(sbinfo, count, flags);
1611 +       sbinfo->blocks_used += count;
1612 +
1613 +       assert("nikita-2680", check_block_counters(reiser4_get_current_sb()));
1614 +
1615 +       reiser4_spin_unlock_sb(sbinfo);
1616 +}
1617 +
1618 +static void
1619 +flush_reserved2used(txn_atom * atom, __u64 count)
1620 +{
1621 +       reiser4_super_info_data *sbinfo;
1622 +       /* move @count blocks from flush reserved (atom and sb counters) to sb used */
1623 +       assert("zam-787", atom != NULL);
1624 +       assert("zam-788", spin_atom_is_locked(atom));
1625 +       /* NOTE(review): @count is narrowed to __u32 for the per-atom counter */
1626 +       sub_from_atom_flush_reserved_nolock(atom, (__u32)count);
1627 +
1628 +       sbinfo = get_current_super_private();
1629 +       reiser4_spin_lock_sb(sbinfo);
1630 +
1631 +       sub_from_sb_flush_reserved(sbinfo, count);
1632 +       sbinfo->blocks_used += count;
1633 +
1634 +       assert ("zam-789", check_block_counters(reiser4_get_current_sb()));
1635 +
1636 +       reiser4_spin_unlock_sb(sbinfo);
1637 +}
1638 +
1639 +/* remember *@block as the per-fs default blocknr hint, if it is below block_count */
1640 +reiser4_internal void
1641 +update_blocknr_hint_default (const struct super_block *s, const reiser4_block_nr * block)
1642 +{
1643 +       reiser4_super_info_data *info = get_super_private(s);
1644 +
1645 +       assert("nikita-3342", !blocknr_is_fake(block));
1646 +
1647 +       reiser4_spin_lock_sb(info);
1648 +       if (*block >= info->block_count) {
1649 +               /* out of range: complain and leave the stored hint untouched */
1650 +               warning("zam-676",
1651 +                       "block number %llu is too large to be used in a blocknr hint\n", (unsigned long long) *block);
1652 +               dump_stack();
1653 +               DEBUGON(1);
1654 +       } else
1655 +               info->blocknr_hint_default = *block;
1656 +       reiser4_spin_unlock_sb(info);
1657 +}
1658 +
1659 +/* store the current value of the default blocknr hint of the current fs in *@result. */
1660 +reiser4_internal void get_blocknr_hint_default(reiser4_block_nr * result)
1661 +{
1662 +       reiser4_super_info_data * sbinfo = get_current_super_private();
1663 +       /* the hint is read (and range-checked) with the sb lock held */
1664 +       reiser4_spin_lock_sb(sbinfo);
1665 +       *result = sbinfo->blocknr_hint_default;
1666 +       assert("zam-677", *result < sbinfo->block_count);
1667 +       reiser4_spin_unlock_sb(sbinfo);
1668 +}
1669 +
1670 +/* Allocate "real" disk blocks by calling a proper space allocation plugin
1671 + * method. Blocks are allocated in one contiguous disk region. The plugin
1672 + * independent part accounts blocks by subtracting the allocated amount from the
1673 + * grabbed, fake allocated or flush reserved counter (selected by
1674 + * @hint->block_stage) and adding the same amount to the counter of used blocks.
1675 + *
1676 + * @hint -- a reiser4 blocknr hint object which contains further block
1677 + *          allocation hints and parameters (search start, a stage of block
1678 + *          which will be mapped to disk, etc.),
1679 + * @blk  -- an out parameter for the beginning of the allocated region,
1680 + * @len  -- in/out parameter, it should contain the maximum number of allocated
1681 + *          blocks, after block allocation completes, it contains the length of
1682 + *          allocated disk region.
1683 + * @flags -- see reiser4_ba_flags_t description.
1684 + *
1685 + * @return -- 0 if success, error code otherwise.
1686 + */
1687 +reiser4_internal int
1688 +reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
1689 +                    reiser4_block_nr * len, reiser4_ba_flags_t flags)
1690 +{
1691 +       __u64 needed = *len;
1692 +       reiser4_context *ctx;
1693 +       reiser4_super_info_data *sbinfo;
1694 +       int ret;
1695 +
1696 +       assert ("zam-986", hint != NULL);
1697 +
1698 +       ctx = get_current_context();
1699 +       sbinfo = get_super_private(ctx->super);
1700 +       /* 64-bit values are cast explicitly so that they always match %llu */
1701 +       ON_TRACE(TRACE_RESERVE, "reiser4_alloc_blocks: needed %llu..", (unsigned long long) needed);
1702 +
1703 +       assert("vpf-339", hint != NULL);
1704 +
1705 +       ON_TRACE(TRACE_ALLOC,
1706 +                "alloc_blocks: requested %llu, search from %llu\n",
1707 +                (unsigned long long) *len, (unsigned long long) hint->blk);
1708 +
1709 +       /* For write-optimized data we use default search start value, which is
1710 +        * close to last write location. */
1711 +       if (flags & BA_USE_DEFAULT_SEARCH_START) {
1712 +               reiser4_stat_inc(block_alloc.nohint);
1713 +               get_blocknr_hint_default(&hint->blk);
1714 +       }
1715 +
1716 +       /* VITALY: allocator should grab this for internal/tx-lists/similar only. */
1717 +/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
1718 +       if (hint->block_stage == BLOCK_NOT_COUNTED) {
1719 +               ret = reiser4_grab_space_force(*len, flags);
1720 +               if (ret != 0)
1721 +                       return ret;
1722 +       }
1723 +
1724 +       ret = sa_alloc_blocks(get_space_allocator(ctx->super), hint, (int) needed, blk, len);
1725 +
1726 +       if (!ret) {
1727 +               assert("zam-680", *blk < reiser4_block_count(ctx->super));
1728 +               assert("zam-681", *blk + *len <= reiser4_block_count(ctx->super));
1729 +
1730 +               if (flags & BA_PERMANENT) {
1731 +                       /* we assume that current atom exists at this moment */
1732 +                       txn_atom * atom = get_current_atom_locked ();
1733 +                       atom -> nr_blocks_allocated += *len;
1734 +                       UNLOCK_ATOM (atom);
1735 +               }
1736 +               /* move *len blocks from the counter named by the stage to "used" */
1737 +               switch (hint->block_stage) {
1738 +               case BLOCK_NOT_COUNTED:
1739 +               case BLOCK_GRABBED:
1740 +                       ON_TRACE(TRACE_RESERVE, "ok. %llu blocks grabbed to used.\n", (unsigned long long) *len);
1741 +                       grabbed2used(ctx, sbinfo, *len);
1742 +                       break;
1743 +               case BLOCK_UNALLOCATED:
1744 +                       ON_TRACE(TRACE_RESERVE, "ok. %llu blocks fake allocated to used.\n", (unsigned long long) *len);
1745 +                       fake_allocated2used(sbinfo, *len, flags);
1746 +                       break;
1747 +               case BLOCK_FLUSH_RESERVED:
1748 +                       ON_TRACE(TRACE_RESERVE, "ok. %llu flush reserved to used (get wandered?)\n", (unsigned long long) *len);
1749 +                       {
1750 +                               txn_atom * atom = get_current_atom_locked ();
1751 +                               flush_reserved2used(atom, *len);
1752 +                               UNLOCK_ATOM (atom);
1753 +                       }
1754 +                       break;
1755 +               default:
1756 +                       impossible("zam-531", "wrong block stage");
1757 +               }
1758 +       } else {
1759 +               assert ("zam-821", ergo(hint->max_dist == 0 && !hint->backward, ret != -ENOSPC));
1760 +               if (hint->block_stage == BLOCK_NOT_COUNTED)
1761 +                       grabbed2free(ctx, sbinfo, needed); /* give back the space grabbed above */
1762 +       }
1763 +
1764 +       return ret;
1765 +}
1766 +
1767 +/* used -> fake_allocated -> grabbed -> free */
1768 +
1769 +/* adjust sb block counters when @count blocks get unmapped from disk: they go
1770 +   from "used" back to "fake allocated"; non-zero @formatted picks the formatted counter */
1771 +static void
1772 +used2fake_allocated(reiser4_super_info_data *sbinfo, __u64 count, int formatted)
1773 +{
1774 +       reiser4_spin_lock_sb(sbinfo);
1775 +
1776 +       if (formatted)
1777 +               sbinfo->blocks_fake_allocated += count;
1778 +       else
1779 +               sbinfo->blocks_fake_allocated_unformatted += count;
1780 +
1781 +       sub_from_sb_used(sbinfo, count);
1782 +
1783 +       assert("nikita-2681", check_block_counters(reiser4_get_current_sb()));
1784 +
1785 +       reiser4_spin_unlock_sb(sbinfo);
1786 +}
1787 +
1788 +static void
1789 +used2flush_reserved(reiser4_super_info_data *sbinfo, txn_atom * atom, __u64 count,
1790 +                   reiser4_ba_flags_t flags UNUSED_ARG)
1791 +{
1792 +       assert("nikita-2791", atom != NULL);
1793 +       assert("nikita-2792", spin_atom_is_locked(atom));
1794 +       /* used -> flush reserved: first the per-atom counter (@count narrowed to __u32) ... */
1795 +       add_to_atom_flush_reserved_nolock(atom, (__u32)count);
1796 +
1797 +       reiser4_spin_lock_sb(sbinfo);
1798 +       /* ... then the per-fs counters, with the sb locked; @flags is not used */
1799 +       sbinfo->blocks_flush_reserved += count;
1800 +       /*add_to_sb_flush_reserved(sbinfo, count);*/
1801 +       sub_from_sb_used(sbinfo, count);
1802 +       /* NOTE(review): label "nikita-2681" is also used by used2fake_allocated() */
1803 +       assert("nikita-2681", check_block_counters(reiser4_get_current_sb()));
1804 +
1805 +       reiser4_spin_unlock_sb(sbinfo);
1806 +}
1807 +
1808 +/* disk space virtually used by @count fake block numbers is counted as "grabbed" again. */
1809 +static void
1810 +fake_allocated2grabbed(reiser4_context *ctx, reiser4_super_info_data *sbinfo, __u64 count, reiser4_ba_flags_t flags)
1811 +{
1812 +       ctx->grabbed_blocks += count;
1813 +
1814 +       reiser4_spin_lock_sb(sbinfo);
1815 +       /* the counters are checked both before and after the update */
1816 +       assert("nikita-2682", check_block_counters(ctx->super));
1817 +
1818 +       sbinfo->blocks_grabbed += count;
1819 +       sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED); /* only the BA_FORMATTED bit is passed on */
1820 +
1821 +       assert("nikita-2683", check_block_counters(ctx->super));
1822 +
1823 +       reiser4_spin_unlock_sb(sbinfo);
1824 +}
1825 +
1826 +reiser4_internal void
1827 +fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
1828 +{
1829 +       reiser4_context *ctx;
1830 +       reiser4_super_info_data *sbinfo;
1831 +
1832 +       ctx = get_current_context();
1833 +       sbinfo = get_super_private(ctx->super);
1834 +       /* __u64 is cast explicitly so that the argument always matches %llu */
1835 +       ON_TRACE(TRACE_RESERVE, "fake_allocated2free %llu blocks\n", (unsigned long long) count);
1836 +       /* two steps: fake allocated -> grabbed, then grabbed -> free */
1837 +       fake_allocated2grabbed(ctx, sbinfo, count, flags);
1838 +       grabbed2free(ctx, sbinfo, count);
1839 +}
1840 +
1841 +reiser4_internal void
1842 +grabbed2free_mark(__u64 mark)
1843 +{
1844 +       reiser4_context *ctx;
1845 +       reiser4_super_info_data *sbinfo;
1846 +       /* free whatever the current context has grabbed beyond @mark blocks */
1847 +       ctx = get_current_context();
1848 +       sbinfo = get_super_private(ctx->super);
1849 +       /* @mark must not be negative as __s64 nor exceed the grabbed amount */
1850 +       assert("nikita-3007", (__s64)mark >= 0);
1851 +       assert("nikita-3006",
1852 +              ctx->grabbed_blocks >= mark);
1853 +       grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
1854 +}
1855 +
1856 +/* Return @count grabbed (reserved but not used) blocks of @ctx to the free counter of @sbinfo. */
1857 +reiser4_internal void
1858 +grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1859 +              __u64 count)
1860 +{
1861 +       ON_TRACE(TRACE_RESERVE, "grabbed2free: %llu\n", (unsigned long long) count);
1862 +
1863 +       sub_from_ctx_grabbed(ctx, count);
1864 +
1865 +       /* per-fs counters: grabbed -> free, with the sb locked */
1866 +       reiser4_spin_lock_sb(sbinfo);
1867 +
1868 +       sub_from_sb_grabbed(sbinfo, count);
1869 +       sbinfo->blocks_free += count;
1870 +       assert("nikita-2684", check_block_counters(ctx->super));
1871 +
1872 +       reiser4_spin_unlock_sb(sbinfo);
1873 +}
1874 +
1875 +reiser4_internal void
1876 +grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
1877 +{
1878 +       reiser4_context *ctx;
1879 +       reiser4_super_info_data *sbinfo;
1880 +       /* the atom is not locked here; grabbed2flush_reserved() locks it around this call */
1881 +       assert("vs-1095", atom);
1882 +
1883 +       ctx = get_current_context();
1884 +       sbinfo = get_super_private(ctx->super);
1885 +       /* @count blocks: context grabbed -> atom flush reserved ... */
1886 +       sub_from_ctx_grabbed(ctx, count);
1887 +
1888 +       add_to_atom_flush_reserved_nolock(atom, count);
1889 +
1890 +       reiser4_spin_lock_sb(sbinfo);
1891 +       /* ... and sb grabbed -> sb flush reserved */
1892 +       sbinfo->blocks_flush_reserved += count;
1893 +       sub_from_sb_grabbed(sbinfo, count);
1894 +
1895 +       assert ("vpf-292", check_block_counters(ctx->super));
1896 +       /* values printed with %llu are cast explicitly so that they always match */
1897 +       ON_TRACE(TRACE_RESERVE, "__grabbed2flush_reserved_nolock %llu blocks: atom %u has %llu flush reserved blocks\n",
1898 +                (unsigned long long) count, atom->atom_id, (unsigned long long) atom->flush_reserved);
1899 +
1900 +       reiser4_spin_unlock_sb(sbinfo);
1901 +}
1902 +
1903 +reiser4_internal void
1904 +grabbed2flush_reserved(__u64 count)
1905 +{
1906 +       txn_atom *locked;
1907 +
1908 +       /* locked wrapper: hold the current atom's lock around the _nolock worker */
1909 +       locked = get_current_atom_locked();
1910 +       ON_TRACE(TRACE_RESERVE, "__grabbed2flush_reserved\n");
1911 +       grabbed2flush_reserved_nolock(locked, count);
1912 +       UNLOCK_ATOM(locked);
1913 +}
1914 +
1915 +reiser4_internal void flush_reserved2grabbed(txn_atom * atom, __u64 count)
1916 +{
1917 +       reiser4_context *ctx;
1918 +       reiser4_super_info_data *sbinfo;
1919 +       /* @atom must be non-NULL and already locked by the caller */
1920 +       assert("nikita-2788", atom != NULL);
1921 +       assert("nikita-2789", spin_atom_is_locked(atom));
1922 +
1923 +       ctx = get_current_context();
1924 +       sbinfo = get_super_private(ctx->super);
1925 +       /* @count blocks become grabbed by the current context again */
1926 +       ctx->grabbed_blocks += count;
1927 +       /* NOTE(review): @count is narrowed to __u32 for the per-atom counter */
1928 +       sub_from_atom_flush_reserved_nolock(atom, (__u32)count);
1929 +
1930 +       reiser4_spin_lock_sb(sbinfo);
1931 +       /* per-fs counters: flush reserved -> grabbed */
1932 +       sbinfo->blocks_grabbed += count;
1933 +       sub_from_sb_flush_reserved(sbinfo, count);
1934 +
1935 +       assert ("vpf-292", check_block_counters (ctx->super));
1936 +
1937 +       reiser4_spin_unlock_sb (sbinfo);
1938 +}
1939 +
1940 +/* give back every block the current context still holds as grabbed. */
1941 +reiser4_internal void
1942 +all_grabbed2free(void)
1943 +{
1944 +       reiser4_context *cur = get_current_context();
1945 +
1946 +       grabbed2free(cur, get_super_private(cur->super), cur->grabbed_blocks);
1947 +}
1948 +
1949 +/* adjust counters when freed real (on-disk) blocks do not return to the free
1950 +   pool: @count blocks become "grabbed" by @ctx (and in the sb) instead of "used". */
1951 +static void
1952 +used2grabbed(reiser4_context *ctx, reiser4_super_info_data *sbinfo, __u64 count)
1953 +{
1954 +       ctx->grabbed_blocks += count;
1955 +
1956 +       reiser4_spin_lock_sb(sbinfo);
1957 +
1958 +       sbinfo->blocks_grabbed += count;
1959 +       sub_from_sb_used(sbinfo, count);
1960 +
1961 +       assert("nikita-2685", check_block_counters(ctx->super));
1962 +
1963 +       reiser4_spin_unlock_sb(sbinfo);
1964 +}
1965 +
1966 +/* move @count blocks from sb "used" straight to "free" (this used to be done through used2grabbed and grabbed2free) */
1967 +static void
1968 +used2free(reiser4_super_info_data *sbinfo, __u64 count)
1969 +{
1970 +       reiser4_spin_lock_sb(sbinfo);
1971 +
1972 +       sbinfo->blocks_free += count;
1973 +       sub_from_sb_used(sbinfo, count);
1974 +       /* NOTE(review): label "nikita-2685" is also used by used2grabbed() */
1975 +       assert("nikita-2685", check_block_counters(reiser4_get_current_sb()));
1976 +
1977 +       reiser4_spin_unlock_sb(sbinfo);
1978 +}
1979 +
1980 +#if REISER4_DEBUG
1981 +
1982 +/* REISER4_DEBUG only: check "allocated" state of the block range given by @start and @len; forwards to sa_check_blocks() */
1983 +void
1984 +reiser4_check_blocks(const reiser4_block_nr * start, const reiser4_block_nr * len, int desired)
1985 +{
1986 +       sa_check_blocks(start, len, desired);
1987 +}
1988 +
1989 +/* single-block form of reiser4_check_blocks(): checks just *@block */
1990 +void
1991 +reiser4_check_block(const reiser4_block_nr * block, int desired)
1992 +{
1993 +       const reiser4_block_nr single_len = 1;
1994 +
1995 +       reiser4_check_blocks(block, &single_len, desired);
1996 +}
1997 +
1998 +#endif
1999 +
2000 +/* The block deallocation function may either do an actual deallocation through
2001 +   the space allocator plugin or store the deleted block numbers in the atom's
2002 +   delete_set data structure, depending on the BA_DEFER bit of @flags. */
2003 +
2004 +/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks which
2005 +   will be deleted from WORKING bitmap. They might be just unmapped from disk, or
2006 +   freed but disk space is still grabbed by current thread, or these blocks must
2007 +   not be counted in any reiser4 sb block counters, see block_stage_t comment */
2008 +
2009 +/* BA_FORMATTED bit is only used when BA_DEFER is not present: it is used to
2010 +   distinguish blocks allocated for unformatted and formatted nodes */
2011 +
2012 +reiser4_internal int
2013 +reiser4_dealloc_blocks(const reiser4_block_nr * start,
2014 +                      const reiser4_block_nr * len,
2015 +                      block_stage_t target_stage, reiser4_ba_flags_t flags)
2016 +{
2017 +       txn_atom *atom = NULL;
2018 +       int ret;
2019 +       reiser4_context *ctx;
2020 +       reiser4_super_info_data *sbinfo;
2021 +       /* *len is cast explicitly so that the argument always matches %llu */
2022 +       ON_TRACE(TRACE_RESERVE, "reiser4_dealloc_blocks: %llu blocks", (unsigned long long) *len);
2023 +
2024 +       ctx = get_current_context();
2025 +       sbinfo = get_super_private(ctx->super);
2026 +
2027 +       if (REISER4_DEBUG) {
2028 +               assert("zam-431", *len != 0);
2029 +               assert("zam-432", *start != 0);
2030 +               assert("zam-558", !blocknr_is_fake(start));
2031 +
2032 +               reiser4_spin_lock_sb(sbinfo);
2033 +               assert("zam-562", *start < sbinfo->block_count);
2034 +               reiser4_spin_unlock_sb(sbinfo);
2035 +       }
2036 +       /* BA_DEFER: only queue the extent on the atom's delete set; otherwise free it now */
2037 +       if (flags & BA_DEFER) {
2038 +               blocknr_set_entry *bsep = NULL;
2039 +
2040 +               ON_TRACE(TRACE_RESERVE, "put on delete set\n");
2041 +
2042 +               /* storing deleted block numbers in a blocknr set
2043 +                  datastructure for further actual deletion */
2044 +               do {
2045 +                       atom = get_current_atom_locked();
2046 +                       assert("zam-430", atom != NULL);
2047 +
2048 +                       ret = blocknr_set_add_extent(atom, &atom->delete_set, &bsep, start, len);
2049 +
2050 +                       if (ret == -ENOMEM)
2051 +                               return ret;
2052 +
2053 +                       /* This loop might spin at most two times */
2054 +               } while (ret == -E_REPEAT);
2055 +
2056 +               assert("zam-477", ret == 0);
2057 +               assert("zam-433", atom != NULL);
2058 +
2059 +               UNLOCK_ATOM(atom);
2060 +
2061 +       } else {
2062 +               assert("zam-425", get_current_super_private() != NULL);
2063 +               sa_dealloc_blocks(get_space_allocator(ctx->super), *start, *len);
2064 +
2065 +               if (flags & BA_PERMANENT) {
2066 +                       /* These blocks were counted as allocated, we have to revert it
2067 +                        * back if allocation is discarded. */
2068 +                       txn_atom * atom = get_current_atom_locked ();
2069 +                       atom->nr_blocks_allocated -= *len;
2070 +                       UNLOCK_ATOM (atom);
2071 +               }
2072 +               /* move *len blocks from "used" to the counter named by @target_stage */
2073 +               switch (target_stage) {
2074 +               case BLOCK_NOT_COUNTED:
2075 +                       assert("vs-960", flags & BA_FORMATTED);
2076 +
2077 +                       ON_TRACE(TRACE_RESERVE, "moved from used to free\n");
2078 +
2079 +                       /* VITALY: This is what was grabbed for internal/tx-lists/similar only */
2080 +                       used2free(sbinfo, *len);
2081 +                       break;
2082 +
2083 +               case BLOCK_GRABBED:
2084 +
2085 +                       ON_TRACE(TRACE_RESERVE, "moved from used to grabbed\n");
2086 +
2087 +                       used2grabbed(ctx, sbinfo, *len);
2088 +                       break;
2089 +
2090 +               case BLOCK_UNALLOCATED:
2091 +
2092 +                       ON_TRACE(TRACE_RESERVE, "moved from used to fake allocated\n");
2093 +
2094 +                       used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
2095 +                       break;
2096 +
2097 +               case BLOCK_FLUSH_RESERVED: {
2098 +                       txn_atom *atom;
2099 +
2100 +                       ON_TRACE(TRACE_RESERVE, "moved from used to flush reserved\n");
2101 +
2102 +                       atom = get_current_atom_locked();
2103 +                       used2flush_reserved(sbinfo, atom, *len, flags & BA_FORMATTED);
2104 +                       UNLOCK_ATOM(atom);
2105 +                       break;
2106 +               }
2107 +               default:
2108 +                       impossible("zam-532", "wrong block stage");
2109 +               }
2110 +       }
2111 +       /* -ENOMEM from the delete set is the only error returned (see above) */
2112 +       return 0;
2113 +}
2114 +
2115 +/* wrappers for block allocator plugin methods; this one runs sa_pre_commit_hook() and always returns 0 */
2116 +reiser4_internal int
2117 +pre_commit_hook(void)
2118 +{
2119 +       assert("zam-502", get_current_super_private() != NULL);
2120 +       sa_pre_commit_hook();
2121 +       return 0;
2122 +}
2123 +
2124 +/* an actor which applies one delete set entry (start *@a, length *@b) to block allocator data; returns 0 */
2125 +static int
2126 +apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a, const reiser4_block_nr * b, void *data UNUSED_ARG)
2127 +{
2128 +       reiser4_context *ctx;
2129 +       reiser4_super_info_data *sbinfo;
2130 +       /* the length stays one block when @b is NULL */
2131 +       __u64 len = 1;
2132 +
2133 +       ctx = get_current_context();
2134 +       sbinfo = get_super_private(ctx->super);
2135 +       /* @atom is only looked at by the assertion below */
2136 +       assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
2137 +       assert("zam-552", sbinfo != NULL);
2138 +
2139 +       if (b != NULL)
2140 +               len = *b;
2141 +       /* debug builds check, with the sb locked, that the extent lies within the block count */
2142 +       if (REISER4_DEBUG) {
2143 +               reiser4_spin_lock_sb(sbinfo);
2144 +
2145 +               assert("zam-554", *a < reiser4_block_count(ctx->super));
2146 +               assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
2147 +
2148 +               reiser4_spin_unlock_sb(sbinfo);
2149 +       }
2150 +
2151 +       sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
2152 +       /* adjust sb block counters */
2153 +       used2free(sbinfo, len);
2154 +       return 0;
2155 +}
2156 +
2157 +reiser4_internal void
2158 +post_commit_hook(void)
2159 +{
2160 +       txn_atom *atom;
2161 +       /* the atom lock is held only while the atom's stage is asserted */
2162 +       atom = get_current_atom_locked();
2163 +       assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
2164 +       UNLOCK_ATOM(atom);
2165 +       /* NOTE(review): the delete set is walked after UNLOCK_ATOM - confirm no lock is needed here */
2166 +       /* do the block deallocation which was deferred
2167 +          until commit is done */
2168 +       blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
2169 +       /* then run the space allocator's post-commit hook */
2170 +       assert("zam-504", get_current_super_private() != NULL);
2171 +       sa_post_commit_hook();
2172 +}
2173 +
2174 +reiser4_internal void
2175 +post_write_back_hook(void)
2176 +{
2177 +       assert("zam-504", get_current_super_private() != NULL);
2178 +       /* NOTE(review): runs sa_post_commit_hook(), same call and assert label as post_commit_hook() - confirm intended */
2179 +       sa_post_commit_hook();
2180 +}
2181 +
2182 +/*
2183 +   Local variables:
2184 +   c-indentation-style: "K&R"
2185 +   mode-name: "LC"
2186 +   c-basic-offset: 8
2187 +   tab-width: 8
2188 +   fill-column: 120
2189 +   scroll-step: 1
2190 +   End:
2191 +*/
2192 diff -rupN linux-2.6.8-rc3/fs/reiser4/block_alloc.h linux-2.6.8-rc3-a/fs/reiser4/block_alloc.h
2193 --- linux-2.6.8-rc3/fs/reiser4/block_alloc.h    1970-01-01 03:00:00.000000000 +0300
2194 +++ linux-2.6.8-rc3-a/fs/reiser4/block_alloc.h  2004-08-05 21:20:52.765727489 +0400
2195 @@ -0,0 +1,185 @@
2196 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2197 +
2198 +#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
2199 +#define __FS_REISER4_BLOCK_ALLOC_H__
2200 +
2201 +#include "dformat.h"
2202 +#include "forward.h"
2203 +
2204 +#include <linux/types.h>       /* for __u??  */
2205 +#include <linux/fs.h>
2206 +
2207 +/* Mask which, when applied to a given block number, shows whether that block number is a fake one */
2208 +#define REISER4_FAKE_BLOCKNR_BIT_MASK   0x8000000000000000ULL
2209 +/* Mask which isolates a type of object this fake block number was assigned to */
2210 +#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
2211 +
2212 +/* The result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
2213 +   against these two values to tell whether the object is unallocated or a bitmap
2214 +   shadow object (WORKING BITMAP block, look at the plugin/space/bitmap.c) */
2215 +#define REISER4_UNALLOCATED_STATUS_VALUE    0xC000000000000000ULL
2216 +#define REISER4_BITMAP_BLOCKS_STATUS_VALUE  0x8000000000000000ULL
2217 +
2218 +/* specification of how a block allocation is counted in the sb block counters */
2219 +typedef enum {
2220 +       BLOCK_NOT_COUNTED       = 0,    /* reiser4 has no info about this block yet */
2221 +       BLOCK_GRABBED           = 1,    /* free space grabbed for further allocation
2222 +                                          of this block */
2223 +       BLOCK_FLUSH_RESERVED    = 2,    /* block is reserved for flush needs. */
2224 +       BLOCK_UNALLOCATED       = 3,    /* block is used for existing in-memory object
2225 +                                          ( unallocated formatted or unformatted
2226 +                                          node) */
2227 +       BLOCK_ALLOCATED         = 4     /* block is mapped to disk, real on-disk block
2228 +                                          number assigned */
2229 +} block_stage_t;
2230 +
2231 +/* a hint for block allocator */
2232 +struct reiser4_blocknr_hint {
2233 +       /* FIXME: I think we want to add a longterm lock on the bitmap block here.  This
2234 +          is to prevent jnode_flush() calls from interleaving allocations on the same
2235 +          bitmap, once a hint is established. */
2236 +
2237 +       /* search start hint */
2238 +       reiser4_block_nr blk;
2239 +       /* if not zero, it is a region size we search for free blocks in */
2240 +       reiser4_block_nr max_dist;
2241 +       /* level for allocation, may be useful have branch-level and higher
2242 +          write-optimized. */
2243 +       tree_level level;
2244 +       /* block allocator assumes that blocks, which will be mapped to disk,
2245 +          are in this specified block_stage */
2246 +       block_stage_t block_stage;
2247 +       /* If set (1), allocate blocks in backward direction from the end of
2248 +        * disk to the beginning of disk.  Unsigned, so that the bit can hold 1. */
2249 +       unsigned int backward:1;
2250 +
2251 +};
2252 +
2253 +/* These flags control block allocation/deallocation behavior */
2254 +enum reiser4_ba_flags {
2255 +       /* do allocations from reserved (5%) area */
2256 +       BA_RESERVED         = (1 << 0),
2257 +
2258 +       /* block allocator can do commit trying to recover free space */
2259 +       BA_CAN_COMMIT       = (1 << 1),
2260 +
2261 +       /* if operation will be applied to formatted block */
2262 +       BA_FORMATTED        = (1 << 2),
2263 +
2264 +       /* defer actual block freeing until transaction commit */
2265 +       BA_DEFER            = (1 << 3),
2266 +
2267 +       /* allocate blocks for permanent fs objects (formatted or unformatted), not
2268 +          wandered or log blocks */
2269 +       BA_PERMANENT        = (1 << 4),
2270 +
2271 +       /* grab space even if it was disabled */
2272 +       BA_FORCE            = (1 << 5),
2273 +
2274 +       /* use default start value for free blocks search. */
2275 +       BA_USE_DEFAULT_SEARCH_START = (1 << 6)
2276 +};
2277 +
2278 +typedef enum reiser4_ba_flags reiser4_ba_flags_t;
2279 +
2280 +extern void blocknr_hint_init(reiser4_blocknr_hint * hint);
2281 +extern void blocknr_hint_done(reiser4_blocknr_hint * hint);
2282 +extern void update_blocknr_hint_default(const struct super_block *, const reiser4_block_nr *);
2283 +extern void get_blocknr_hint_default(reiser4_block_nr *);
2284 +
2285 +extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block * super);
2286 +
2287 +int assign_fake_blocknr_formatted(reiser4_block_nr *);
2288 +reiser4_block_nr fake_blocknr_unformatted(void);
2289 +
2290 +
2291 +/* free -> grabbed -> fake_allocated -> used */
2292 +
2293 +
2294 +int  reiser4_grab_space           (__u64 count, reiser4_ba_flags_t flags);
2295 +void all_grabbed2free             (void);
2296 +void grabbed2free                 (reiser4_context *,
2297 +                                  reiser4_super_info_data *, __u64 count);
2298 +void fake_allocated2free          (__u64 count, reiser4_ba_flags_t flags);
2299 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
2300 +void grabbed2flush_reserved       (__u64 count);
2301 +int  reiser4_alloc_blocks         (reiser4_blocknr_hint * hint,
2302 +                                  reiser4_block_nr * start,
2303 +                                  reiser4_block_nr * len,
2304 +                                  reiser4_ba_flags_t flags);
2305 +int reiser4_dealloc_blocks        (const reiser4_block_nr *,
2306 +                                  const reiser4_block_nr *,
2307 +                                  block_stage_t, reiser4_ba_flags_t flags);
2308 +
2309 +static inline int reiser4_alloc_block (reiser4_blocknr_hint * hint, reiser4_block_nr * start,
2310 +                                      reiser4_ba_flags_t flags)
2311 +{
2312 +       reiser4_block_nr nr_blocks = 1; /* single-block request; the length written back is dropped */
2313 +       return reiser4_alloc_blocks(hint, start, &nr_blocks, flags);
2314 +}
2315 +
2316 +static inline int reiser4_dealloc_block (const reiser4_block_nr * block, block_stage_t stage, reiser4_ba_flags_t flags)
2317 +{
2318 +       const reiser4_block_nr one = 1;
2319 +       return reiser4_dealloc_blocks(block, &one, stage, flags);
2320 +}
2321 +/* reiser4_grab_space() with BA_FORCE or-ed into @flags; arguments are parenthesized so any expression expands safely. */
2322 +#define reiser4_grab_space_force(count, flags)         \
2323 +       reiser4_grab_space((count), (flags) | BA_FORCE)
2324 +
2325 +extern void grabbed2free_mark(__u64 mark);
2326 +extern int  reiser4_grab_reserved(struct super_block *,
2327 +                                 __u64, reiser4_ba_flags_t);
2328 +extern void reiser4_release_reserved(struct super_block *super);
2329 +
2330 +/* grabbed -> fake_allocated */
2331 +
2332 +/* fake_allocated -> used */
2333 +
2334 +/* used -> fake_allocated -> grabbed -> free */
2335 +
2336 +extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
2337 +
2338 +extern int blocknr_is_fake(const reiser4_block_nr * da);
2339 +
2340 +extern void grabbed2cluster_reserved(int count);
2341 +extern void cluster_reserved2grabbed(int count);
2342 +extern void cluster_reserved2free(int count);
2343 +
2344 +extern int check_block_counters(const struct super_block *);
2345 +
2346 +#if REISER4_DEBUG
2347 +
2348 +extern void reiser4_check_blocks(const reiser4_block_nr *, const reiser4_block_nr *, int);
2349 +extern void reiser4_check_block(const reiser4_block_nr *, int);
2350 +
2351 +#else
2352 +
2353 +#  define reiser4_check_blocks(beg, len, val)  noop
2354 +#  define reiser4_check_block(beg, val)        noop
2355 +
2356 +#endif
2357 +
2358 +#if REISER4_DEBUG_OUTPUT
2359 +extern void print_block_counters(const char *,
2360 +                                const struct super_block *,
2361 +                                txn_atom *atom);
2362 +#else
2363 +#define print_block_counters(p, s, a) noop
2364 +#endif
2365 +
2366 +extern int pre_commit_hook(void);
2367 +extern void post_commit_hook(void);
2368 +extern void post_write_back_hook(void);
2369 +
2370 +#endif                         /* __FS_REISER4_BLOCK_ALLOC_H__ */
2371 +
2372 +/* Make Linus happy.
2373 +   Local variables:
2374 +   c-indentation-style: "K&R"
2375 +   mode-name: "LC"
2376 +   c-basic-offset: 8
2377 +   tab-width: 8
2378 +   fill-column: 120
2379 +   End:
2380 +*/
2381 diff -rupN linux-2.6.8-rc3/fs/reiser4/blocknrset.c linux-2.6.8-rc3-a/fs/reiser4/blocknrset.c
2382 --- linux-2.6.8-rc3/fs/reiser4/blocknrset.c     1970-01-01 03:00:00.000000000 +0300
2383 +++ linux-2.6.8-rc3-a/fs/reiser4/blocknrset.c   2004-08-05 21:20:52.943689952 +0400
2384 @@ -0,0 +1,365 @@
2385 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2386 +
2387 +/* This file contains code for various block number sets used by the atom to
2388 +   track the deleted set and wandered block mappings. */
2389 +
2390 +#include "debug.h"
2391 +#include "dformat.h"
2392 +#include "type_safe_list.h"
2393 +#include "txnmgr.h"
2394 +
2395 +#include <linux/slab.h>
2396 +
2397 +/* The proposed data structure for storing unordered block number sets is a
2398 +   list of elements, each of which contains an array of block number or/and
2399 +   array of block number pairs. That element, called blocknr_set_entry, stores
2400 +   single block numbers from the beginning and pairs (extents) from the end of
2401 +   its ->entries array. The ->nr_singles and ->nr_pairs fields count the
2402 +   single blocks and the pairs, respectively.
2403 +
2404 +   +---------------- blocknr_set_entry->entries ----------------+
2405 +   |block1|block2| ...    <free space>    ... |pair3|pair2|pair1|
2406 +   +------------------------------------------------------------+
2407 +
2408 +   When current blocknr_set_entry is full, allocate a new one. */
2409 +
2410 +/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete
2411 + * set (single blocks and block extents), in that case blocknr pair represent an
2412 + * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs
2413 + * there represent a (real block) -> (wandered block) mapping. */
2414 +
2415 +typedef struct blocknr_pair blocknr_pair;
2416 +
2417 +/* The total size of a blocknr_set_entry. */
2418 +#define BLOCKNR_SET_ENTRY_SIZE 128
2419 +
2420 +/* The number of blocks that can fit the blocknr data area. */
2421 +#define BLOCKNR_SET_ENTRIES_NUMBER               \
2422 +       ((BLOCKNR_SET_ENTRY_SIZE -           \
2423 +         2 * sizeof (unsigned) -            \
2424 +         sizeof (blocknr_set_list_link)) /  \
2425 +        sizeof (reiser4_block_nr))
2426 +
2427 +/* An entry of the blocknr_set */
2428 +struct blocknr_set_entry {
2429 +       unsigned nr_singles;
2430 +       unsigned nr_pairs;
2431 +       blocknr_set_list_link link;
2432 +       reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
2433 +};
2434 +
2435 +/* A pair of blocks as recorded in the blocknr_set_entry data. */
2436 +struct blocknr_pair {
2437 +       reiser4_block_nr a;
2438 +       reiser4_block_nr b;
2439 +};
2440 +
2441 +/* The list definition. */
2442 +TYPE_SAFE_LIST_DEFINE(blocknr_set, blocknr_set_entry, link);
2443 +
2444 +/* Return the number of blocknr slots available in a blocknr_set_entry. */
2445 +/* Audited by: green(2002.06.11) */
2446 +static unsigned
2447 +bse_avail(blocknr_set_entry * bse)
2448 +{
2449 +       unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
2450 +
2451 +       assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
2452 +       cassert(sizeof (blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
2453 +
2454 +       return BLOCKNR_SET_ENTRIES_NUMBER - used;
2455 +}
2456 +
2457 +/* Initialize a blocknr_set_entry. */
2458 +/* Audited by: green(2002.06.11) */
2459 +static void
2460 +bse_init(blocknr_set_entry * bse)
2461 +{
2462 +       bse->nr_singles = 0;
2463 +       bse->nr_pairs = 0;
2464 +       blocknr_set_list_clean(bse);
2465 +}
2466 +
2467 +/* Allocate and initialize a blocknr_set_entry. */
2468 +/* Audited by: green(2002.06.11) */
2469 +static blocknr_set_entry *
2470 +bse_alloc(void)
2471 +{
2472 +       blocknr_set_entry *e;
2473 +
2474 +       if ((e = (blocknr_set_entry *) kmalloc(sizeof (blocknr_set_entry), GFP_KERNEL)) == NULL) {
2475 +               return NULL;
2476 +       }
2477 +
2478 +       bse_init(e);
2479 +
2480 +       return e;
2481 +}
2482 +
2483 +/* Free a blocknr_set_entry. */
2484 +/* Audited by: green(2002.06.11) */
2485 +static void
2486 +bse_free(blocknr_set_entry * bse)
2487 +{
2488 +       kfree(bse);
2489 +}
2490 +
2491 +/* Add a block number to a blocknr_set_entry */
2492 +/* Audited by: green(2002.06.11) */
2493 +static void
2494 +bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
2495 +{
2496 +       assert("jmacd-5099", bse_avail(bse) >= 1);
2497 +
2498 +       bse->entries[bse->nr_singles++] = *block;
2499 +}
2500 +
2501 +/* Get a pair of block numbers */
2502 +/* Audited by: green(2002.06.11) */
2503 +static inline blocknr_pair *
2504 +bse_get_pair(blocknr_set_entry * bse, unsigned pno)
2505 +{
2506 +       assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
2507 +
2508 +       return (blocknr_pair *) (bse->entries + BLOCKNR_SET_ENTRIES_NUMBER - 2 * (pno + 1));
2509 +}
2510 +
2511 +/* Add a pair of block numbers to a blocknr_set_entry */
2512 +/* Audited by: green(2002.06.11) */
2513 +static void
2514 +bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a, const reiser4_block_nr * b)
2515 +{
2516 +       blocknr_pair *pair;
2517 +
2518 +       assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
2519 +
2520 +       pair = bse_get_pair(bse, bse->nr_pairs++);
2521 +
2522 +       pair->a = *a;
2523 +       pair->b = *b;
2524 +}
2525 +
2526 +/* Add either a block or pair of blocks to the block number set.  The first
2527 +   blocknr (@a) must be non-NULL.  If @b is NULL a single blocknr is added, if
2528 +   @b is non-NULL a pair is added.  The block number set belongs to atom, and
2529 +   the call is made with the atom lock held.  There may not be enough space in
2530 +   the current blocknr_set_entry.  If new_bsep points to a non-NULL
2531 +   blocknr_set_entry then it will be added to the blocknr_set and new_bsep
2532 +   will be set to NULL.  If new_bsep contains NULL then the atom lock will be
2533 +   released and a new bse will be allocated in new_bsep.  E_REPEAT will be
2534 +   returned with the atom unlocked for the operation to be tried again.  If
2535 +   the operation succeeds, 0 is returned.  If new_bsep is non-NULL and not
2536 +   used during the call, it will be freed automatically. */
2537 +/* Audited by: green(2002.06.11) */
2538 +static int
2539 +blocknr_set_add(txn_atom * atom,
2540 +               blocknr_set * bset,
2541 +               blocknr_set_entry ** new_bsep, const reiser4_block_nr * a, const reiser4_block_nr * b)
2542 +{
2543 +       blocknr_set_entry *bse;
2544 +       unsigned entries_needed;
2545 +
2546 +       assert("jmacd-5101", a != NULL);
2547 +
2548 +       entries_needed = (b == NULL) ? 1 : 2;
2549 +       if (blocknr_set_list_empty(&bset->entries) || bse_avail(blocknr_set_list_front(&bset->entries))
2550 +           < entries_needed) {
2551 +               /* See if a bse was previously allocated. */
2552 +               if (*new_bsep == NULL) {
2553 +                       UNLOCK_ATOM(atom);
2554 +                       *new_bsep = bse_alloc();
2555 +                       return (*new_bsep != NULL) ? -E_REPEAT : RETERR(-ENOMEM);
2556 +               }
2557 +
2558 +               /* Put it on the head of the list. */
2559 +               blocknr_set_list_push_front(&bset->entries, *new_bsep);
2560 +
2561 +               *new_bsep = NULL;
2562 +       }
2563 +
2564 +       /* Add the single or pair. */
2565 +       bse = blocknr_set_list_front(&bset->entries);
2566 +       if (b == NULL) {
2567 +               bse_put_single(bse, a);
2568 +       } else {
2569 +               bse_put_pair(bse, a, b);
2570 +       }
2571 +
2572 +       /* If new_bsep is non-NULL then there was an allocation race, free this copy. */
2573 +       if (*new_bsep != NULL) {
2574 +               bse_free(*new_bsep);
2575 +               *new_bsep = NULL;
2576 +       }
2577 +
2578 +       return 0;
2579 +}
2580 +
2581 +/* Add an extent to the block set.  If the length is 1, it is treated as a
2582 +   single block (e.g., reiser4_set_add_block). */
2583 +/* Audited by: green(2002.06.11) */
2584 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2585 +   kmalloc might schedule. The only exception is atom spinlock, which is
2586 +   properly freed. */
2587 +reiser4_internal int
2588 +blocknr_set_add_extent(txn_atom * atom,
2589 +                      blocknr_set * bset,
2590 +                      blocknr_set_entry ** new_bsep, const reiser4_block_nr * start, const reiser4_block_nr * len)
2591 +{
2592 +       assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2593 +       return blocknr_set_add(atom, bset, new_bsep, start, *len == 1 ? NULL : len);
2594 +}
2595 +
2596 +/* Add a block pair to the block set. It adds exactly a pair, which is checked
2597 + * by an assertion that both arguments are not null.*/
2598 +/* Audited by: green(2002.06.11) */
2599 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2600 +   kmalloc might schedule. The only exception is atom spinlock, which is
2601 +   properly freed. */
2602 +reiser4_internal int
2603 +blocknr_set_add_pair(txn_atom * atom,
2604 +                    blocknr_set * bset,
2605 +                    blocknr_set_entry ** new_bsep, const reiser4_block_nr * a, const reiser4_block_nr * b)
2606 +{
2607 +       assert("jmacd-5103", a != NULL && b != NULL);
2608 +       return blocknr_set_add(atom, bset, new_bsep, a, b);
2609 +}
2610 +
2611 +/* Initialize a blocknr_set. */
2612 +/* Audited by: green(2002.06.11) */
2613 +reiser4_internal void
2614 +blocknr_set_init(blocknr_set * bset)
2615 +{
2616 +       blocknr_set_list_init(&bset->entries);
2617 +}
2618 +
2619 +/* Release the entries of a blocknr_set. */
2620 +/* Audited by: green(2002.06.11) */
2621 +reiser4_internal void
2622 +blocknr_set_destroy(blocknr_set * bset)
2623 +{
2624 +       while (!blocknr_set_list_empty(&bset->entries)) {
2625 +               bse_free(blocknr_set_list_pop_front(&bset->entries));
2626 +       }
2627 +}
2628 +
2629 +/* Merge blocknr_set entries out of @from into @into. */
2630 +/* Audited by: green(2002.06.11) */
2631 +/* Auditor comments: This merge does not know whether the merged sets contain
2632 +   block pairs (as for wandered sets) or extents, so it cannot really merge
2633 +   overlapping ranges if there are any. So I believe it may lead to
2634 +   some blocks being present several times in one blocknr_set. To help
2635 +   debug such problems it might help to check for duplicate entries on
2636 +   actual processing of this set. Testing this kind of stuff right here is
2637 +   also complicated by the fact that these sets are not sorted, and going
2638 +   through the whole set on each element addition would be a CPU-heavy task */
2639 +reiser4_internal void
2640 +blocknr_set_merge(blocknr_set * from, blocknr_set * into)
2641 +{
2642 +       blocknr_set_entry *bse_into = NULL;
2643 +
2644 +       /* If @from is empty, no work to perform. */
2645 +       if (blocknr_set_list_empty(&from->entries)) {
2646 +               return;
2647 +       }
2648 +
2649 +       /* If @into is not empty, try merging partial-entries. */
2650 +       if (!blocknr_set_list_empty(&into->entries)) {
2651 +
2652 +               /* Neither set is empty, pop the two front members and try to combine them. */
2653 +               blocknr_set_entry *bse_from;
2654 +               unsigned into_avail;
2655 +
2656 +               bse_into = blocknr_set_list_pop_front(&into->entries);
2657 +               bse_from = blocknr_set_list_pop_front(&from->entries);
2658 +
2659 +               /* Combine singles. */
2660 +               for (into_avail = bse_avail(bse_into); into_avail != 0 && bse_from->nr_singles != 0; into_avail -= 1) {
2661 +                       bse_put_single(bse_into, &bse_from->entries[--bse_from->nr_singles]);
2662 +               }
2663 +
2664 +               /* Combine pairs. */
2665 +               for (; into_avail > 1 && bse_from->nr_pairs != 0; into_avail -= 2) {
2666 +                       blocknr_pair *pair = bse_get_pair(bse_from, --bse_from->nr_pairs);
2667 +                       bse_put_pair(bse_into, &pair->a, &pair->b);
2668 +               }
2669 +
2670 +               /* If bse_from is empty, delete it now. */
2671 +               if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2672 +                       bse_free(bse_from);
2673 +               } else {
2674 +                       /* Otherwise, bse_into is full or nearly full (e.g.,
2675 +                          it could have one slot avail and bse_from has one
2676 +                          pair left).  Push it back onto the list.  bse_from
2677 +                          becomes bse_into, which will be the new partial. */
2678 +                       blocknr_set_list_push_front(&into->entries, bse_into);
2679 +                       bse_into = bse_from;
2680 +               }
2681 +       }
2682 +
2683 +       /* Splice lists together. */
2684 +       blocknr_set_list_splice(&into->entries, &from->entries);
2685 +
2686 +       /* Add the partial entry back to the head of the list. */
2687 +       if (bse_into != NULL) {
2688 +               blocknr_set_list_push_front(&into->entries, bse_into);
2689 +       }
2690 +}
2691 +
2692 +/* Apply @actor to every block and pair in @bset. If @delete is set, actor errors are ignored and entries are freed. */
2693 +reiser4_internal int
2694 +blocknr_set_iterator(txn_atom * atom, blocknr_set * bset, blocknr_set_actor_f actor, void *data, int delete)
2695 +{
2696 +
2697 +       blocknr_set_entry *entry;
2698 +
2699 +       assert("zam-429", atom != NULL);
2700 +       assert("zam-430", atom_is_protected(atom));
2701 +       assert("zam-431", bset != 0);
2702 +       assert("zam-432", actor != NULL);
2703 +
2704 +       entry = blocknr_set_list_front(&bset->entries);
2705 +       while (!blocknr_set_list_end(&bset->entries, entry)) {
2706 +               blocknr_set_entry *tmp = blocknr_set_list_next(entry);
2707 +               unsigned int i;
2708 +               int ret;
2709 +
2710 +               for (i = 0; i < entry->nr_singles; i++) {
2711 +                       ret = actor(atom, &entry->entries[i], NULL, data);
2712 +
2713 +                       /* We can't break a loop if delete flag is set. */
2714 +                       if (ret != 0 && !delete)
2715 +                               return ret;
2716 +               }
2717 +
2718 +               for (i = 0; i < entry->nr_pairs; i++) {
2719 +                       struct blocknr_pair *ab;
2720 +
2721 +                       ab = bse_get_pair(entry, i);
2722 +
2723 +                       ret = actor(atom, &ab->a, &ab->b, data);
2724 +
2725 +                       if (ret != 0 && !delete)
2726 +                               return ret;
2727 +               }
2728 +
2729 +               if (delete) {
2730 +                       blocknr_set_list_remove(entry);
2731 +                       bse_free(entry);
2732 +               }
2733 +
2734 +               entry = tmp;
2735 +       }
2736 +
2737 +       return 0;
2738 +}
2739 +
2740 +/*
2741 +   Local variables:
2742 +   c-indentation-style: "K&R"
2743 +   mode-name: "LC"
2744 +   c-basic-offset: 8
2745 +   tab-width: 8
2746 +   fill-column: 120
2747 +   scroll-step: 1
2748 +   End:
2749 +*/
2750 Files linux-2.6.8-rc3/fs/reiser4/bufmgr/block-capture.JPG and linux-2.6.8-rc3-a/fs/reiser4/bufmgr/block-capture.JPG differ
2751 Files linux-2.6.8-rc3/fs/reiser4/bufmgr/block-capture.fm and linux-2.6.8-rc3-a/fs/reiser4/bufmgr/block-capture.fm differ
2752 Files linux-2.6.8-rc3/fs/reiser4/bufmgr/block-capture.pdf and linux-2.6.8-rc3-a/fs/reiser4/bufmgr/block-capture.pdf differ
2753 diff -rupN linux-2.6.8-rc3/fs/reiser4/bufmgr/wander.txt linux-2.6.8-rc3-a/fs/reiser4/bufmgr/wander.txt
2754 --- linux-2.6.8-rc3/fs/reiser4/bufmgr/wander.txt        1970-01-01 03:00:00.000000000 +0300
2755 +++ linux-2.6.8-rc3-a/fs/reiser4/bufmgr/wander.txt      2004-08-05 21:20:52.988680462 +0400
2756 @@ -0,0 +1,184 @@
2757 +
2758 +Before discussing the format of the commit record occupying the
2759 +journal area, we must revisit the topic of free space bitmap
2760 +management.  At the time an atom is closing and formatting its commit
2761 +record, the question is how to deallocate the blocks deleted by the
2762 +atom.  Those blocks become free once the atom commits, but they cannot
2763 +be re-allocated before that point in time.
2764 +
2765 +Modified bitmaps are always part of the overwrite set, meaning copies
2766 +are written to wandered positions (i.e., part of the log) before later
2767 +being overwritten.
2768 +
2769 +We have defined these terms:
2770 +
2771 +WORKING BITMAPS: the "current" in-memory bitmaps
2772 +
2773 +COMMIT BITMAPS: bitmap copies written to wandered, overwrite positions
2774 +
2775 +DELETE SET: the set of deleted blocks plus the set of former positions
2776 +of relocated blocks.  These block positions are deallocated when the
2777 +atom commits.
2778 +
2779 +WANDERED SET: the set of temporary locations used to store overwrite
2780 +blocks before they are actually overwritten.  These block positions
2781 +are deallocated some time after the atom commits, when it is ensured
2782 +that the atom will no longer replay during crash recovery.
2783 +
2784 +Both the delete set and the wandered set are blocks to be deleted, but
2785 +the details of handling these deletions are necessarily different.
2786 +
2787 +---- Consider first the handling of the DELETE SET.
2788 +
2789 +There are two ways to handle the delete set.  Before reading their
2790 +descriptions, let me offer my opinion.  The first is MORE complicated
2791 +but requires LESS data to be logged in the commit record.  The second
2792 +is LESS complicated but requires MORE data to be logged in the commit
2793 +record.
2794 +
2795 +Strategy #1: MORE COMPLICATED, LESS LOGGED DATA
2796 +
2797 +  At the time an atom closes, it creates a snapshot of all the
2798 +  modified bitmaps.  In other words, it creates commit bitmaps which
2799 +  are copies of the working bitmaps.  The delete set is immediately
2800 +  deallocated in the commit bitmaps, which are written to their
2801 +  wandered positions and later overwritten in their actual positions.
2802 +
2803 +  This way, the commit record does not contain any record of the
2804 +  delete set.
2805 +
2806 +  But there are problems with this approach, too.  First, there is
2807 +  extra memory pressure associated with maintaining extra copies of
2808 +  modified bitmaps.  Second, it is less straightforward than it may
2809 +  appear at first.  Suppose there are two atoms that commit in
2810 +  sequence, such that the first does not complete its commit (i.e.,
2811 +  finish all the required writes) before the second prepares to
2812 +  commit.  Which bitmaps does the second committing atom copy as its
2813 +  commit bitmaps?  It does not just copy the working bitmaps, since
2814 +  those do not yet represent the first atom's deallocations.
2815 +
2816 +  Instead, it looks like we would end up maintaining multiple copies
2817 +  of every bitmap.  Each atom's commit bitmaps are the commit bitmaps
2818 +  of the previous atom plus whatever modifications were made by the
2819 +  atom itself.  This means in addition to maintaining the working
2820 +  bitmaps, we end up maintaining separate commit bitmaps.  It is not
2821 +  just as simple as copying the working bitmaps at the time of commit.
2822 +
2823 +  This solution looks far too complicated to me.  I admit that I have
2824 +  not fully tried to understand the complexity, but I do not think the
2825 +  advantages (smaller commit records) will outweigh the additional
2826 +  complexity, not to mention the additional memory pressure.
2827 +
2828 +Strategy #2: LESS COMPLICATED, MORE LOGGED DATA
2829 +
2830 +  In this solution, the commit bitmaps are the same as the working
2831 +  bitmaps--no copies are made.  We commit the working bitmaps without
2832 +  deallocating the delete set and we include the delete set in the
2833 +  commit record instead.
2834 +
2835 +  Before I describe exactly how deallocation works in this case, let
2836 +  me add that there is another reason why this method is preferred.
2837 +  The wandered set has to be deleted after the atom commits, since it
2838 +  does not become available until the atom will no longer be
2839 +  replayed.  With this approach to freeing the delete set, both kinds
2840 +  of deletion can be handled in the same manner, since they both take
2841 +  place after the atom commits.
2842 +
2843 +  In other words, since we have to address deallocating the wandered
2844 +  set after commit anyway, we might as well use the same mechanism for
2845 +  deallocating the delete set.  It means that additional data is
2846 +  logged, but it reduces complexity in my opinion.
2847 +
2848 +  Here's how it works.  The atom stores a record of its delete set in
2849 +  memory.  When a block is deallocated or relocated, the bit is of
2850 +  course not immediately deallocated in the working bitmaps.
2851 +
2852 +  The delete set is included in the commit record, which is written to
2853 +  the journal area.  The delete set is just a set of block numbers, so
2854 +  there are several possible representations.  The implementation
2855 +  could actually dynamically choose the representation to achieve the
2856 +  best compression: (a) list of blocks, (b) bitmap, and (c) extent
2857 +  compression.  The second two options are likely to achieve
2858 +  significant compression of the delete set unless fragmentation
2859 +  becomes a problem.
2860 +
2861 +  The atom maintains its in-memory copy of the delete set until the
2862 +  commit record is flushed to the disk.  At this point, those blocks
2863 +  become available for new atoms to re-allocate.  The atom releases
2864 +  these blocks back into the working bitmaps through the process of
2865 +  "repossession".  The repossession process makes a younger atom
2866 +  responsible for committing a deallocation from a previous atom.
2867 +
2868 +  For each block in the committed atom's delete set, a younger atom is
2869 +  selected (or created) to handle the deallocation of that block.  The
2870 +  working bitmap corresponding to the block being deleted is or was
2871 +  already captured by the younger (repossessing) atom.  The block is
2872 +  simply marked as deallocated in the working bitmap block captured.
2873 +
2874 +  The repossessing atom may immediately use this block or not, but in
2875 +  either case the deallocation is committed once the repossessing atom
2876 +  commits.  For recovery purposes (not discussed here), each atom also
2877 +  includes a list of atoms for which it repossesses.
2878 +
2879 +---- The commit record
2880 +
2881 +The commit record includes three lists:
2882 +
2883 +  DELETE SET: The set of blocks deallocated by this atom, represented
2884 +  as either a list, bitmap, or using extents.
2885 +
2886 +  WANDER SET: A list of block-pairs giving the original location and
2887 +  the temporary wandered location.  During replay the temporary
2888 +  location is copied to the original location.  After replay is no
2889 +  longer needed, the temporary locations are deallocated using
2890 +  repossession as previously described.
2891 +
2892 +  REPOSSESSES FOR SET: A list of the previous atoms for which this atom
2893 +  repossesses deallocated blocks.  This is used to know which atoms'
2894 +  deallocations must be replayed during crash recovery.
2895 +
2896 +I propose that all of this information is included in the commit
2897 +record, which is written to the journal area.  There may be multiple
2898 +journal areas (a significant complication) or there may not, but the
2899 +key point is that all of this data is written into a reserved,
2900 +cyclical journal area.  Because the journal area is reserved and
2901 +written in a simple cyclical manner, there are no allocation decisions
2902 +needed to find space for these commit records.
2903 +
2904 +---- The example
2905 +
2906 +Consider a roughly 50G file being modified in a 100G file system.
2907 +Realize that due to maintaining the preserve set, it is not possible
2908 +to transactionally write a file larger than 50G on a 100G file system.
2909 +In the absolute worst case, no extent compression is possible and the
2910 +best representation of the delete set requires a bitmap covering the
2911 +entire file system.
2912 +
2913 +A 100G file system with 4K blocks has 3.27MB of bitmaps, and this is
2914 +the same as the worst-case representation of the delete set, assuming
2915 +just about every other block is deleted.  In reality, we expect the
2916 +delete set to be much smaller because extent-compression would achieve
2917 +significant savings.
2918 +
2919 +The wander set could possibly be compressed, but that is a more
2920 +difficult task.  Suppose we attempt to overwrite the entire 50GB file
2921 +instead of relocating it.  A 50G file has 13 million blocks, therefore
2922 +the wander set requires storing 13 million block address pairs, i.e.
2923 +26 million block addresses.  With 8-byte block addresses that requires
2924 +writing 210MB of wander set data.  Ouch!
2925 +
2926 +We should hope that the size of the wander set does not grow so large.
2927 +After all, its parent the extent record must be modified in this case,
2928 +so these blocks are all candidates for relocation.  It would take a
2929 +dumb allocate/flush plugin to try to overwrite a 50G file instead of
2930 +relocating it.
2931 +
2932 +---- The conclusion
2933 +
2934 +I maintain that it is much simpler to write all of this data inside
2935 +reserved log areas.  It is possible that we could write this data
2936 +outside the log, but then it will complicate the allocation and
2937 +deallocation procedure, since space for the log itself must then be
2938 +allocated using ordinary methods.
2939 +
2940 +Comments?
2941 diff -rupN linux-2.6.8-rc3/fs/reiser4/carry.c linux-2.6.8-rc3-a/fs/reiser4/carry.c
2942 --- linux-2.6.8-rc3/fs/reiser4/carry.c  1970-01-01 03:00:00.000000000 +0300
2943 +++ linux-2.6.8-rc3-a/fs/reiser4/carry.c        2004-08-05 21:20:52.774725591 +0400
2944 @@ -0,0 +1,1437 @@
2945 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2946 +/* Functions to "carry" tree modification(s) upward. */
2947 +/* Tree is modified one level at a time. As we modify a level we accumulate a
2948 +   set of changes that need to be propagated to the next level.  We manage
2949 +   node locking such that any searches that collide with carrying are
2950 +   restarted, from the root if necessary.
2951 +
2952 +   Insertion of a new item may result in items being moved among nodes and
2953 +   this requires the delimiting key to be updated at the least common parent
2954 +   of the nodes modified to preserve search tree invariants. Also, insertion
2955 +   may require allocation of a new node. A pointer to the new node has to be
2956 +   inserted into some node on the parent level, etc.
2957 +
2958 +   Tree carrying is meant to be analogous to arithmetic carrying.
2959 +
2960 +   A carry operation is always associated with some node (&carry_node).
2961 +
2962 +   Carry process starts with some initial set of operations to be performed
2963 +   and an initial set of already locked nodes.  Operations are performed one
2964 +   by one. Performing each single operation has following possible effects:
2965 +
2966 +    - content of carry node associated with operation is modified
2967 +    - new carry nodes are locked and involved into carry process on this level
2968 +    - new carry operations are posted to the next level
2969 +
2970 +   After all carry operations on this level are done, process is repeated for
2971 +   the accumulated sequence of carry operations for the next level. This
2972 +   starts by trying to lock (in left to right order) all carry nodes
2973 +   associated with carry operations on the parent level. After this, we decide
2974 +   whether more nodes are required on the left of already locked set. If so,
2975 +   all locks taken on the parent level are released, new carry nodes are
2976 +   added, and locking process repeats.
2977 +
2978 +   It may happen that balancing process fails owing to unrecoverable error on
2979 +   some of upper levels of a tree (possible causes are io error, failure to
2980 +   allocate new node, etc.). In this case we should unmount the filesystem,
2981 +   rebooting if it is the root, and possibly advise the use of fsck.
2982 +
2983 +   USAGE:
2984 +
2985 +
2986 +    int some_tree_operation( znode *node, ... )
2987 +    {
2988 +       // Allocate on a stack pool of carry objects: operations and nodes.
2989 +       // Most carry processes will only take objects from here, without
2990 +       // dynamic allocation.
2991 +
2992 +I feel uneasy about this pool.  It adds to code complexity, I understand why it exists, but.... -Hans
2993 +
2994 +       carry_pool  pool;
2995 +       carry_level lowest_level;
2996 +       carry_op   *op;
2997 +
2998 +       init_carry_pool( &pool );
2999 +       init_carry_level( &lowest_level, &pool );
3000 +
3001 +       // operation may be one of:
3002 +       //   COP_INSERT    --- insert new item into node
3003 +       //   COP_CUT       --- remove part of or whole node
3004 +       //   COP_PASTE     --- increase size of item
3005 +       //   COP_DELETE    --- delete pointer from parent node
3006 +       //   COP_UPDATE    --- update delimiting key in least
3007 +       //                     common ancestor of two
3008 +
3009 +       op = post_carry( &lowest_level, operation, node, 0 );
3010 +       if( IS_ERR( op ) || ( op == NULL ) ) {
3011 +           handle error
3012 +       } else {
3013 +           // fill in remaining fields in @op, according to carry.h:carry_op
3014 +           result = carry( &lowest_level, NULL );
3015 +       }
3016 +       done_carry_pool( &pool );
3017 +    }
3018 +
3019 +   When you are implementing node plugin method that participates in carry
3020 +   (shifting, insertion, deletion, etc.), do the following:
3021 +
3022 +   int foo_node_method( znode *node, ..., carry_level *todo )
3023 +   {
3024 +       carry_op   *op;
3025 +
3026 +       ....
3027 +
3028 +       // note that the last argument to node_post_carry() is non-zero
3029 +       // here, because @op is to be applied to the parent of @node, rather
3030 +       // than to the @node itself as in the previous case.
3031 +
3032 +       op = node_post_carry( todo, operation, node, 1 );
3033 +       // fill in remaining fields in @op, according to carry.h:carry_op
3034 +
3035 +       ....
3036 +
3037 +   }
3038 +
3039 +   BATCHING:
3040 +
3041 +   One of the main advantages of level-by-level balancing implemented here is
3042 +   ability to batch updates on a parent level and to perform them more
3043 +   efficiently as a result.
3044 +
3045 +   Description To Be Done (TBD).
3046 +
3047 +   DIFFICULTIES AND SUBTLE POINTS:
3048 +
3049 +   1. complex plumbing is required, because:
3050 +
3051 +       a. effective allocation through pools is needed
3052 +
3053 +       b. target of operation is not exactly known when operation is
3054 +       posted. This is worked around through bitfields in &carry_node and
3055 +       logic in lock_carry_node()
3056 +
3057 +       c. of interaction with locking code: node should be added into sibling
3058 +       list when pointer to it is inserted into its parent, which is some time
3059 +       after node was created. Between these moments, node is somewhat in
3060 +       suspended state and is only registered in the carry lists
3061 +
3062 +    2. whole balancing logic is implemented here, in particular, insertion
3063 +    logic is coded in make_space().
3064 +
3065 +    3. special cases like insertion (add_tree_root()) or deletion
3066 +    (kill_tree_root()) of tree root and morphing of paste into insert
3067 +    (insert_paste()) have to be handled.
3068 +
3069 +    4. there is non-trivial interdependency between allocation of new nodes
3070 +    and almost everything else. This is mainly due to the (1.c) above. I shall
3071 +    write about this later.
3072 +
3073 +*/
3074 +
3075 +#include "forward.h"
3076 +#include "debug.h"
3077 +#include "key.h"
3078 +#include "coord.h"
3079 +#include "plugin/item/item.h"
3080 +#include "plugin/item/extent.h"
3081 +#include "plugin/node/node.h"
3082 +#include "jnode.h"
3083 +#include "znode.h"
3084 +#include "tree_mod.h"
3085 +#include "tree_walk.h"
3086 +#include "block_alloc.h"
3087 +#include "pool.h"
3088 +#include "tree.h"
3089 +#include "carry.h"
3090 +#include "carry_ops.h"
3091 +#include "super.h"
3092 +#include "reiser4.h"
3093 +#include "prof.h"
3094 +
3095 +#include <linux/types.h>
3096 +
3097 +/* level locking/unlocking */
3098 +static int lock_carry_level(carry_level * level);
3099 +static void unlock_carry_level(carry_level * level, int failure);
3100 +static void done_carry_level(carry_level * level);
3101 +static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
3102 +
3103 +int lock_carry_node(carry_level * level, carry_node * node);
3104 +int lock_carry_node_tail(carry_node * node);
3105 +
3106 +/* carry processing proper */
3107 +static int carry_on_level(carry_level * doing, carry_level * todo);
3108 +
3109 +/* handlers for carry operations. */
3110 +
3111 +static void fatal_carry_error(carry_level * doing, int ecode);
3112 +static int add_new_root(carry_level * level, carry_node * node, znode * fake);
3113 +
3114 +static int carry_estimate_reserve(carry_level * level);
3115 +
3116 +#if REISER4_DEBUG
3117 +typedef enum {
3118 +       CARRY_TODO,
3119 +       CARRY_DOING
3120 +} carry_queue_state;
3121 +static int carry_level_invariant(carry_level * level, carry_queue_state state);
3122 +#endif
3123 +
3124 +/* main entry point for tree balancing.
3125 +
3126 +   Tree carry performs operations from @doing and while doing so accumulates
3127 +   information about operations to be performed on the next level ("carried"
3128 +   to the parent level). Carried operations are performed, causing possibly
3129 +   more operations to be carried upward etc. carry() takes care of
3130 +   locking and pinning znodes while operating on them.
3131 +
3132 +   For usage, see comment at the top of fs/reiser4/carry.c
3133 +
3134 +*/
3135 +reiser4_internal int
3136 +carry(carry_level * doing /* set of carry operations to be performed */ ,
3137 +      carry_level * done       /* set of nodes, already performed at the
3138 +                                * previous level. NULL in most cases */ )
3139 +{
3140 +       int result = 0;
3141 +       carry_level done_area;
3142 +       carry_level todo_area;
3143 +       /* queue of new requests */
3144 +       carry_level *todo;
3145 +       int wasreserved;
3146 +       int reserve;
3147 +       ON_DEBUG(STORE_COUNTERS;)
3148 +
3149 +       assert("nikita-888", doing != NULL);
3150 +
3151 +       trace_stamp(TRACE_CARRY);
3152 +
3153 +       todo = &todo_area;
3154 +       init_carry_level(todo, doing->pool);
3155 +       if (done == NULL) {
3156 +               /* queue of requests performed on the previous level */
3157 +               done = &done_area;
3158 +               init_carry_level(done, doing->pool);
3159 +       }
3160 +
3161 +       wasreserved = perthread_pages_count();
3162 +       reserve = carry_estimate_reserve(doing);
3163 +       result = perthread_pages_reserve(reserve, GFP_KERNEL);
3164 +       if (result != 0)
3165 +               return result;
3166 +
3167 +       /* iterate until there is nothing more to do */
3168 +       while (result == 0 && carry_op_num(doing) > 0) {
3169 +               carry_level *tmp;
3170 +
3171 +               ON_STATS(todo->level_no = doing->level_no + 1);
3172 +
3173 +               /* at this point @done is locked. */
3174 +               /* repeat lock/do/unlock while
3175 +
3176 +                  (1) lock_carry_level() fails due to deadlock avoidance, or
3177 +
3178 +                  (2) carry_on_level() decides that more nodes have to
3179 +                  be involved.
3180 +
3181 +                  (3) some unexpected error occurred while balancing on the
3182 +                  upper levels. In this case all changes are rolled back.
3183 +
3184 +               */
3185 +               while (1) {
3186 +                       result = lock_carry_level(doing);
3187 +                       if (result == 0) {
3188 +                               /* perform operations from @doing and
3189 +                                  accumulate new requests in @todo */
3190 +                               result = carry_on_level(doing, todo);
3191 +                               if (result == 0)
3192 +                                       break;
3193 +                               else if (result != -E_REPEAT ||
3194 +                                        !doing->restartable) {
3195 +                                       warning("nikita-1043",
3196 +                                               "Fatal error during carry: %i",
3197 +                                               result);
3198 +                                       print_level("done", done);
3199 +                                       print_level("doing", doing);
3200 +                                       print_level("todo", todo);
3201 +                                       /* do some rough stuff like aborting
3202 +                                          all pending transcrashes and thus
3203 +                                          pushing tree back to the consistent
3204 +                                          state. Alternatively, just panic.
3205 +                                       */
3206 +                                       fatal_carry_error(doing, result);
3207 +                                       return result;
3208 +                               }
3209 +                       } else if (result != -E_REPEAT) {
3210 +                               fatal_carry_error(doing, result);
3211 +                               return result;
3212 +                       }
3213 +                       reiser4_stat_level_inc(doing, carry_restart);
3214 +                       unlock_carry_level(doing, 1);
3215 +               }
3216 +               /* at this point @done can be safely unlocked */
3217 +               done_carry_level(done);
3218 +               reiser4_stat_level_inc(doing, carry_done);
3219 +               /* cyclically shift queues */
3220 +               tmp = done;
3221 +               done = doing;
3222 +               doing = todo;
3223 +               todo = tmp;
3224 +               init_carry_level(todo, doing->pool);
3225 +
3226 +               /* give other threads chance to run */
3227 +               preempt_point();
3228 +       }
3229 +       done_carry_level(done);
3230 +
3231 +       assert("nikita-3460", perthread_pages_count() - wasreserved >= 0);
3232 +       perthread_pages_release(perthread_pages_count() - wasreserved);
3233 +
3234 +       /* all counters, but x_refs should remain the same. x_refs can change
3235 +          owing to transaction manager */
3236 +       ON_DEBUG(CHECK_COUNTERS;)
3237 +       return result;
3238 +}
3239 +
3240 +/* perform carry operations on given level.
3241 +
3242 +   Optimizations proposed by pooh:
3243 +
3244 +   (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
3245 +   required;
3246 +
3247 +   (2) unlock node if there are no more operations to be performed upon it and
3248 +   node didn't add any operation to @todo. This can be implemented by
3249 +   attaching to each node two counters: counter of operations working on this
3250 +   node and counter of operations carried upward from this node.
3251 +
3252 +*/
3253 +static int
3254 +carry_on_level(carry_level * doing     /* queue of carry operations to
3255 +                                        * do on this level */ ,
3256 +              carry_level * todo       /* queue where new carry
3257 +                                        * operations to be performed on
3258 +                                        * the parent level are
3259 +                                        * accumulated during @doing
3260 +                                        * processing. */ )
3261 +{
3262 +       int result;
3263 +       int (*f) (carry_op *, carry_level *, carry_level *);
3264 +       carry_op *op;
3265 +       carry_op *tmp_op;
3266 +
3267 +       assert("nikita-1034", doing != NULL);
3268 +       assert("nikita-1035", todo != NULL);
3269 +
3270 +       trace_stamp(TRACE_CARRY);
3271 +
3272 +       /* node can be inconsistent while in-transit */
3273 +       DISABLE_NODE_CHECK;
3274 +
3275 +       /* @doing->nodes are locked. */
3276 +
3277 +       /* This function can be split into two phases: analysis and modification.
3278 +
3279 +          Analysis calculates precisely what items should be moved between
3280 +          nodes. This information is gathered in some structures attached to
3281 +          each carry_node in a @doing queue. Analysis also determines whether
3282 +          new nodes are to be allocated etc.
3283 +
3284 +          After analysis is completed, actual modification is performed. Here
3285 +          we can take advantage of "batch modification": if there are several
3286 +          operations acting on the same node, modifications can be performed
3287 +          more efficiently when batched together.
3288 +
3289 +          Above is an optimization left for the future.
3290 +       */
3291 +       /* Important, but delayed optimization: it's possible to batch
3292 +          operations together and perform them more efficiently as a
3293 +          result. For example, deletion of several neighboring items from a
3294 +          node can be converted to a single ->cut() operation.
3295 +
3296 +          Before processing queue, it should be scanned and "mergeable"
3297 +          operations merged.
3298 +       */
3299 +       result = 0;
3300 +       for_all_ops(doing, op, tmp_op) {
3301 +               carry_opcode opcode;
3302 +
3303 +               assert("nikita-1041", op != NULL);
3304 +               opcode = op->op;
3305 +               assert("nikita-1042", op->op < COP_LAST_OP);
3306 +               f = op_dispatch_table[op->op].handler;
3307 +               result = f(op, doing, todo);
3308 +               /* locking can fail with -E_REPEAT. Any different error is fatal
3309 +                  and will be handled by fatal_carry_error() sledgehammer.
3310 +               */
3311 +               if (result != 0)
3312 +                       break;
3313 +       }
3314 +       if (result == 0) {
3315 +               carry_plugin_info info;
3316 +               carry_node *scan;
3317 +               carry_node *tmp_scan;
3318 +
3319 +               info.doing = doing;
3320 +               info.todo = todo;
3321 +
3322 +               assert("nikita-3002", carry_level_invariant(doing, CARRY_DOING));
3323 +               for_all_nodes(doing, scan, tmp_scan) {
3324 +                       znode *node;
3325 +
3326 +                       node = carry_real(scan);
3327 +                       assert("nikita-2547", node != NULL);
3328 +                       if (node_is_empty(node)) {
3329 +                               result = node_plugin_by_node(node)->prepare_removal(node, &info);
3330 +                               if (result != 0)
3331 +                                       break;
3332 +                       }
3333 +               }
3334 +       }
3335 +       ENABLE_NODE_CHECK;
3336 +       return result;
3337 +}
3338 +
3339 +/* post carry operation
3340 +
3341 +   This is main function used by external carry clients: node layout plugins
3342 +   and tree operations to create new carry operation to be performed on some
3343 +   level.
3344 +
3345 +   New operation will be included in the @level queue. To actually perform it,
3346 +   call carry( level, ... ). This function takes write lock on @node. Carry
3347 +   manages all its locks by itself, don't worry about this.
3348 +
3349 +   This function adds operation and node at the end of the queue. It is up to
3350 +   caller to guarantee proper ordering of node queue.
3351 +
3352 +*/
3353 +reiser4_internal carry_op *
3354 +post_carry(carry_level * level /* queue where new operation is to
3355 +                                * be posted at */ ,
3356 +          carry_opcode op /* opcode of operation */ ,
3357 +          znode * node         /* node on which this operation
3358 +                                * will operate */ ,
3359 +          int apply_to_parent_p        /* whether operation will operate
3360 +                                        * directly on @node or on its
3361 +                                        * parent. */ )
3362 +{
3363 +       carry_op *result;
3364 +       carry_node *child;
3365 +
3366 +       assert("nikita-1046", level != NULL);
3367 +       assert("nikita-1788", znode_is_write_locked(node));
3368 +
3369 +       result = add_op(level, POOLO_LAST, NULL);
3370 +       if (IS_ERR(result))
3371 +               return result;
3372 +       child = add_carry(level, POOLO_LAST, NULL);
3373 +       if (IS_ERR(child)) {
3374 +               reiser4_pool_free(&level->pool->op_pool, &result->header);
3375 +               return (carry_op *) child;
3376 +       }
3377 +       result->node = child;
3378 +       result->op = op;
3379 +       child->parent = apply_to_parent_p;
3380 +       if (ZF_ISSET(node, JNODE_ORPHAN))
3381 +               child->left_before = 1;
3382 +       child->node = node;
3383 +       return result;
3384 +}
3385 +
3386 +/* number of carry operations in a @level */
3387 +reiser4_internal int
3388 +carry_op_num(const carry_level * level)
3389 +{
3390 +       return level->ops_num;
3391 +}
3392 +
3393 +/* initialise carry queue */
3394 +reiser4_internal void
3395 +init_carry_level(carry_level * level /* level to initialise */ ,
3396 +                carry_pool * pool      /* pool @level will allocate objects
3397 +                                        * from */ )
3398 +{
3399 +       assert("nikita-1045", level != NULL);
3400 +       assert("nikita-967", pool != NULL);
3401 +
3402 +       xmemset(level, 0, sizeof *level);
3403 +       level->pool = pool;
3404 +
3405 +       pool_level_list_init(&level->nodes);
3406 +       pool_level_list_init(&level->ops);
3407 +}
3408 +
3409 +/* initialise pools within queue */
3410 +reiser4_internal void
3411 +init_carry_pool(carry_pool * pool /* pool to initialise */ )
3412 +{
3413 +       assert("nikita-945", pool != NULL);
3414 +
3415 +       reiser4_init_pool(&pool->op_pool, sizeof (carry_op), CARRIES_POOL_SIZE, (char *) pool->op);
3416 +       reiser4_init_pool(&pool->node_pool, sizeof (carry_node), NODES_LOCKED_POOL_SIZE, (char *) pool->node);
3417 +}
3418 +
3419 +/* finish with queue pools */
3420 +reiser4_internal void
3421 +done_carry_pool(carry_pool * pool UNUSED_ARG /* pool to destroy */ )
3422 +{
3423 +       reiser4_done_pool(&pool->op_pool);
3424 +       reiser4_done_pool(&pool->node_pool);
3425 +}
3426 +
3427 +/* add new carry node to the @level.
3428 +
3429 +   Returns pointer to the new carry node allocated from pool.  It's up to
3430 +   callers to maintain proper order in the @level. Assumption is that if carry
3431 +   nodes on one level are already sorted and modifications are performed from
3432 +   left to right, carry nodes added on the parent level will be ordered
3433 +   automatically. To control ordering use @order and @reference parameters.
3434 +
3435 +*/
3436 +reiser4_internal carry_node *
3437 +add_carry_skip(carry_level * level     /* &carry_level to add node
3438 +                                        * to */ ,
3439 +              pool_ordering order      /* where to insert: at the
3440 +                                        * beginning of @level,
3441 +                                        * before @reference, after
3442 +                                        * @reference, at the end
3443 +                                        * of @level */ ,
3444 +              carry_node * reference   /* reference node for
3445 +                                        * insertion */ )
3446 +{
3447 +       ON_DEBUG(carry_node * orig_ref = reference);
3448 +
3449 +       trace_stamp(TRACE_CARRY);
3450 +       if (order == POOLO_BEFORE) {
3451 +               reference = find_left_carry(reference, level);
3452 +               if (reference == NULL)
3453 +                       reference = carry_node_front(level);
3454 +               else
3455 +                       reference = carry_node_next(reference);
3456 +       } else if (order == POOLO_AFTER) {
3457 +               reference = find_right_carry(reference, level);
3458 +               if (reference == NULL)
3459 +                       reference = carry_node_back(level);
3460 +               else
3461 +                       reference = carry_node_prev(reference);
3462 +       }
3463 +       assert("nikita-2209",
3464 +              ergo(orig_ref != NULL,
3465 +                   carry_real(reference) == carry_real(orig_ref)));
3466 +       return add_carry(level, order, reference);
3467 +}
3468 +
3469 +reiser4_internal carry_node *
3470 +add_carry(carry_level * level  /* &carry_level to add node
3471 +                                * to */ ,
3472 +         pool_ordering order   /* where to insert: at the
3473 +                                * beginning of @level, before
3474 +                                * @reference, after @reference,
3475 +                                * at the end of @level */ ,
3476 +         carry_node * reference        /* reference node for
3477 +                                        * insertion */ )
3478 +{
3479 +       carry_node *result;
3480 +
3481 +       result = (carry_node *) add_obj(&level->pool->node_pool, &level->nodes, order, &reference->header);
3482 +       if (!IS_ERR(result) && (result != NULL))
3483 +               ++level->nodes_num;
3484 +       return result;
3485 +}
3486 +
3487 +/* add new carry operation to the @level.
3488 +
3489 +   Returns pointer to the new carry operation allocated from pool. It's up to
3490 +   callers to maintain proper order in the @level. To control ordering use
3491 +   @order and @reference parameters.
3492 +
3493 +*/
3494 +reiser4_internal carry_op *
3495 +add_op(carry_level * level /* &carry_level to add node to */ ,
3496 +       pool_ordering order     /* where to insert: at the beginning of
3497 +                                * @level, before @reference, after
3498 +                                * @reference, at the end of @level */ ,
3499 +       carry_op * reference /* reference node for insertion */ )
3500 +{
3501 +       carry_op *result;
3502 +
3503 +       trace_stamp(TRACE_CARRY);
3504 +       result = (carry_op *) add_obj(&level->pool->op_pool, &level->ops, order, &reference->header);
3505 +       if (!IS_ERR(result) && (result != NULL))
3506 +               ++level->ops_num;
3507 +       return result;
3508 +}
3509 +
3510 +/* Return node on the right of which @node was created.
3511 +
3512 +   Each node is created on the right of some existing node (or it is new root,
3513 +   which is special case not handled here).
3514 +
3515 +   @node is new node created on some level, but not yet inserted into its
3516 +   parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
3517 +
3518 +*/
3519 +reiser4_internal carry_node *
3520 +find_begetting_brother(carry_node * node       /* node to start search
3521 +                                                * from */ ,
3522 +                      carry_level * kin UNUSED_ARG     /* level to
3523 +                                                        * scan */ )
3524 +{
3525 +       carry_node *scan;
3526 +
3527 +       assert("nikita-1614", node != NULL);
3528 +       assert("nikita-1615", kin != NULL);
3529 +       assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
3530 +       assert("nikita-1619", ergo(carry_real(node) != NULL,
3531 +                                  ZF_ISSET(carry_real(node), JNODE_ORPHAN)));
3532 +
3533 +       for (scan = node;; scan = carry_node_prev(scan)) {
3534 +               assert("nikita-1617", !carry_node_end(kin, scan));
3535 +               if ((scan->node != node->node) && !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
3536 +                       assert("nikita-1618", carry_real(scan) != NULL);
3537 +                       break;
3538 +               }
3539 +       }
3540 +       return scan;
3541 +}
3542 +
3543 +static cmp_t
3544 +carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
3545 +{
3546 +       assert("nikita-2199", n1 != NULL);
3547 +       assert("nikita-2200", n2 != NULL);
3548 +
3549 +       if (n1 == n2)
3550 +               return EQUAL_TO;
3551 +       while (1) {
3552 +               n1 = carry_node_next(n1);
3553 +               if (carry_node_end(level, n1))
3554 +                       return GREATER_THAN;
3555 +               if (n1 == n2)
3556 +                       return LESS_THAN;
3557 +       }
3558 +       impossible("nikita-2201", "End of level reached");
3559 +}
3560 +
3561 +reiser4_internal carry_node *
3562 +find_carry_node(carry_level * level, const znode * node)
3563 +{
3564 +       carry_node *scan;
3565 +       carry_node *tmp_scan;
3566 +
3567 +       assert("nikita-2202", level != NULL);
3568 +       assert("nikita-2203", node != NULL);
3569 +
3570 +       for_all_nodes(level, scan, tmp_scan) {
3571 +               if (carry_real(scan) == node)
3572 +                       return scan;
3573 +       }
3574 +       return NULL;
3575 +}
3576 +
3577 +reiser4_internal znode *
3578 +carry_real(const carry_node * node)
3579 +{
3580 +       assert("nikita-3061", node != NULL);
3581 +
3582 +       return node->lock_handle.node;
3583 +}
3584 +
3585 +reiser4_internal carry_node *
3586 +insert_carry_node(carry_level * doing, carry_level * todo, const znode * node)
3587 +{
3588 +       carry_node *base;
3589 +       carry_node *scan;
3590 +       carry_node *tmp_scan;
3591 +       carry_node *proj;
3592 +
3593 +       base = find_carry_node(doing, node);
3594 +       assert("nikita-2204", base != NULL);
3595 +
3596 +       for_all_nodes(todo, scan, tmp_scan) {
3597 +               proj = find_carry_node(doing, scan->node);
3598 +               assert("nikita-2205", proj != NULL);
3599 +               if (carry_node_cmp(doing, proj, base) != LESS_THAN)
3600 +                       break;
3601 +       }
3602 +       return scan;
3603 +}
3604 +
3605 +reiser4_internal carry_node *
3606 +add_carry_atplace(carry_level *doing, carry_level *todo, znode *node)
3607 +{
3608 +       carry_node *reference;
3609 +
3610 +       assert("nikita-2994", doing != NULL);
3611 +       assert("nikita-2995", todo != NULL);
3612 +       assert("nikita-2996", node != NULL);
3613 +
3614 +       reference = insert_carry_node(doing, todo, node);
3615 +       assert("nikita-2997", reference != NULL);
3616 +
3617 +       return add_carry(todo, POOLO_BEFORE, reference);
3618 +}
3619 +
3620 +/* like post_carry(), but designed to be called from node plugin methods.
3621 +   This function is different from post_carry() in that it finds proper place
3622 +   to insert node in the queue. */
3623 +reiser4_internal carry_op *
3624 +node_post_carry(carry_plugin_info * info       /* carry parameters
3625 +                                                * passed down to node
3626 +                                                * plugin */ ,
3627 +               carry_opcode op /* opcode of operation */ ,
3628 +               znode * node    /* node on which this
3629 +                                * operation will operate */ ,
3630 +               int apply_to_parent_p   /* whether operation will
3631 +                                        * operate directly on @node
3632 +                                        * or on it parent. */ )
3633 +{
3634 +       carry_op *result;
3635 +       carry_node *child;
3636 +
3637 +       assert("nikita-2207", info != NULL);
3638 +       assert("nikita-2208", info->todo != NULL);
3639 +
3640 +       if (info->doing == NULL)
3641 +               return post_carry(info->todo, op, node, apply_to_parent_p);
3642 +
3643 +       result = add_op(info->todo, POOLO_LAST, NULL);
3644 +       if (IS_ERR(result))
3645 +               return result;
3646 +       child = add_carry_atplace(info->doing, info->todo, node);
3647 +       if (IS_ERR(child)) {
3648 +               reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
3649 +               return (carry_op *) child;
3650 +       }
3651 +       result->node = child;
3652 +       result->op = op;
3653 +       child->parent = apply_to_parent_p;
3654 +       if (ZF_ISSET(node, JNODE_ORPHAN))
3655 +               child->left_before = 1;
3656 +       child->node = node;
3657 +       return result;
3658 +}
3659 +
3660 +/* lock all carry nodes in @level */
3661 +static int
3662 +lock_carry_level(carry_level * level /* level to lock */ )
3663 +{
3664 +       int result;
3665 +       carry_node *node;
3666 +       carry_node *tmp_node;
3667 +
3668 +       assert("nikita-881", level != NULL);
3669 +       assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
3670 +
3671 +       trace_stamp(TRACE_CARRY);
3672 +
3673 +       /* lock nodes from left to right */
3674 +       result = 0;
3675 +       for_all_nodes(level, node, tmp_node) {
3676 +               result = lock_carry_node(level, node);
3677 +               if (result != 0)
3678 +                       break;
3679 +       }
3680 +       return result;
3681 +}
3682 +
3683 +/* Synchronize delimiting keys between @node and its left neighbor.
3684 +
3685 +   To reduce contention on dk key and simplify carry code, we synchronize
3686 +   delimiting keys only when carry ultimately leaves tree level (carrying
3687 +   changes upward) and unlocks nodes at this level.
3688 +
3689 +   This function first finds left neighbor of @node and then updates left
3690 +   neighbor's right delimiting key to coincide with least key in @node.
3691 +
3692 +*/
3693 +static void
3694 +sync_dkeys(znode *spot /* node to update */)
3695 +{
3696 +       reiser4_key pivot;
3697 +       reiser4_tree *tree;
3698 +
3699 +       assert("nikita-1610", spot != NULL);
3700 +       assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
3701 +
3702 +       tree = znode_get_tree(spot);
3703 +       WLOCK_DK(tree);
3704 +
3705 +       assert("nikita-2192", znode_is_loaded(spot));
3706 +
3707 +       /* sync left delimiting key of @spot with key in its leftmost item */
3708 +       if (node_is_empty(spot))
3709 +               pivot = *znode_get_rd_key(spot);
3710 +       else
3711 +               leftmost_key_in_node(spot, &pivot);
3712 +
3713 +       znode_set_ld_key(spot, &pivot);
3714 +
3715 +       RLOCK_TREE(tree);
3716 +       /* there can be sequence of empty nodes pending removal on the left of
3717 +          @spot. Scan them and update their left and right delimiting keys to
3718 +          match left delimiting key of @spot. Also, update right delimiting
3719 +          key of first non-empty left neighbor.
3720 +       */
3721 +       while (1) {
3722 +               if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
3723 +                       break;
3724 +
3725 +               spot = spot->left;
3726 +               if (spot == NULL)
3727 +                       break;
3728 +
3729 +#if 0
3730 +               /* on the leaf level we can only increase right delimiting key
3731 +                * of a node on which we don't hold a long term lock. */
3732 +               assert("nikita-2930",
3733 +                      ergo(!znode_is_write_locked(spot) &&
3734 +                           znode_get_level(spot) == LEAF_LEVEL,
3735 +                           keyge(&pivot, znode_get_rd_key(spot))));
3736 +#endif
3737 +
3738 +               znode_set_rd_key(spot, &pivot);
3739 +               /* don't sink into the domain of another balancing */
3740 +               if (!znode_is_write_locked(spot))
3741 +                       break;
3742 +               if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
3743 +                       znode_set_ld_key(spot, &pivot);
3744 +               else
3745 +                       break;
3746 +       }
3747 +
3748 +       RUNLOCK_TREE(tree);
3749 +       WUNLOCK_DK(tree);
3750 +}
3751 +
3752 +void
3753 +check_dkeys(const znode *node);
3754 +
3755 +/* unlock all carry nodes in @level; unless @failure, delimiting keys are synced first */
3756 +static void
3757 +unlock_carry_level(carry_level * level /* level to unlock */ ,
3758 +                  int failure  /* true if unlocking owing to
3759 +                                * failure */ )
3760 +{
3761 +       carry_node *node;
3762 +       carry_node *tmp_node;
3763 +
3764 +       assert("nikita-889", level != NULL);
3765 +
3766 +       trace_stamp(TRACE_CARRY);
3767 +
3768 +       if (!failure) {
3769 +               znode *spot;
3770 +
3771 +               spot = NULL;
3772 +               /* update delimiting keys; sync_dkeys() is skipped when the real node equals the previous one */
3773 +               for_all_nodes(level, node, tmp_node) {
3774 +                       if (carry_real(node) != spot) {
3775 +                               spot = carry_real(node);
3776 +                               sync_dkeys(spot);
3777 +                       }
3778 +               }
3779 +       }
3780 +
3781 +       /* nodes can be unlocked in arbitrary order.  In preemptible
3782 +          environment it's better to unlock in reverse order of locking,
3783 +          though.
3784 +       */
3785 +       for_all_nodes_back(level, node, tmp_node) {
3786 +               /* all allocated nodes should be already linked to their
3787 +                  parents at this moment. */
3788 +               assert("nikita-1631", ergo(!failure, !ZF_ISSET(carry_real(node),
3789 +                                                              JNODE_ORPHAN)));
3790 +               if (!failure)
3791 +                       node_check(carry_real(node), REISER4_NODE_DKEYS);
3792 +               ON_DEBUG(check_dkeys(carry_real(node)));
3793 +               unlock_carry_node(level, node, failure);
3794 +       }
3795 +       level->new_root = NULL; /* forget the root recorded by add_new_root() */
3796 +}
3797 +
3798 +/* finish with @level
3799 +
3800 +   Unlock nodes (as on success: unlock_carry_level(@level, 0)), then return every carry node and carry op to its pool */
3801 +static void
3802 +done_carry_level(carry_level * level /* level to finish */ )
3803 +{
3804 +       carry_node *node;
3805 +       carry_node *tmp_node;
3806 +       carry_op *op;
3807 +       carry_op *tmp_op;
3808 +
3809 +       assert("nikita-1076", level != NULL);
3810 +
3811 +       trace_stamp(TRACE_CARRY);
3812 +
3813 +       unlock_carry_level(level, 0);
3814 +       for_all_nodes(level, node, tmp_node) {
3815 +               assert("nikita-2113", locks_list_is_clean(&node->lock_handle));
3816 +               assert("nikita-2114", owners_list_is_clean(&node->lock_handle));
3817 +               reiser4_pool_free(&level->pool->node_pool, &node->header);
3818 +       }
3819 +       for_all_ops(level, op, tmp_op)
3820 +           reiser4_pool_free(&level->pool->op_pool, &op->header);
3821 +}
3822 +
3823 +/* helper function to complete locking of carry node
3824 +
3825 +   Finish locking of carry node. There are several ways in which new carry
3826 +   node can be added into carry level and locked. Normal is through
3827 +   lock_carry_node(), but also from find_{left|right}_neighbor(). This
3828 +   function factors out common final part of all locking scenarios. It
3829 +   supposes that @node -> lock_handle is lock handle for lock just taken and
3830 +   fills ->real_node from this lock handle.
3831 +   Sets @node->unlock and returns result of zload() on carry_real(@node).
3832 +*/
3833 +reiser4_internal int
3834 +lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
3835 +{
3836 +       assert("nikita-1052", node != NULL);
3837 +       assert("nikita-1187", carry_real(node) != NULL);
3838 +       assert("nikita-1188", !node->unlock);
3839 +
3840 +       node->unlock = 1;
3841 +       /* Load node content into memory and install node plugin by
3842 +          looking at the node header.
3843 +
3844 +          Most of the time this call is cheap because the node is
3845 +          already in memory.
3846 +
3847 +          Corresponding zrelse() is in unlock_carry_node()
3848 +       */
3849 +       return zload(carry_real(node));
3850 +}
3851 +
3852 +/* lock carry node
3853 +
3854 +   "Resolve" node to real znode, lock it and mark as locked.
3855 +   This requires recursive locking of znodes.
3856 +
3857 +   When operation is posted to the parent level, node it will be applied to is
3858 +   not yet known. For example, when shifting data between two nodes,
3859 +   delimiting key has to be updated in parent or parents of nodes involved. But
3860 +   their parents are not yet locked and, moreover, said nodes can be reparented
3861 +   by concurrent balancing.
3862 +
3863 +   To work around this, carry operation is applied to special "carry node"
3864 +   rather than to the znode itself. Carry node consists of some "base" or
3865 +   "reference" znode and flags indicating how to get to the target of carry
3866 +   operation (->real_node field of carry_node) from base.
3867 +   Returns 0 on success, non-zero error code (e.g. -EIO) otherwise.
3868 +*/
3869 +reiser4_internal int
3870 +lock_carry_node(carry_level * level /* level @node is in */ ,
3871 +               carry_node * node /* node to lock */ )
3872 +{
3873 +       int result;
3874 +       znode *reference_point;
3875 +       lock_handle lh;
3876 +       lock_handle tmp_lh;
3877 +
3878 +       assert("nikita-887", level != NULL);
3879 +       assert("nikita-882", node != NULL);
3880 +
3881 +       trace_stamp(TRACE_CARRY);
3882 +
3883 +       result = 0;
3884 +       reference_point = node->node;
3885 +       init_lh(&lh);
3886 +       init_lh(&tmp_lh);
3887 +       if (node->left_before) {
3888 +               /* handling of new nodes, allocated on the previous level:
3889 +
3890 +                  some carry ops were probably posted from the new node, but
3891 +                  this node neither has parent pointer set, nor is
3892 +                  connected. This will be done in ->create_hook() for
3893 +                  internal item.
3894 +
3895 +                  Nonetheless, parent of new node has to be locked. To do
3896 +                  this, first go to the "left" in the carry order. This
3897 +                  depends on the decision to always allocate new node on the
3898 +                  right of existing one.
3899 +
3900 +                  Loop handles case when multiple nodes, all orphans, were
3901 +                  inserted.
3902 +
3903 +                  Strictly speaking, taking tree lock is not necessary here,
3904 +                  because all nodes scanned by loop in
3905 +                  find_begetting_brother() are write-locked by this thread,
3906 +                  and thus, their sibling linkage cannot change.
3907 +
3908 +               */
3909 +               reference_point = UNDER_RW
3910 +                   (tree, znode_get_tree(reference_point), read,
3911 +                    find_begetting_brother(node, level)->node);
3912 +               assert("nikita-1186", reference_point != NULL);
3913 +       }
3914 +       if (node->parent && (result == 0)) {
3915 +               result = reiser4_get_parent(&tmp_lh, reference_point, ZNODE_WRITE_LOCK, 0);
3916 +               if (result != 0) {
3917 +                       ;       /* nothing */
3918 +               } else if (znode_get_level(tmp_lh.node) == 0) {
3919 +                       assert("nikita-1347", znode_above_root(tmp_lh.node));
3920 +                       result = add_new_root(level, node, tmp_lh.node);
3921 +                       if (result == 0) {
3922 +                               reference_point = level->new_root;
3923 +                               move_lh(&lh, &node->lock_handle);
3924 +                       }
3925 +               } else if ((level->new_root != NULL) && (level->new_root != znode_parent_nolock(reference_point))) {
3926 +                       /* parent of node exists, but this level already
3927 +                          created different new root, so */
3928 +                       warning("nikita-1109",
3929 +                               /* it should be "radicis", but tradition is
3930 +                                  tradition.  do banshees read latin? */
3931 +                               "hodie natus est radici frater");
3932 +                       result = -EIO;
3933 +               } else {
3934 +                       move_lh(&lh, &tmp_lh);
3935 +                       reference_point = lh.node;
3936 +               }
3937 +       }
3938 +       if (node->left && (result == 0)) {
3939 +               assert("nikita-1183", node->parent);
3940 +               assert("nikita-883", reference_point != NULL);
3941 +               result = reiser4_get_left_neighbor(
3942 +                       &tmp_lh, reference_point, ZNODE_WRITE_LOCK, GN_CAN_USE_UPPER_LEVELS);
3943 +               if (result == 0) {
3944 +                       done_lh(&lh);
3945 +                       move_lh(&lh, &tmp_lh);
3946 +                       reference_point = lh.node;
3947 +               }
3948 +       }
3949 +       if (!node->parent && !node->left && !node->left_before) {
3950 +               result = longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
3951 +       }
3952 +       if (result == 0) {
3953 +               move_lh(&node->lock_handle, &lh);
3954 +               result = lock_carry_node_tail(node);
3955 +       }
3956 +       done_lh(&tmp_lh);
3957 +       done_lh(&lh);
3958 +       return result;
3959 +}
3960 +
3961 +/* release a lock on &carry_node.
3962 +
3963 +   Release if necessary lock on @node. This operation is pair of
3964 +   lock_carry_node() and is idempotent: you can call it more than once on the
3965 +   same node.
3966 +   NOTE(review): zrelse() below runs on every call and before the NULL check of real_node --- confirm both are intended.
3967 +*/
3968 +static void
3969 +unlock_carry_node(carry_level * level,
3970 +                 carry_node * node /* node to be released */ ,
3971 +                 int failure   /* non-zero if node is unlocked due
3972 +                                * to some error */ )
3973 +{
3974 +       znode *real_node;
3975 +
3976 +       assert("nikita-884", node != NULL);
3977 +
3978 +       trace_stamp(TRACE_CARRY);
3979 +
3980 +       real_node = carry_real(node);
3981 +       /* pair to zload() in lock_carry_node_tail() */
3982 +       zrelse(real_node);
3983 +       if (node->unlock && (real_node != NULL)) {
3984 +               assert("nikita-899", real_node == node->lock_handle.node);
3985 +               longterm_unlock_znode(&node->lock_handle);
3986 +       }
3987 +       if (failure) {
3988 +               if (node->deallocate && (real_node != NULL)) {
3989 +                       /* free node in bitmap
3990 +
3991 +                          Prepare node for removal. Last zput() will finish
3992 +                          with it.
3993 +                       */
3994 +                       ZF_SET(real_node, JNODE_HEARD_BANSHEE);
3995 +               }
3996 +               if (node->free) {
3997 +                       assert("nikita-2177", locks_list_is_clean(&node->lock_handle));
3998 +                       assert("nikita-2112", owners_list_is_clean(&node->lock_handle));
3999 +                       reiser4_pool_free(&level->pool->node_pool, &node->header);
4000 +               }
4001 +       }
4002 +}
4003 +
4004 +/* fatal_carry_error() - all-catching error handling function
4005 +
4006 +   It is possible that carry faces unrecoverable error, like inability to
4007 +   insert pointer at the internal level. Our simple solution is just panic in
4008 +   this situation. More sophisticated things like attempt to remount
4009 +   file-system as read-only can be implemented without much difficulty.
4010 +
4011 +   It is believed, that:
4012 +
4013 +   1. instead of panicking, all current transactions can be aborted rolling
4014 +   system back to the consistent state.
4015 +
4016 +Umm, if you simply panic without doing anything more at all, then all current
4017 +transactions are aborted and the system is rolled back to a consistent state,
4018 +by virtue of the design of the transactional mechanism. Well, wait, let's be
4019 +precise.  If an internal node is corrupted on disk due to hardware failure,
4020 +then there may be no consistent state that can be rolled back to, so instead
4021 +we should say that it will rollback the transactions, which barring other
4022 +factors means rolling back to a consistent state.
4023 +
4024 +# Nikita: there is a subtle difference between panic and aborting
4025 +# transactions: machine doesn't reboot. Processes aren't killed. Processes
4026 +# not using reiser4 (not that we care about such processes), or using other
4027 +# reiser4 mounts (about them we do care) will simply continue to run. With
4028 +# some luck, even application using aborted file system can survive: it will
4029 +# get some error, like EBADF, from each file descriptor on failed file system,
4030 +# but applications that do care about tolerance will cope with this (squid
4031 +# will).
4032 +
4033 +It would be a nice feature though to support rollback without rebooting
4034 +followed by remount, but this can wait for later versions.
4035 +
4036 +
4037 +   2. once isolated transactions will be implemented it will be possible to
4038 +   roll back offending transaction.
4039 +
4040 +2. is additional code complexity of inconsistent value (it implies that a broken tree should be kept in operation), so we must think about
4041 +it more before deciding if it should be done.  -Hans
4042 +   Current implementation: asserts @doing != NULL and @ecode < 0, then calls reiser4_panic() with @ecode.
4043 +*/
4044 +static void
4045 +fatal_carry_error(carry_level * doing UNUSED_ARG       /* carry level
4046 +                                                        * where
4047 +                                                        * unrecoverable
4048 +                                                        * error
4049 +                                                        * occurred */ ,
4050 +                 int ecode /* error code */ )
4051 +{
4052 +       assert("nikita-1230", doing != NULL);
4053 +       assert("nikita-1231", ecode < 0);
4054 +
4055 +       reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
4056 +}
4057 +
4058 +/* add new root to the tree
4059 +
4060 +   This function itself only manages changes in carry structures and delegates
4061 +   all hard work (allocation of znode for new root, changes of parent and
4062 +   sibling pointers) to add_tree_root().
4063 +
4064 +   Locking: old tree root is locked by carry at this point. Fake znode is also
4065 +   locked.
4066 +   Returns 0 on success, otherwise error from add_tree_root() or longterm_lock_znode().
4067 +*/
4068 +static int
4069 +add_new_root(carry_level * level       /* carry level in context of which
4070 +                                        * operation is performed */ ,
4071 +            carry_node * node /* carry node for existing root */ ,
4072 +            znode * fake       /* "fake" znode already locked by
4073 +                                * us */ )
4074 +{
4075 +       int result;
4076 +
4077 +       assert("nikita-1104", level != NULL);
4078 +       assert("nikita-1105", node != NULL);
4079 +
4080 +       assert("nikita-1403", znode_is_write_locked(node->node));
4081 +       assert("nikita-1404", znode_is_write_locked(fake));
4082 +
4083 +       /* trying to create new root. */
4084 +       /* @node is root and it's already locked by us. This
4085 +          means that nobody else can be trying to add/remove
4086 +          tree root right now.
4087 +       */
4088 +       if (level->new_root == NULL)
4089 +               level->new_root = add_tree_root(node->node, fake);
4090 +       if (!IS_ERR(level->new_root)) {
4091 +               assert("nikita-1210", znode_is_root(level->new_root));
4092 +               node->deallocate = 1;
4093 +               result = longterm_lock_znode(&node->lock_handle, level->new_root, ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
4094 +               if (result == 0)
4095 +                       zput(level->new_root);
4096 +       } else {
4097 +               result = PTR_ERR(level->new_root);
4098 +               level->new_root = NULL; /* do not leave an error pointer behind in @level */
4099 +       }
4100 +       return result;
4101 +}
4102 +
4103 +/* allocate new znode and add the operation that inserts the
4104 +   pointer to it into the parent node into the todo level
4105 +
4106 +   Allocate new znode, add it into carry queue and post into @todo queue
4107 +   request to add pointer to new node into its parent.
4108 +
4109 +   This is carry related routine that calls new_node() to allocate new
4110 +   node. Returns new carry node, or error pointer (check with IS_ERR()).
4111 +*/
4112 +reiser4_internal carry_node *
4113 +add_new_znode(znode * brother  /* existing left neighbor of new
4114 +                                * node */ ,
4115 +             carry_node * ref  /* carry node after which new
4116 +                                * carry node is to be inserted
4117 +                                * into queue. This affects
4118 +                                * locking. */ ,
4119 +             carry_level * doing       /* carry queue where new node is
4120 +                                        * to be added */ ,
4121 +             carry_level * todo        /* carry queue where COP_INSERT
4122 +                                        * operation to add pointer to
4123 +                                        * new node will be added */ )
4124 +{
4125 +       carry_node *fresh;
4126 +       znode *new_znode;
4127 +       carry_op *add_pointer;
4128 +       carry_plugin_info info;
4129 +
4130 +       assert("nikita-1048", brother != NULL);
4131 +       assert("nikita-1049", todo != NULL);
4132 +
4133 +       /* There is a lot of possible variations here: to what parent
4134 +          new node will be attached and where. For simplicity, always
4135 +          do the following:
4136 +
4137 +          (1) new node and @brother will have the same parent.
4138 +
4139 +          (2) new node is added on the right of @brother
4140 +
4141 +       */
4142 +
4143 +       fresh = add_carry_skip(doing, ref ? POOLO_AFTER : POOLO_LAST, ref);
4144 +       if (IS_ERR(fresh))
4145 +               return fresh;
4146 +
4147 +       fresh->deallocate = 1;
4148 +       fresh->free = 1;
4149 +
4150 +       new_znode = new_node(brother, znode_get_level(brother));
4151 +       if (IS_ERR(new_znode))
4152 +               /* @fresh will be deallocated automatically by error
4153 +                  handling code in the caller. */
4154 +               return (carry_node *) new_znode;
4155 +
4156 +       /* new_node() returned znode with x_count 1. Caller has to decrease
4157 +          it. make_space() does. */
4158 +
4159 +       ZF_SET(new_znode, JNODE_ORPHAN);
4160 +       fresh->node = new_znode;
4161 +
4162 +       while (ZF_ISSET(carry_real(ref), JNODE_ORPHAN)) { /* NOTE(review): @ref is used unconditionally here although the POOLO_LAST path above allows ref == NULL --- confirm */
4163 +               ref = carry_node_prev(ref);
4164 +               assert("nikita-1606", !carry_node_end(doing, ref));
4165 +       }
4166 +
4167 +       info.todo = todo;
4168 +       info.doing = doing;
4169 +       add_pointer = node_post_carry(&info, COP_INSERT, carry_real(ref), 1);
4170 +       if (IS_ERR(add_pointer)) {
4171 +               /* no need to deallocate @new_znode here: it will be
4172 +                  deallocated during carry error handling. */
4173 +               return (carry_node *) add_pointer;
4174 +       }
4175 +
4176 +       add_pointer->u.insert.type = COPT_CHILD;
4177 +       add_pointer->u.insert.child = fresh;
4178 +       add_pointer->u.insert.brother = brother;
4179 +       /* initially new node spans empty key range: both its delimiting keys are set to right delimiting key of @brother */
4180 +       WLOCK_DK(znode_get_tree(brother));
4181 +       znode_set_ld_key(new_znode,
4182 +                        znode_set_rd_key(new_znode, znode_get_rd_key(brother)));
4183 +       WUNLOCK_DK(znode_get_tree(brother));
4184 +       return fresh;
4185 +}
4186 +
4187 +/*
4188 + * Estimate how many pages of memory have to be reserved to complete execution
4189 + * of @level: the sum of ->estimate() over all carry ops queued in @level.
4190 + */
4191 +static int carry_estimate_reserve(carry_level * level)
4192 +{
4193 +       carry_op *op;
4194 +       carry_op *tmp_op;
4195 +       int result;
4196 +
4197 +       result = 0;
4198 +       for_all_ops(level, op, tmp_op)
4199 +               result += op_dispatch_table[op->op].estimate(op, level);
4200 +       return result;
4201 +}
4202 +
4203 +/* DEBUGGING FUNCTIONS.
4204 +
4205 +   Probably we also should leave them on even when
4206 +   debugging is turned off to print dumps at errors.
4207 +*/
4208 +#if REISER4_DEBUG
4209 +static int /* 1 if invariant holds, 0 if it is violated */
4210 +carry_level_invariant(carry_level * level, carry_queue_state state)
4211 +{
4212 +       carry_node *node;
4213 +       carry_node *tmp_node;
4214 +
4215 +       if (level == NULL)
4216 +               return 0;
4217 +
4218 +       if (level->track_type != 0 &&
4219 +           level->track_type != CARRY_TRACK_NODE &&
4220 +           level->track_type != CARRY_TRACK_CHANGE)
4221 +               return 0;
4222 +
4223 +       /* check that nodes are in ascending order of leftmost keys; for CARRY_TODO base ->node is compared, otherwise the real node */
4224 +       for_all_nodes(level, node, tmp_node) {
4225 +               znode *left;
4226 +               znode *right;
4227 +
4228 +               reiser4_key lkey;
4229 +               reiser4_key rkey;
4230 +
4231 +               if (node != carry_node_front(level)) {
4232 +                       if (state == CARRY_TODO) {
4233 +                               right = node->node;
4234 +                               left = carry_node_prev(node)->node;
4235 +                       } else {
4236 +                               right = carry_real(node);
4237 +                               left = carry_real(carry_node_prev(node));
4238 +                       }
4239 +                       if (right == NULL || left == NULL)
4240 +                               continue;
4241 +                       if (node_is_empty(right) || node_is_empty(left))
4242 +                               continue;
4243 +                       if (!keyle(leftmost_key_in_node(left, &lkey),
4244 +                                  leftmost_key_in_node(right, &rkey))) {
4245 +                               print_znode("left", left);
4246 +                               print_node_content("left", left, ~0);
4247 +                               print_znode("right", right);
4248 +                               print_node_content("right", right, ~0);
4249 +                               return 0;
4250 +                       }
4251 +               }
4252 +       }
4253 +       return 1;
4254 +}
4255 +#endif
4256 +
4257 +#if REISER4_DEBUG_OUTPUT
4258 +/* get symbolic name for boolean: "t" for non-zero, "f" for zero */
4259 +static const char *
4260 +tf(int boolean /* truth value */ )
4261 +{
4262 +       return boolean ? "t" : "f";
4263 +}
4264 +
4265 +/* symbolic name for carry operation; returns a string literal for known opcodes, or a pointer to a static buffer describing an unknown one */
4266 +static const char *
4267 +carry_op_name(carry_opcode op /* carry opcode */ )
4268 +{
4269 +       switch (op) {
4270 +       case COP_INSERT:
4271 +               return "COP_INSERT";
4272 +       case COP_DELETE:
4273 +               return "COP_DELETE";
4274 +       case COP_CUT:
4275 +               return "COP_CUT";
4276 +       case COP_PASTE:
4277 +               return "COP_PASTE";
4278 +       case COP_UPDATE:
4279 +               return "COP_UPDATE";
4280 +       case COP_EXTENT:
4281 +               return "COP_EXTENT";
4282 +       case COP_INSERT_FLOW:
4283 +               return "COP_INSERT_FLOW";
4284 +       default:{
4285 +                       /* not mt safe, but who cares? "unknown op: " plus 8 hex digits plus NUL needs 21 bytes */
4286 +                       static char buf[24];
4287 +
4288 +                       snprintf(buf, sizeof buf, "unknown op: %x", op);
4289 +                       return buf;
4290 +               }
4291 +       }
4292 +}
4293 +
4294 +/* dump information about carry node: flags, base ->node and real node (->left_before is not printed); NULL @node prints "null" */
4295 +reiser4_internal void
4296 +print_carry(const char *prefix /* prefix to print */ ,
4297 +           carry_node * node /* node to print */ )
4298 +{
4299 +       if (node == NULL) {
4300 +               printk("%s: null\n", prefix);
4301 +               return;
4302 +       }
4303 +       printk("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
4304 +              prefix, node, tf(node->parent), tf(node->left), tf(node->unlock), tf(node->free), tf(node->deallocate));
4305 +       print_znode("\tnode", node->node);
4306 +       print_znode("\treal_node", carry_real(node));
4307 +}
4308 +
4309 +/* dump information about carry operation: opcode name, target carry node and opcode-specific arguments; NULL @op prints "null" */
4310 +reiser4_internal void
4311 +print_op(const char *prefix /* prefix to print */ ,
4312 +        carry_op * op /* operation to print */ )
4313 +{
4314 +       if (op == NULL) {
4315 +               printk("%s: null\n", prefix);
4316 +               return;
4317 +       }
4318 +       printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
4319 +       print_carry("\tnode", op->node);
4320 +       switch (op->op) {
4321 +       case COP_INSERT:
4322 +       case COP_PASTE:
4323 +               print_coord("\tcoord", op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
4324 +               print_key("\tkey", op->u.insert.d ? op->u.insert.d->key : NULL);
4325 +               print_carry("\tchild", op->u.insert.child);
4326 +               break;
4327 +       case COP_DELETE:
4328 +               print_carry("\tchild", op->u.delete.child);
4329 +               break;
4330 +       case COP_CUT: /* read the union member that matches ->is_cut */
4331 +               if (op->u.cut_or_kill.is_cut) {
4332 +                       print_coord("\tfrom", op->u.cut_or_kill.u.cut->params.from, 0);
4333 +                       print_coord("\tto", op->u.cut_or_kill.u.cut->params.to, 0);
4334 +               } else {
4335 +                       print_coord("\tfrom", op->u.cut_or_kill.u.kill->params.from, 0);
4336 +                       print_coord("\tto", op->u.cut_or_kill.u.kill->params.to, 0);
4337 +               }
4338 +               break;
4339 +       case COP_UPDATE:
4340 +               print_carry("\tleft", op->u.update.left);
4341 +               break;
4342 +       default:
4343 +               /* do nothing */
4344 +               break;
4345 +       }
4346 +}
4347 +
4348 +/* dump information about all nodes and operations in a @level: first every carry node, then every carry op; NULL @level prints "null" */
4349 +reiser4_internal void
4350 +print_level(const char *prefix /* prefix to print */ ,
4351 +           carry_level * level /* level to print */ )
4352 +{
4353 +       carry_node *node;
4354 +       carry_node *tmp_node;
4355 +       carry_op *op;
4356 +       carry_op *tmp_op;
4357 +
4358 +       if (level == NULL) {
4359 +               printk("%s: null\n", prefix);
4360 +               return;
4361 +       }
4362 +       printk("%s: %p, restartable: %s\n",
4363 +              prefix, level, tf(level->restartable));
4364 +
4365 +       for_all_nodes(level, node, tmp_node)
4366 +           print_carry("\tcarry node", node);
4367 +       for_all_ops(level, op, tmp_op)
4368 +           print_op("\tcarry op", op);
4369 +}
4370 +#endif
4371 +
4372 +/* Make Linus happy.
4373 +   Local variables:
4374 +   c-indentation-style: "K&R"
4375 +   mode-name: "LC"
4376 +   c-basic-offset: 8
4377 +   tab-width: 8
4378 +   fill-column: 120
4379 +   scroll-step: 1
4380 +   End:
4381 +*/
4382 diff -rupN linux-2.6.8-rc3/fs/reiser4/carry.h linux-2.6.8-rc3-a/fs/reiser4/carry.h
4383 --- linux-2.6.8-rc3/fs/reiser4/carry.h  1970-01-01 03:00:00.000000000 +0300
4384 +++ linux-2.6.8-rc3-a/fs/reiser4/carry.h        2004-08-05 21:20:53.433586621 +0400
4385 @@ -0,0 +1,439 @@
4386 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4387 +
4388 +/* Functions and data types to "carry" tree modification(s) upward.
4389 +   See fs/reiser4/carry.c for details. */
4390 +
4391 +#if !defined( __FS_REISER4_CARRY_H__ )
4392 +#define __FS_REISER4_CARRY_H__
4393 +
4394 +#include "forward.h"
4395 +#include "debug.h"
4396 +#include "pool.h"
4397 +#include "znode.h"
4398 +
4399 +#include <linux/types.h>
4400 +
4401 +/* &carry_node - "location" of carry node.
4402 +
4403 +   "location" of node that is involved or going to be involved into
4404 +   carry process. Node where operation will be carried to on the
4405 +   parent level cannot be recorded explicitly. Operation will be carried
4406 +   usually to the parent of some node (where changes are performed at
4407 +   the current level) or, to the left neighbor of its parent. But while
4408 +   modifications are performed at the current level, parent may
4409 +   change. So, we have to allow some indirection (or, positively,
4410 +   flexibility) in locating carry nodes.
4411 +
4412 +*/
4413 +typedef struct carry_node {
4414 +       /* pool linkage */
4415 +       reiser4_pool_header header;
4416 +
4417 +       /* base node from which real_node is calculated. See
4418 +           fs/reiser4/carry.c:lock_carry_node(). */
4419 +       znode *node;
4420 +
4421 +       /* how to get ->real_node */
4422 +       /* to get ->real_node obtain parent of ->node*/
4423 +       __u32 parent:1;
4424 +       /* to get ->real_node obtain left neighbor of parent of
4425 +           ->node*/
4426 +       __u32 left:1;
4427 +       __u32 left_before:1; /* start from the carry node returned by find_begetting_brother(); see lock_carry_node() */
4428 +
4429 +       /* locking */
4430 +
4431 +       /* this node was locked by carry process and should be
4432 +           unlocked when carry leaves a level */
4433 +       __u32 unlock:1;
4434 +
4435 +       /* disk block for this node was allocated by carry process and
4436 +           should be deallocated when carry leaves a level */
4437 +       __u32 deallocate:1;
4438 +       /* this carry node was allocated by carry process and should be
4439 +           freed when carry leaves a level */
4440 +       __u32 free:1;
4441 +
4442 +       /* lock handle of the lock taken on the real node; filled by lock_carry_node() */
4443 +       lock_handle lock_handle;
4444 +} carry_node;
4445 +
4446 +/* &carry_opcode - elementary operations that can be carried upward
4447 +
4448 +   Operations that carry() can handle. This list is supposed to be
4449 +   expanded.
4450 +
4451 +   Each carry operation (cop) is handled by appropriate function defined
4452 +   in fs/reiser4/carry.c. For example COP_INSERT is handled by
4453 +   fs/reiser4/carry.c:carry_insert() etc. These functions in turn
4454 +   call plugins of nodes affected by operation to modify nodes' content
4455 +   and to gather operations to be performed on the next level.
4456 +
4457 +*/
4458 +typedef enum {
4459 +       /* insert new item into node. */
4460 +       COP_INSERT,
4461 +       /* delete pointer from parent node */
4462 +       COP_DELETE,
4463 +       /* remove part of or whole node. */
4464 +       COP_CUT,
4465 +       /* increase size of item. */
4466 +       COP_PASTE,
4467 +       /* insert extent (that is sequence of unformatted nodes). */
4468 +       COP_EXTENT,
4469 +       /* update delimiting key in least common ancestor of two
4470 +          nodes. This is performed when items are moved between two
4471 +          nodes.
4472 +       */
4473 +       COP_UPDATE,
4474 +       /* insert flow */
4475 +       COP_INSERT_FLOW,
4476 +       COP_LAST_OP, /* NOTE(review): presumably a sentinel counting the opcodes rather than a real operation --- confirm */
4477 +} carry_opcode;
4478 +
4479 +#define CARRY_FLOW_NEW_NODES_LIMIT 20
4480 +
4481 +/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
4482 +   item is determined. Stored in op->u.insert.type (e.g. COPT_CHILD in add_new_znode()). */
4483 +typedef enum {
4484 +       /* target item is one containing pointer to the ->child node */
4485 +       COPT_CHILD,
4486 +       /* target item is given explicitly by @coord */
4487 +       COPT_ITEM_DATA,
4488 +       /* target item is given by key */
4489 +       COPT_KEY,
4490 +       /* see insert_paste_common() for more comments on this. */
4491 +       COPT_PASTE_RESTARTED,
4492 +} cop_insert_pos_type;
4493 +
4494 +/* flags to cut and delete */
4495 +typedef enum {
4496 +       /* don't kill node even if it became completely empty as a result of
4497 +        * cut. This is needed for eottl handling. See carry_extent() for
4498 +        * details. */
4499 +       DELETE_RETAIN_EMPTY = (1 << 0)
4500 +} cop_delete_flag;
4501 +
4502 +/*
4503 + * carry() implements "lock handle tracking" feature.
4504 + *
4505 + * Callers supply carry with node where to perform initial operation and lock
4506 + * handle on this node. Trying to optimize node utilization, carry may actually
4507 + * move insertion point to different node. Callers expect that lock handle
4508 + * will be transferred to the new node also.
4509 + *
4510 + */
4511 +typedef enum {
4512 +       /* transfer lock handle along with insertion point */
4513 +       CARRY_TRACK_CHANGE = 1,
4514 +       /* acquire new lock handle to the node where insertion point is. This
4515 +        * is used when carry() client doesn't initially possess lock handle
4516 +        * on the insertion point node, for example, by extent insertion
4517 +        * code. See carry_extent(). */
4518 +       CARRY_TRACK_NODE   = 2
4519 +} carry_track_type;
4520 +
4521 +/* data supplied to COP_{INSERT|PASTE} by callers; reachable from a carry op as op->u.insert.d (see print_op()) */
4522 +typedef struct carry_insert_data {
4523 +       /* position where new item is to be inserted */
4524 +       coord_t *coord;
4525 +       /* new item description */
4526 +       reiser4_item_data *data;
4527 +       /* key of new item */
4528 +       const reiser4_key *key;
4529 +} carry_insert_data;
4530 +
4531 +/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */
4532 +struct cut_kill_params {
4533 +       /* coord where cut starts (inclusive) */
4534 +       coord_t *from;
4535 +       /* coord where cut stops (inclusive, this item/unit will also be
4536 +        * cut) */
4537 +       coord_t *to;
4538 +       /* starting key. This is necessary when item and unit pos don't
4539 +        * uniquely identify what portion of tree to remove. For example, this
4540 +        * indicates what portion of extent unit will be affected. */
4541 +       const reiser4_key *from_key;
4542 +       /* exclusive stop key */
4543 +       const reiser4_key *to_key;
4544 +       /* if this is not NULL, smallest actually removed key is stored
4545 +        * here. */
4546 +       reiser4_key *smallest_removed;
4547 +};
4548 +
4549 +struct carry_cut_data {
4550 +       struct cut_kill_params params;
4551 +};
4552 +
4553 +struct carry_kill_data {
4554 +       struct cut_kill_params params;
4555 +       /* parameter to be passed to the ->kill_hook() method of item
4556 +        * plugin */
4557 +       /*void *iplug_params;*/ /* FIXME: unused currently */
4558 +       /* if not NULL---inode whose items are being removed. This is needed
4559 +        * for ->kill_hook() of extent item to update VM structures when
4560 +        * removing pages. */
4561 +       struct inode *inode;
4562 +       /* sibling list maintenance is complicated by existence of eottl. When
4563 +        * eottl whose left and right neighbors are formatted leaves is
4564 +        * removed, one has to connect said leaves in the sibling list. This
4565 +        * cannot be done when extent removal is just started as locking rules
4566 +        * require sibling list update to happen atomically with removal of
4567 +        * extent item. Therefore: 1. pointers to left and right neighbors
4568 +        * have to be passed down to the ->kill_hook() of extent item, and
4569 +        * 2. said neighbors have to be locked. */
4570 +       lock_handle *left;
4571 +       lock_handle *right;
4572 +       /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */
4573 +       unsigned flags;
4574 +};
4575 +
4576 +/* &carry_tree_op - operation to "carry" upward.
4577 +
4578 +   Description of an operation we want to "carry" to the upper level of
4579 +   a tree: e.g, when we insert something and there is not enough space
4580 +   we allocate a new node and "carry" the operation of inserting a
4581 +   pointer to the new node to the upper level, on removal of empty node,
4582 +   we carry up operation of removing appropriate entry from parent.
4583 +
4584 +   There are two types of carry ops: when adding or deleting a node, the
4585 +   node at the parent level where appropriate modification has to be
4586 +   performed is known in advance. When shifting items between nodes
4587 +   (split, merge), delimiting key should be changed in the least common
4588 +   parent of the nodes involved that is not known in advance.
4589 +
4590 +   For the operations of the first type we store in &carry_op pointer to
4591 +   the &carry_node at the parent level. For the operation of the second
4592 +   type we store &carry_node or parents of the left and right nodes
4593 +   modified and keep track of them upward until they coincide.
4594 +
4595 +*/
4596 +typedef struct carry_op {
4597 +       /* pool linkage */
4598 +       reiser4_pool_header header;
4599 +       carry_opcode op;
4600 +       /* node on which operation is to be performed:
4601 +
4602 +          for insert, paste: node where new item is to be inserted
4603 +
4604 +          for delete: node where pointer is to be deleted
4605 +
4606 +          for cut: node to cut from
4607 +
4608 +          for update: node where delimiting key is to be modified
4609 +
4610 +          for modify: parent of modified node
4611 +
4612 +       */
4613 +       carry_node *node;
4614 +       union {
4615 +               struct {
4616 +                       /* (sub-)type of insertion/paste. Taken from
4617 +                          cop_insert_pos_type. */
4618 +                       __u8 type;
4619 +                       /* various operation flags. Taken from
4620 +                          cop_insert_flag. */
4621 +                       __u8 flags;
4622 +                       carry_insert_data *d;
4623 +                       carry_node *child;
4624 +                       znode *brother;
4625 +               } insert, paste, extent;
4626 +
4627 +               struct {
4628 +                       int is_cut;
4629 +                       union {
4630 +                               carry_kill_data *kill;
4631 +                               carry_cut_data *cut;
4632 +                       } u;
4633 +               } cut_or_kill;
4634 +
4635 +               struct {
4636 +                       carry_node *left;
4637 +               } update;
4638 +               struct {
4639 +                       /* changed child */
4640 +                       carry_node *child;
4641 +                       /* bitmask of changes. See &cop_modify_flag */
4642 +                       __u32 flag;
4643 +               } modify;
4644 +               struct {
4645 +                       /* flags to deletion operation. Are taken from
4646 +                          cop_delete_flag */
4647 +                       __u32 flags;
4648 +                       /* child to delete from parent. If this is
4649 +                          NULL, delete op->node.  */
4650 +                       carry_node *child;
4651 +               } delete;
4652 +               struct {
4653 +                       /* various operation flags. Taken from
4654 +                          cop_insert_flag. */
4655 +                       __u32 flags;
4656 +                       flow_t *flow;
4657 +                       coord_t *insert_point;
4658 +                       reiser4_item_data *data;
4659 +                       /* flow insertion is limited by number of new blocks
4660 +                          added in that operation which do not get any data
4661 +                          but part of flow. This limit is set by macro
4662 +                          CARRY_FLOW_NEW_NODES_LIMIT. This field stores number
4663 +                          of nodes added already during one carry_flow */
4664 +                       int new_nodes;
4665 +               } insert_flow;
4666 +       } u;
4667 +} carry_op;
4668 +
4669 +/* &carry_op_pool - preallocated pool of carry operations, and nodes */
4670 +typedef struct carry_pool {
4671 +       carry_op op[CARRIES_POOL_SIZE];
4672 +       reiser4_pool op_pool;
4673 +       carry_node node[NODES_LOCKED_POOL_SIZE];
4674 +       reiser4_pool node_pool;
4675 +} carry_pool;
4676 +
4677 +/* &carry_tree_level - carry process on given level
4678 +
4679 +   Description of balancing process on the given level.
4680 +
4681 +   No need for locking here, as carry_tree_level is essentially per
4682 +   thread thing (for now).
4683 +
4684 +*/
4685 +struct carry_level {
4686 +       /* this level may be restarted */
4687 +       __u32 restartable:1;
4688 +       /* list of carry nodes on this level, ordered by key order */
4689 +       pool_level_list_head nodes;
4690 +       pool_level_list_head ops;
4691 +       /* pool where new objects are allocated from */
4692 +       carry_pool *pool;
4693 +       int ops_num;
4694 +       int nodes_num;
4695 +       /* new root created on this level, if any */
4696 +       znode *new_root;
4697 +       /* This is set by caller (insert_by_key(), resize_item(), etc.) when
4698 +          they want ->tracked to automagically wander to the node where
4699 +          insertion point moved after insert or paste.
4700 +       */
4701 +       carry_track_type track_type;
4702 +       /* lock handle supplied by user that we are tracking. See
4703 +          above. */
4704 +       lock_handle *tracked;
4705 +#if REISER4_STATS
4706 +       tree_level level_no;
4707 +#endif
4708 +};
4709 +
4710 +/* information carry passes to plugin methods that may add new operations to
4711 +   the @todo queue  */
4712 +struct carry_plugin_info {
4713 +       carry_level *doing;
4714 +       carry_level *todo;
4715 +};
4716 +
4717 +int carry(carry_level * doing, carry_level * done);
4718 +
4719 +carry_node *add_carry(carry_level * level, pool_ordering order, carry_node * reference);
4720 +carry_node *add_carry_skip(carry_level * level, pool_ordering order, carry_node * reference);
4721 +carry_op *add_op(carry_level * level, pool_ordering order, carry_op * reference);
4722 +
4723 +extern carry_node *insert_carry_node(carry_level * doing,
4724 +                                    carry_level * todo, const znode * node);
4725 +
4726 +extern carry_node *add_carry_atplace(carry_level *doing,
4727 +                                    carry_level *todo, znode *node);
4728 +
4729 +extern carry_node *find_begetting_brother(carry_node * node, carry_level * kin);
4730 +
4731 +extern void init_carry_pool(carry_pool * pool);
4732 +extern void done_carry_pool(carry_pool * pool);
4733 +
4734 +extern void init_carry_level(carry_level * level, carry_pool * pool);
4735 +
4736 +extern carry_op *post_carry(carry_level * level, carry_opcode op, znode * node, int apply_to_parent);
4737 +extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op, znode * node, int apply_to_parent_p);
4738 +
4739 +extern int carry_op_num(const carry_level * level);
4740 +
4741 +carry_node *add_new_znode(znode * brother, carry_node * reference, carry_level * doing, carry_level * todo);
4742 +
4743 +carry_node *find_carry_node(carry_level * level, const znode * node);
4744 +
4745 +extern znode *carry_real(const carry_node * node);
4746 +
4747 +/* helper macros to iterate over carry queues */
4748 +
4749 +#define carry_node_next( node )                                        \
4750 +       ( ( carry_node * ) pool_level_list_next( &( node ) -> header ) )
4751 +
4752 +#define carry_node_prev( node )                                        \
4753 +       ( ( carry_node * ) pool_level_list_prev( &( node ) -> header ) )
4754 +
4755 +#define carry_node_front( level )                                      \
4756 +       ( ( carry_node * ) pool_level_list_front( &( level ) -> nodes ) )
4757 +
4758 +#define carry_node_back( level )                                       \
4759 +       ( ( carry_node * ) pool_level_list_back( &( level ) -> nodes ) )
4760 +
4761 +#define carry_node_end( level, node )                                  \
4762 +       ( pool_level_list_end( &( level ) -> nodes, &( node ) -> header ) )
4763 +
4764 +/* macro to iterate over all operations in a @level */
4765 +#define for_all_ops( level /* carry level (of type carry_level *) */,          \
4766 +                    op    /* pointer to carry operation, modified by loop (of  \
4767 +                           * type carry_op *) */,                              \
4768 +                    tmp   /* pointer to carry operation (of type carry_op *),  \
4769 +                           * used to make iterator stable in the face of       \
4770 +                           * deletions from the level */ )                     \
4771 +for( op = ( carry_op * ) pool_level_list_front( &( level ) -> ops ),           \
4772 +     tmp = ( carry_op * ) pool_level_list_next( &( op ) -> header ) ;          \
4773 +     ! pool_level_list_end( &( level ) -> ops, &( op ) -> header ) ;           \
4774 +     op = tmp, tmp = ( carry_op * ) pool_level_list_next( &( op ) -> header ) )
4775 +
4776 +/* macro to iterate over all nodes in a @level */
4777 +#define for_all_nodes( level /* carry level (of type carry_level *) */,                \
4778 +                      node  /* pointer to carry node, modified by loop (of     \
4779 +                             * type carry_node *) */,                          \
4780 +                      tmp   /* pointer to carry node (of type carry_node *),   \
4781 +                             * used to make iterator stable in the face of *   \
4782 +                             * deletions from the level */ )                   \
4783 +for( node = carry_node_front( level ),                                         \
4784 +     tmp = carry_node_next( node ) ; ! carry_node_end( level, node ) ;         \
4785 +     node = tmp, tmp = carry_node_next( node ) )
4786 +
4787 +/* macro to iterate over all nodes in a @level in reverse order
4788 +
4789 +   This is used, because nodes are unlocked in reversed order of locking */
4790 +#define for_all_nodes_back( level /* carry level (of type carry_level *) */,   \
4791 +                           node  /* pointer to carry node, modified by loop    \
4792 +                                  * (of type carry_node *) */,                 \
4793 +                           tmp   /* pointer to carry node (of type carry_node  \
4794 +                                  * *), used to make iterator stable in the    \
4795 +                                  * face of deletions from the level */ )      \
4796 +for( node = carry_node_back( level ),          \
4797 +     tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ;         \
4798 +     node = tmp, tmp = carry_node_prev( node ) )
4799 +
4800 +/* debugging function */
4801 +
4802 +#if REISER4_DEBUG_OUTPUT
4803 +extern void print_carry(const char *prefix, carry_node * node);
4804 +extern void print_op(const char *prefix, carry_op * op);
4805 +extern void print_level(const char *prefix, carry_level * level);
4806 +#else
4807 +#define print_carry( p, n ) noop
4808 +#define print_op( p, o ) noop
4809 +#define print_level( p, l ) noop
4810 +#endif
4811 +
4812 +/* __FS_REISER4_CARRY_H__ */
4813 +#endif
4814 +
4815 +/* Make Linus happy.
4816 +   Local variables:
4817 +   c-indentation-style: "K&R"
4818 +   mode-name: "LC"
4819 +   c-basic-offset: 8
4820 +   tab-width: 8
4821 +   fill-column: 120
4822 +   scroll-step: 1
4823 +   End:
4824 +*/
4825 diff -rupN linux-2.6.8-rc3/fs/reiser4/carry_ops.c linux-2.6.8-rc3-a/fs/reiser4/carry_ops.c
4826 --- linux-2.6.8-rc3/fs/reiser4/carry_ops.c      1970-01-01 03:00:00.000000000 +0300
4827 +++ linux-2.6.8-rc3-a/fs/reiser4/carry_ops.c    2004-08-05 21:20:53.018674135 +0400
4828 @@ -0,0 +1,2171 @@
4829 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4830 +
4831 +/* implementation of carry operations */
4832 +
4833 +#include "forward.h"
4834 +#include "debug.h"
4835 +#include "key.h"
4836 +#include "coord.h"
4837 +#include "plugin/item/item.h"
4838 +#include "plugin/node/node.h"
4839 +#include "jnode.h"
4840 +#include "znode.h"
4841 +#include "block_alloc.h"
4842 +#include "tree_walk.h"
4843 +#include "pool.h"
4844 +#include "tree_mod.h"
4845 +#include "carry.h"
4846 +#include "carry_ops.h"
4847 +#include "tree.h"
4848 +#include "super.h"
4849 +#include "reiser4.h"
4850 +
4851 +#include <linux/types.h>
4852 +#include <linux/err.h>
4853 +
4854 +static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
4855 +                           carry_level * doing, carry_level * todo, unsigned int including_insert_coord_p);
4856 +
4857 +extern int lock_carry_node(carry_level * level, carry_node * node);
4858 +extern int lock_carry_node_tail(carry_node * node);
4859 +
4860 +/* find left neighbor of a carry node
4861 +
4862 +   Look for left neighbor of @op->node, adding it to @doing if necessary.
4863 +   Returns its carry node, NULL when the neighbor is ignored, or ERR_PTR()
4864 +   (-E_REPEAT means: restart this level). See comments in the body.
4865 +*/
4866 +static carry_node *
4867 +find_left_neighbor(carry_op * op       /* node to find left
4868 +                                        * neighbor of */ ,
4869 +                  carry_level * doing /* level to scan */ )
4870 +{
4871 +       int result;
4872 +       carry_node *node;
4873 +       carry_node *left;
4874 +       int flags;
4875 +       reiser4_tree *tree;
4876 +
4877 +       node = op->node;
4878 +
4879 +       tree = current_tree;
4880 +       RLOCK_TREE(tree);
4881 +       /* first, check whether left neighbor is already in a @doing queue */
4882 +       if (carry_real(node)->left != NULL) {
4883 +               /* NOTE: there is locking subtlety here. Look into
4884 +                * find_right_neighbor() for more info */
4885 +               if (find_carry_node(doing, carry_real(node)->left) != NULL) {
4886 +                       RUNLOCK_TREE(tree);
4887 +                       left = node;
4888 +                       do {
4889 +                               left = carry_node_prev(left);
4890 +                               assert("nikita-3408", !carry_node_end(doing,
4891 +                                                                     left));
4892 +                       } while (carry_real(left) == carry_real(node));
4893 +                       reiser4_stat_level_inc(doing, carry_left_in_carry);
4894 +                       return left;
4895 +               }
4896 +       }
4897 +       RUNLOCK_TREE(tree);
4898 +
4899 +       left = add_carry_skip(doing, POOLO_BEFORE, node);
4900 +       if (IS_ERR(left))
4901 +               return left;
4902 +
4903 +       left->node = node->node;
4904 +       left->free = 1;
4905 +
4906 +       flags = GN_TRY_LOCK;
4907 +       if (!(op->u.insert.flags & COPI_LOAD_LEFT))
4908 +               flags |= GN_NO_ALLOC;
4909 +
4910 +       /* then, feeling lucky, peek left neighbor in the cache. */
4911 +       result = reiser4_get_left_neighbor(&left->lock_handle, carry_real(node),
4912 +                                          ZNODE_WRITE_LOCK, flags);
4913 +       if (result == 0) {
4914 +               /* ok, node found and locked. */
4915 +               result = lock_carry_node_tail(left);
4916 +               if (result != 0)
4917 +                       left = ERR_PTR(result);
4918 +               reiser4_stat_level_inc(doing, carry_left_in_cache);
4919 +       } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
4920 +               /* node is leftmost node in a tree, or neighbor wasn't in
4921 +                  cache, or there is an extent on the left. */
4922 +               if (REISER4_STATS && (result == -ENOENT))
4923 +                       reiser4_stat_level_inc(doing, carry_left_missed);
4924 +               if (REISER4_STATS && (result == -E_NO_NEIGHBOR))
4925 +                       reiser4_stat_level_inc(doing, carry_left_not_avail);
4926 +               reiser4_pool_free(&doing->pool->node_pool, &left->header);
4927 +               left = NULL;
4928 +       } else if (doing->restartable) {
4929 +               /* if left neighbor is locked, and level is restartable, add
4930 +                  new node to @doing and restart. */
4931 +               assert("nikita-913", node->parent != 0);
4932 +               assert("nikita-914", node->node != NULL);
4933 +               left->left = 1;
4934 +               left->free = 0;
4935 +               left = ERR_PTR(-E_REPEAT);
4936 +       } else {
4937 +               /* left neighbor is locked, level cannot be restarted. Just
4938 +                  ignore left neighbor. */
4939 +               reiser4_pool_free(&doing->pool->node_pool, &left->header);
4940 +               left = NULL;
4941 +               reiser4_stat_level_inc(doing, carry_left_refuse);
4942 +       }
4943 +       return left;
4944 +}
4945 +
4946 +/* find right neighbor of a carry node
4947 +
4948 +   Look for right neighbor of @op->node, adding it to @doing if necessary.
4949 +   Returns its carry node, NULL when there is no neighbor or it is not in
4950 +   cache, or ERR_PTR() on other failures. See comments in the body.
4951 +*/
4952 +static carry_node *
4953 +find_right_neighbor(carry_op * op      /* node to find right
4954 +                                        * neighbor of */ ,
4955 +                   carry_level * doing /* level to scan */ )
4956 +{
4957 +       int result;
4958 +       carry_node *node;
4959 +       carry_node *right;
4960 +       lock_handle lh;
4961 +       int flags;
4962 +       reiser4_tree *tree;
4963 +
4964 +       init_lh(&lh);
4965 +
4966 +       node = op->node;
4967 +
4968 +       tree = current_tree;
4969 +       RLOCK_TREE(tree);
4970 +       /* first, check whether right neighbor is already in a @doing queue */
4971 +       if (carry_real(node)->right != NULL) {
4972 +               /*
4973 +                * Tree lock is taken here anyway, because, even if _outcome_
4974 +                * of (find_carry_node() != NULL) doesn't depends on
4975 +                * concurrent updates to ->right, find_carry_node() cannot
4976 +                * work with second argument NULL. Hence, following comment is
4977 +                * of historic importance only.
4978 +                *
4979 +                * Subtle:
4980 +                *
4981 +                * Q: why don't we need tree lock here, looking for the right
4982 +                * neighbor?
4983 +                *
4984 +                * A: even if value of node->real_node->right were changed
4985 +                * during find_carry_node() execution, outcome of execution
4986 +                * wouldn't change, because (in short) other thread cannot add
4987 +                * elements to the @doing, and if node->real_node->right
4988 +                * already was in @doing, value of node->real_node->right
4989 +                * couldn't change, because node cannot be inserted between
4990 +                * locked neighbors.
4991 +                */
4992 +               if (find_carry_node(doing, carry_real(node)->right) != NULL) {
4993 +                       RUNLOCK_TREE(tree);
4994 +                       /*
4995 +                        * What we are doing here (this is also applicable to
4996 +                        * the find_left_neighbor()).
4997 +                        *
4998 +                        * tree_walk.c code requires that insertion of a
4999 +                        * pointer to a child, modification of parent pointer
5000 +                        * in the child, and insertion of the child into
5001 +                        * sibling list are atomic (see
5002 +                        * plugin/item/internal.c:create_hook_internal()).
5003 +                        *
5004 +                        * carry allocates new node long before pointer to it
5005 +                        * is inserted into parent and, actually, long before
5006 +                        * parent is even known. Such allocated-but-orphaned
5007 +                        * nodes are only trackable through carry level lists.
5008 +                        *
5009 +                        * Situation that is handled here is following: @node
5010 +                        * has valid ->right pointer, but there is
5011 +                        * allocated-but-orphaned node in the carry queue that
5012 +                        * is logically between @node and @node->right. Here
5013 +                        * we are searching for it. Critical point is that
5014 +                        * this is only possible if @node->right is also in
5015 +                        * the carry queue (this is checked above), because
5016 +                        * this is the only way new orphaned node could be
5017 +                        * inserted between them (before inserting new node,
5018 +                        * make_space() first tries to shift to the right, so,
5019 +                        * right neighbor will be locked and queued).
5020 +                        *
5021 +                        */
5022 +                       right = node;
5023 +                       do {
5024 +                               right = carry_node_next(right);
5025 +                               assert("nikita-3408", !carry_node_end(doing,
5026 +                                                                     right));
5027 +                       } while (carry_real(right) == carry_real(node));
5028 +                       reiser4_stat_level_inc(doing, carry_right_in_carry);
5029 +                       return right;
5030 +               }
5031 +       }
5032 +       RUNLOCK_TREE(tree);
5033 +
5034 +       flags = GN_CAN_USE_UPPER_LEVELS;
5035 +       if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
5036 +               flags = GN_NO_ALLOC;
5037 +
5038 +       /* then, try to lock right neighbor */
5039 +       init_lh(&lh);
5040 +       result = reiser4_get_right_neighbor(&lh, carry_real(node),
5041 +                                           ZNODE_WRITE_LOCK, flags);
5042 +       if (result == 0) {
5043 +               /* ok, node found and locked. */
5044 +               reiser4_stat_level_inc(doing, carry_right_in_cache);
5045 +               right = add_carry_skip(doing, POOLO_AFTER, node);
5046 +               if (!IS_ERR(right)) {
5047 +                       right->node = lh.node;
5048 +                       move_lh(&right->lock_handle, &lh);
5049 +                       right->free = 1;
5050 +                       result = lock_carry_node_tail(right);
5051 +                       if (result != 0)
5052 +                               right = ERR_PTR(result);
5053 +               }
5054 +       } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
5055 +               /* node is rightmost node in a tree, or neighbor wasn't in
5056 +                  cache, or there is an extent on the right. */
5057 +               right = NULL;
5058 +               if (REISER4_STATS && (result == -ENOENT))
5059 +                       reiser4_stat_level_inc(doing, carry_right_missed);
5060 +               if (REISER4_STATS && (result == -E_NO_NEIGHBOR))
5061 +                       reiser4_stat_level_inc(doing, carry_right_not_avail);
5062 +       } else
5063 +               right = ERR_PTR(result);
5064 +       done_lh(&lh);
5065 +       return right;
5066 +}
5067 +
5068 +/* how much free space in a @node is needed for @op
5069 +
5070 +   How much space in @node is required for completion of @op, where @op is
5071 +   insert or paste operation.
5072 +*/
5073 +static unsigned int
5074 +space_needed_for_op(znode * node       /* znode data are
5075 +                                        * inserted or
5076 +                                        * pasted in */ ,
5077 +                   carry_op * op       /* carry
5078 +                                          operation */ )
5079 +{
5080 +       assert("nikita-919", op != NULL);
5081 +
5082 +       switch (op->op) {
5083 +       default:
5084 +               impossible("nikita-1701", "Wrong opcode");
5085 +       case COP_INSERT:
5086 +               return space_needed(node, NULL, op->u.insert.d->data, 1);
5087 +       case COP_PASTE:
5088 +               return space_needed(node, op->u.insert.d->coord, op->u.insert.d->data, 0);
5089 +       }
5090 +}
5091 +
5092 +/* how much space in @node is required to insert or paste @data at
5093 +   @coord. */
5094 +reiser4_internal unsigned int
5095 +space_needed(const znode * node        /* node data are inserted or
5096 +                                * pasted in */ ,
5097 +            const coord_t * coord      /* coord where data are
5098 +                                          * inserted or pasted
5099 +                                          * at */ ,
5100 +            const reiser4_item_data * data     /* data to insert or
5101 +                                                * paste */ ,
5102 +            int insertion /* non-0 is inserting, 0---paste */ )
5103 +{
5104 +       int result;
5105 +       item_plugin *iplug;
5106 +
5107 +       assert("nikita-917", node != NULL);
5108 +       assert("nikita-918", node_plugin_by_node(node) != NULL);
5109 +       assert("vs-230", !insertion || (coord == NULL));
5110 +
5111 +       result = 0;
5112 +       iplug = data->iplug;
5113 +       if (iplug->b.estimate != NULL) {
5114 +               /* ask item plugin how much space is needed to insert this
5115 +                  item */
5116 +               result += iplug->b.estimate(insertion ? NULL : coord, data);
5117 +       } else {
5118 +               /* reasonable default */
5119 +               result += data->length;
5120 +       }
5121 +       if (insertion) {
5122 +               node_plugin *nplug;
5123 +
5124 +               nplug = node->nplug;
5125 +               /* and add node overhead */
5126 +               if (nplug->item_overhead != NULL) {
5127 +                       result += nplug->item_overhead(node, 0);
5128 +               }
5129 +       }
5130 +       return result;
5131 +}
5132 +
5133 +/* find &coord in parent where pointer to new child is to be stored. */
5134 +static int
5135 +find_new_child_coord(carry_op * op     /* COP_INSERT carry operation to
5136 +                                        * insert pointer to new
5137 +                                        * child */ )
5138 +{
5139 +       int result;
5140 +       znode *node;
5141 +       znode *child;
5142 +
5143 +       assert("nikita-941", op != NULL);
5144 +       assert("nikita-942", op->op == COP_INSERT);
5145 +
5146 +       trace_stamp(TRACE_CARRY);
5147 +
5148 +       node = carry_real(op->node);
5149 +       assert("nikita-943", node != NULL);
5150 +       assert("nikita-944", node_plugin_by_node(node) != NULL);
5151 +
5152 +       child = carry_real(op->u.insert.child);
5153 +       result = find_new_child_ptr(node, child, op->u.insert.brother, op->u.insert.d->coord);
5154 +
5155 +       build_child_ptr_data(child, op->u.insert.d->data);
5156 +       return result;
5157 +}
5158 +
5159 +/* additional amount of free space in @node required to complete @op */
5160 +static int
5161 +free_space_shortage(znode * node /* node to check */ ,
5162 +                   carry_op * op /* operation being performed */ )
5163 +{
5164 +       assert("nikita-1061", node != NULL);
5165 +       assert("nikita-1062", op != NULL);
5166 +
5167 +       switch (op->op) {
5168 +       default:
5169 +               impossible("nikita-1702", "Wrong opcode");
5170 +       case COP_INSERT:
5171 +       case COP_PASTE:
5172 +               return space_needed_for_op(node, op) - znode_free_space(node);
5173 +       case COP_EXTENT:
5174 +               /* extent: report shortage (+1) while insertion point is
5175 +                  inside the node, so data are shifted until it is utmost. */
5176 +               if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
5177 +                       return +1;
5178 +               else
5179 +                       return -1;
5180 +       }
5181 +}
5182 +
5183 +/* helper function: update op->node after insertion point was probably
5184 +   shifted into @target; returns node insertion coord now points to. */
5185 +static znode *
5186 +sync_op(carry_op * op, carry_node * target)
5187 +{
5188 +       znode *insertion_node;
5189 +
5190 +       /* reget node from coord: shift might move insertion coord to
5191 +          the neighbor */
5192 +       insertion_node = op->u.insert.d->coord->node;
5193 +       /* if insertion point was actually moved into new node,
5194 +          update carry node pointer in operation. */
5195 +       if (insertion_node != carry_real(op->node)) {
5196 +               op->node = target;
5197 +               assert("nikita-2540", carry_real(target) == insertion_node);
5198 +       }
5199 +       assert("nikita-2541",
5200 +              carry_real(op->node) == op->u.insert.d->coord->node);
5201 +       return insertion_node;
5202 +}
5203 +
5204 +/*
5205 + * complete make_space() call: re-point tracked lock handle if necessary (see
5206 + * fs/reiser4/carry.h:carry_track_type). Returns 0 or longterm_lock_znode() result.
5207 + */
5208 +static int
5209 +make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
5210 +{
5211 +       int result;
5212 +       carry_track_type tracking;
5213 +       znode *node;
5214 +
5215 +       tracking = doing->track_type;
5216 +       node = op->u.insert.d->coord->node;
5217 +
5218 +       if (tracking == CARRY_TRACK_NODE ||
5219 +           (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
5220 +               /* inserting or pasting into node different from
5221 +                  original. Update lock handle supplied by caller. */
5222 +               assert("nikita-1417", doing->tracked != NULL);
5223 +               done_lh(doing->tracked);
5224 +               init_lh(doing->tracked);
5225 +               result = longterm_lock_znode(doing->tracked, node,
5226 +                                            ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
5227 +               reiser4_stat_level_inc(doing, track_lh);
5228 +               ON_TRACE(TRACE_CARRY, "tracking: %i: %p -> %p\n",
5229 +                        tracking, orig_node, node);
5230 +       } else
5231 +               result = 0;
5232 +       return result;
5233 +}
5234 +
5235 +/* This is the insertion policy function. It shifts data to the left and right
5236 +   neighbors of insertion coord and allocates new nodes (at most two) until
5237 +   there is enough free space to complete @op. See comments in the body.
5238 +
5239 +   Returns 0 on success, -E_REPEAT when left neighbor lookup requests a restart, -E_NODE_FULL when space
5240 +   could not be freed, or an error propagated from shifting, node allocation or locking.
5241 +   Assumes that the node format favors insertions at the right end of the node
5242 +   as node40 does.
5243 +
5244 +   See carry_flow() for details about flow insertion
5245 +*/
5246 +static int
5247 +make_space(carry_op * op /* carry operation, insert or paste */ ,
5248 +          carry_level * doing /* current carry queue */ ,
5249 +          carry_level * todo /* carry queue on the parent level */ )
5250 +{
5251 +       znode *node;
5252 +       int result;
5253 +       int not_enough_space;
5254 +       int blk_alloc;
5255 +       znode *orig_node;
5256 +       __u32 flags;
5257 +
5258 +       coord_t *coord;
5259 +
5260 +       assert("nikita-890", op != NULL);
5261 +       assert("nikita-891", todo != NULL);
5262 +       assert("nikita-892",
5263 +              op->op == COP_INSERT ||
5264 +              op->op == COP_PASTE || op->op == COP_EXTENT);
5265 +       assert("nikita-1607",
5266 +              carry_real(op->node) == op->u.insert.d->coord->node);
5267 +
5268 +       trace_stamp(TRACE_CARRY);
5269 +
5270 +       flags = op->u.insert.flags;
5271 +
5272 +       /* NOTE check that new node can only be allocated after checking left
5273 +        * and right neighbors. This is necessary for proper work of
5274 +        * find_{left,right}_neighbor(). */
5275 +       assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
5276 +                                  flags & COPI_DONT_SHIFT_LEFT));
5277 +       assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
5278 +                                  flags & COPI_DONT_SHIFT_RIGHT));
5279 +
5280 +       coord = op->u.insert.d->coord;
5281 +       orig_node = node = coord->node;
5282 +
5283 +       assert("nikita-908", node != NULL);
5284 +       assert("nikita-909", node_plugin_by_node(node) != NULL);
5285 +
5286 +       result = 0;
5287 +       /* If there is not enough space in a node, try to shift something to
5288 +          the left neighbor. This is a bit tricky, as locking to the left is
5289 +          low priority. This is handled by restart logic in carry().
5290 +       */
5291 +       not_enough_space = free_space_shortage(node, op);
5292 +       if (not_enough_space <= 0)
5293 +               /* it is possible that carry was called when there actually
5294 +                  was enough space in the node. For example, when inserting
5295 +                  leftmost item so that delimiting keys have to be updated.
5296 +               */
5297 +               return make_space_tail(op, doing, orig_node);
5298 +       if (!(flags & COPI_DONT_SHIFT_LEFT)) {
5299 +               carry_node *left;
5300 +               /* make note in statistics of an attempt to move
5301 +                  something into the left neighbor */
5302 +               reiser4_stat_level_inc(doing, insert_looking_left);
5303 +               left = find_left_neighbor(op, doing);
5304 +               if (unlikely(IS_ERR(left))) {
5305 +                       if (PTR_ERR(left) == -E_REPEAT)
5306 +                               return -E_REPEAT;
5307 +                       else {
5308 +                               /* some error other than restart request
5309 +                                  occurred. This shouldn't happen. Issue a
5310 +                                  warning and continue as if left neighbor
5311 +                                  did not exist.
5312 +                               */
5313 +                               warning("nikita-924",
5314 +                                       "Error accessing left neighbor: %li",
5315 +                                       PTR_ERR(left));
5316 +                               print_znode("node", node);
5317 +                       }
5318 +               } else if (left != NULL) {
5319 +
5320 +                       /* shift everything possible on the left of and
5321 +                          including insertion coord into the left neighbor */
5322 +                       result = carry_shift_data(LEFT_SIDE, coord,
5323 +                                                 carry_real(left), doing, todo,
5324 +                                                 flags & COPI_GO_LEFT);
5325 +
5326 +                       /* reget node from coord: shift_left() might move
5327 +                          insertion coord to the left neighbor */
5328 +                       node = sync_op(op, left);
5329 +
5330 +                       not_enough_space = free_space_shortage(node, op);
5331 +                       /* There is not enough free space in @node, but
5332 +                          maybe there is enough free space in @left. Various
5333 +                          balancing decisions are valid here; the same holds for
5334 +                          shifting to the right. NOTE(review): @result of this
5335 +                          shift is not examined before the right shift may overwrite it. */
5336 +               }
5337 +       }
5338 +       /* If there still is not enough space, shift to the right */
5339 +       if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
5340 +               carry_node *right;
5341 +
5342 +               reiser4_stat_level_inc(doing, insert_looking_right);
5343 +               right = find_right_neighbor(op, doing);
5344 +               if (IS_ERR(right)) {
5345 +                       warning("nikita-1065",
5346 +                               "Error accessing right neighbor: %li",
5347 +                               PTR_ERR(right));
5348 +                       print_znode("node", node);
5349 +               } else if (right != NULL) {
5350 +                       /* node containing insertion point, and its right
5351 +                          neighbor node are write locked by now.
5352 +
5353 +                          shift everything possible on the right of but
5354 +                          excluding insertion coord into the right neighbor
5355 +                       */
5356 +                       result = carry_shift_data(RIGHT_SIDE, coord,
5357 +                                                 carry_real(right),
5358 +                                                 doing, todo,
5359 +                                                 flags & COPI_GO_RIGHT);
5360 +                       /* reget node from coord: shift_right() might move
5361 +                          insertion coord to the right neighbor */
5362 +                       node = sync_op(op, right);
5363 +                       not_enough_space = free_space_shortage(node, op);
5364 +               }
5365 +       }
5366 +       /* If there is still not enough space, allocate new node(s).
5367 +
5368 +          We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
5369 +          the carry operation flags (currently this is needed during flush
5370 +          only).
5371 +       */
5372 +       for (blk_alloc = 0;
5373 +            not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
5374 +                    !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
5375 +               carry_node *fresh;      /* new node we are allocating */
5376 +               coord_t coord_shadow;   /* remembered insertion point before
5377 +                                        * shifting data into new node */
5378 +               carry_node *node_shadow;        /* remembered insertion node before
5379 +                                                * shifting */
5380 +               unsigned int gointo;    /* whether insertion point should move
5381 +                                        * into newly allocated node */
5382 +
5383 +               reiser4_stat_level_inc(doing, insert_alloc_new);
5384 +               if (blk_alloc > 0)
5385 +                       reiser4_stat_level_inc(doing, insert_alloc_many);
5386 +
5387 +               /* allocate new node on the right of @node. Znode and disk
5388 +                  fake block number for new node are allocated.
5389 +
5390 +                  add_new_znode() posts carry operation COP_INSERT with
5391 +                  COPT_CHILD option to the parent level to add
5392 +                  pointer to newly created node to its parent.
5393 +
5394 +                  Subtle point: if several new nodes are required to complete
5395 +                  insertion operation at this level, they will be inserted
5396 +                  into their parents in the order of creation, which means
5397 +                  that @node will be valid "cookie" at the time of insertion.
5398 +
5399 +               */
5400 +               fresh = add_new_znode(node, op->node, doing, todo);
5401 +               if (IS_ERR(fresh))
5402 +                       return PTR_ERR(fresh);
5403 +
5404 +               /* Try to shift into new node. */
5405 +               result = lock_carry_node(doing, fresh);
5406 +               zput(carry_real(fresh));
5407 +               if (result != 0) {
5408 +                       warning("nikita-947",
5409 +                               "Cannot lock new node: %i", result);
5410 +                       print_znode("new", carry_real(fresh));
5411 +                       print_znode("node", node);
5412 +                       return result;
5413 +               }
5414 +
5415 +               /* both nodes are write locked by now.
5416 +
5417 +                  shift everything possible on the right of insertion coord
5418 +                  (and the coord itself when @gointo is set) into the new node.
5419 +               */
5420 +               coord_dup(&coord_shadow, op->u.insert.d->coord);
5421 +               node_shadow = op->node;
5422 +               /* move insertion point into newly created node if:
5423 +
5424 +                   . insertion point is rightmost in the source node, or
5425 +                   . this is not the first node we are allocating in a row.
5426 +               */
5427 +               gointo =
5428 +                       (blk_alloc > 0) ||
5429 +                       coord_is_after_rightmost(op->u.insert.d->coord);
5430 +
5431 +               result = carry_shift_data(RIGHT_SIDE, coord, carry_real(fresh),
5432 +                                         doing, todo, gointo);
5433 +               /* if insertion point was actually moved into new node,
5434 +                  update carry node pointer in operation. */
5435 +               node = sync_op(op, fresh);
5436 +               not_enough_space = free_space_shortage(node, op);
5437 +               if ((not_enough_space > 0) && (node != coord_shadow.node)) {
5438 +                       /* there is not enough free space in new node. Shift
5439 +                          insertion point back to the @node_shadow so that
5440 +                          next new node would be inserted between
5441 +                          @node_shadow and @fresh.
5442 +                       */
5443 +                       coord_normalize(&coord_shadow);
5444 +                       coord_dup(coord, &coord_shadow);
5445 +                       node = coord->node;
5446 +                       op->node = node_shadow;
5447 +                       if (1 || (flags & COPI_STEP_BACK)) {
5448 +                               /* still not enough space?! Maybe there is
5449 +                                  enough space in the source node (i.e., node
5450 +                                  data are moved from) now.
5451 +                               */
5452 +                               not_enough_space = free_space_shortage(node, op);
5453 +                       }
5454 +               }
5455 +       }
5456 +       if (not_enough_space > 0) {
5457 +               if (!(flags & COPI_DONT_ALLOCATE))
5458 +                       warning("nikita-948", "Cannot insert new item");
5459 +               result = -E_NODE_FULL;
5460 +       }
5461 +       assert("nikita-1622", ergo(result == 0,
5462 +                                  carry_real(op->node) == coord->node));
5463 +       assert("nikita-2616", coord == op->u.insert.d->coord);
5464 +       if (result == 0)
5465 +               result = make_space_tail(op, doing, orig_node);
5466 +       return result;
5467 +}
5468 +
5469 +/* insert_paste_common() - common part of insert and paste operations
5470 +
5471 +   This function performs common part of COP_INSERT and COP_PASTE.
5472 +
5473 +   There are three ways in which insertion/paste can be requested:
5474 +
5475 +    . by directly supplying reiser4_item_data. In this case, op ->
5476 +    u.insert.type is set to COPT_ITEM_DATA.
5477 +
5478 +    . by supplying child, pointer to which is to be inserted into parent. In
5479 +    this case op -> u.insert.type == COPT_CHILD.
5480 +
5481 +    . by supplying key of new item/unit (COPT_KEY). This is currently only
5482 +    used during extent insertion
5483 +
5484 +   This is required, because when new node is allocated we don't know at what
5485 +   position pointer to it is to be stored in the parent. Actually, we don't
5486 +   even know what its parent will be, because parent can be re-balanced
5487 +   concurrently and new node re-parented, and because parent can be full and
5488 +   pointer to the new node will go into some other node.
5489 +
5490 +   insert_paste_common() resolves pointer to child node into position in the
5491 +   parent by calling find_new_child_coord(), that fills
5492 +   reiser4_item_data. After this, insertion/paste proceeds uniformly.
5493 +
5494 +   Another complication is with finding free space during pasting. It may
5495 +   happen that while shifting items to the neighbors and newly allocated
5496 +   nodes, insertion coord can no longer be in the item we wanted to paste
5497 +   into. At this point, paste becomes (morphs) into insert. Moreover free
5498 +   space analysis has to be repeated, because amount of space required for
5499 +   insertion is different from that of paste (item header overhead, etc).
5500 +
5501 +   This function "unifies" different insertion modes (by resolving child
5502 +   pointer or key into insertion coord), and then calls make_space() to free
5503 +   enough space in the node by shifting data to the left and right and by
5504 +   allocating new nodes if necessary. Carry operation knows amount of space
5505 +   required for its completion. After enough free space is obtained, caller of
5506 +   this function (carry_{insert,paste,etc.}) performs actual insertion/paste
5507 +   by calling item plugin method.
5508 +   Returns result of make_space(), or an error from key lookup, child coord lookup or carry node setup.
5509 +*/
5510 +static int
5511 +insert_paste_common(carry_op * op      /* carry operation being
5512 +                                        * performed */ ,
5513 +                   carry_level * doing /* current carry level */ ,
5514 +                   carry_level * todo /* next carry level */ ,
5515 +                   carry_insert_data * cdata   /* pointer to
5516 +                                                * cdata */ ,
5517 +                   coord_t * coord /* insertion/paste coord */ ,
5518 +                   reiser4_item_data * data    /* data to be
5519 +                                                * inserted/pasted */ )
5520 +{
5521 +       assert("nikita-981", op != NULL);
5522 +       assert("nikita-980", todo != NULL);
5523 +       assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE) || (op->op == COP_EXTENT));
5524 +
5525 +       trace_stamp(TRACE_CARRY);
5526 +
5527 +       if (op->u.insert.type == COPT_PASTE_RESTARTED) {
5528 +               /* nothing to do. Fall through to make_space(). */
5529 +               ;
5530 +       } else if (op->u.insert.type == COPT_KEY) {
5531 +               node_search_result intra_node;
5532 +               znode *node;
5533 +               /* Problem with doing batching at the lowest level, is that
5534 +                  operations here are given by coords where modification is
5535 +                  to be performed, and one modification can invalidate coords
5536 +                  of all following operations.
5537 +
5538 +                  So, we are implementing yet another type for operation that
5539 +                  will use (the only) "locator" stable across shifting of
5540 +                  data between nodes, etc.: key (COPT_KEY).
5541 +
5542 +                  This clause resolves key to the coord in the node.
5543 +
5544 +                  But node can change also. Probably some pieces have to be
5545 +                  added to the lock_carry_node(), to lock node by its key.
5546 +
5547 +               */
5548 +               /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
5549 +                  if you need something else. */
5550 +               op->u.insert.d->coord = coord;
5551 +               node = carry_real(op->node);
5552 +               intra_node = node_plugin_by_node(node)->lookup
5553 +                   (node, op->u.insert.d->key, FIND_EXACT, op->u.insert.d->coord);
5554 +               if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
5555 +                       warning("nikita-1715", "Intra node lookup failure: %i", intra_node);
5556 +                       print_znode("node", node);
5557 +                       return intra_node;
5558 +               }
5559 +       } else if (op->u.insert.type == COPT_CHILD) {
5560 +               /* if we are asked to insert pointer to the child into
5561 +                  internal node, first convert pointer to the child into
5562 +                  coord within parent node.
5563 +               */
5564 +               znode *child;
5565 +               int result;
5566 +
5567 +               op->u.insert.d = cdata;
5568 +               op->u.insert.d->coord = coord;
5569 +               op->u.insert.d->data = data;
5570 +               op->u.insert.d->coord->node = carry_real(op->node);
5571 +               result = find_new_child_coord(op);
5572 +               child = carry_real(op->u.insert.child);
5573 +               if (result != NS_NOT_FOUND) {
5574 +                       warning("nikita-993", "Cannot find a place for child pointer: %i", result);
5575 +                       print_znode("child", child);
5576 +                       print_znode("parent", carry_real(op->node));
5577 +                       return result;
5578 +               }
5579 +               /* This only happens when we did multiple insertions at
5580 +                  the previous level, trying to insert single item and
5581 +                  it so happened, that insertion of pointers to all new
5582 +                  nodes before this one already caused parent node to
5583 +                  split (may be several times).
5584 +
5585 +                  I am going to come up with better solution.
5586 +
5587 +                  You are not expected to understand this.
5588 +                         -- v6root/usr/sys/ken/slp.c
5589 +
5590 +                  Basically, what happens here is the following: carry came
5591 +                  to the parent level and is about to insert internal item
5592 +                  pointing to the child node that it just inserted in the
5593 +                  level below. Position where internal item is to be inserted
5594 +                  was found by find_new_child_coord() above, but node of the
5595 +                  current carry operation (that is, parent node of child
5596 +                  inserted on the previous level), was determined earlier in
5597 +                  the lock_carry_level/lock_carry_node. It could so happen
5598 +                  that other carry operations already performed on the parent
5599 +                  level already split parent node, so that insertion point
5600 +                  moved into another node. Handle this by creating new carry
5601 +                  node for insertion point if necessary.
5602 +               */
5603 +               if (carry_real(op->node) != op->u.insert.d->coord->node) {
5604 +                       pool_ordering direction;
5605 +                       znode *z1;
5606 +                       znode *z2;
5607 +                       reiser4_key k1;
5608 +                       reiser4_key k2;
5609 +
5610 +                       /*
5611 +                        * determine in what direction insertion point
5612 +                        * moved. Do this by comparing delimiting keys.
5613 +                        */
5614 +                       z1 = op->u.insert.d->coord->node;
5615 +                       z2 = carry_real(op->node);
5616 +                       if (keyle(leftmost_key_in_node(z1, &k1),
5617 +                                 leftmost_key_in_node(z2, &k2)))
5618 +                               /* insertion point moved to the left */
5619 +                               direction = POOLO_BEFORE;
5620 +                       else
5621 +                               /* insertion point moved to the right */
5622 +                               direction = POOLO_AFTER;
5623 +
5624 +                       op->node = add_carry_skip(doing, direction, op->node);
5625 +                       if (IS_ERR(op->node))
5626 +                               return PTR_ERR(op->node);
5627 +                       op->node->node = op->u.insert.d->coord->node;
5628 +                       op->node->free = 1;
5629 +                       result = lock_carry_node(doing, op->node);
5630 +                       if (result != 0)
5631 +                               return result;
5632 +               }
5633 +
5634 +               /*
5635 +                * set up key of an item being inserted: we are inserting
5636 +                * internal item and its key (by the very definition of
5637 +                * search tree) is leftmost key in the child node.
5638 +                */
5639 +               op->u.insert.d->key = UNDER_RW(dk, znode_get_tree(child), read,
5640 +                                              leftmost_key_in_node(child, znode_get_ld_key(child)));
5641 +               op->u.insert.d->data->arg = op->u.insert.brother;
5642 +       } else {
5643 +               assert("vs-243", op->u.insert.d->coord != NULL);
5644 +               op->u.insert.d->coord->node = carry_real(op->node);
5645 +       }
5646 +
5647 +       /* find free space. */
5648 +       return make_space(op, doing, todo);
5649 +}
5650 +
5651 +/* handle carry COP_INSERT operation.
5652 +
5653 +   Insert new item into node. New item can be given in one of two ways:
5654 +
5655 +   - by passing &tree_coord and &reiser4_item_data as part of @op. This is
5656 +   only applicable at the leaf/twig level.
5657 +
5658 +   - by passing a child node, pointer to which is to be inserted by this
5659 +   operation.
5660 +   Returns error of insert_paste_common() if it fails, else result of node plugin ->create_item().
5661 +*/
5662 +static int
5663 +carry_insert(carry_op * op /* operation to perform */ ,
5664 +            carry_level * doing        /* queue of operations @op
5665 +                                        * is part of */ ,
5666 +            carry_level * todo /* queue where new operations
5667 +                                * are accumulated */ )
5668 +{
5669 +       znode *node;
5670 +       carry_insert_data cdata;
5671 +       coord_t coord;
5672 +       reiser4_item_data data;
5673 +       carry_plugin_info info;
5674 +       int result;
5675 +
5676 +       assert("nikita-1036", op != NULL);
5677 +       assert("nikita-1037", todo != NULL);
5678 +       assert("nikita-1038", op->op == COP_INSERT);
5679 +
5680 +       trace_stamp(TRACE_CARRY);
5681 +       reiser4_stat_level_inc(doing, insert);
5682 +
5683 +       coord_init_zero(&coord);
5684 +
5685 +       /* perform common functionality of insert and paste: resolve the
5686 +          insertion coord and make enough free space in the node. */
5687 +       result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5688 +       if (result != 0)
5689 +               return result;
5690 +
5691 +       node = op->u.insert.d->coord->node;
5692 +       assert("nikita-1039", node != NULL);
5693 +       assert("nikita-1040", node_plugin_by_node(node) != NULL);
5694 +
5695 +       assert("nikita-949", space_needed_for_op(node, op) <= znode_free_space(node));
5696 +
5697 +       /* ask node layout to create new item. */
5698 +       info.doing = doing;
5699 +       info.todo = todo;
5700 +       result = node_plugin_by_node(node)->create_item
5701 +           (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data, &info);
5702 +       doing->restartable = 0; /* NOTE(review): presumably carry must not restart once the node is modified -- confirm */
5703 +       znode_make_dirty(node);
5704 +
5705 +       return result;
5706 +}
5706 +
5707 +/*
5708 + * Flow insertion code. COP_INSERT_FLOW is a special tree operation that is
5709 + * supplied with a "flow" (that is, a stream of data) and inserts it into the
5710 + * tree by slicing it into multiple items. Accessors for its @op fields follow.
5711 + */
5712 +
5713 +#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point )
5714 +#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow )
5715 +#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data )
5716 +
5717 +static size_t /* bytes the item plugin estimate adds on top of the raw data length */
5718 +item_data_overhead(carry_op * op)
5719 +{
5720 +       if (flow_insert_data(op)->iplug->b.estimate == NULL)
5721 +               return 0; /* plugin has no estimate method: no extra overhead assumed */
5722 +       return (flow_insert_data(op)->iplug->b.estimate(NULL /* estimate insertion */, flow_insert_data(op)) -
5723 +               flow_insert_data(op)->length);
5724 +}
5725 +
5726 +/* Extra bytes needed to start a new item for the flow at the insert point; 0 when
5727 +   the node plugin has no ->item_overhead or can_paste() accepts the insert point.
5728 +   FIXME-VS: this is called several times during one make_flow_for_insertion and always
5729 +   returns the same result. It could be calculated once and passed around, at the cost of some flexibility.
5730 +*/
5731 +static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
5732 +static size_t
5733 +flow_insertion_overhead(carry_op * op)
5734 +{
5735 +       znode *node;
5736 +       size_t insertion_overhead;
5737 +
5738 +       node = flow_insert_point(op)->node;
5739 +       insertion_overhead = 0;
5740 +       if (node->nplug->item_overhead &&
5741 +           !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key, flow_insert_data(op)))
5742 +               insertion_overhead = node->nplug->item_overhead(node, 0) + item_data_overhead(op);
5743 +       return insertion_overhead;
5744 +}
5745 +
5746 +/* how many bytes of flow fit into the insert point node once insertion overhead is subtracted; 0 if none */
5747 +static int
5748 +what_can_fit_into_node(carry_op * op)
5749 +{
5750 +       size_t free, overhead;
5751 +
5752 +       overhead = flow_insertion_overhead(op);
5753 +       free = znode_free_space(flow_insert_point(op)->node);
5754 +       if (free <= overhead)
5755 +               return 0;
5756 +       free -= overhead;
5757 +       /* FIXME: flow->length is loff_t only to avoid overflow in case of expanding truncate */
5758 +       if (free < op->u.insert_flow.flow->length)
5759 +               return free;
5760 +       return (int)op->u.insert_flow.flow->length;
5761 +}
5762 +
5763 +/* in make_space_for_flow_insertion we need to check either whether whole flow fits into a node or whether
5764 +   minimal fraction of flow fits into a node. This one: non-zero iff all flow->length bytes fit */
5765 +static int
5766 +enough_space_for_whole_flow(carry_op * op)
5767 +{
5768 +       return (unsigned) what_can_fit_into_node(op) == op->u.insert_flow.flow->length;
5769 +}
5770 +
5771 +#define MIN_FLOW_FRACTION 1 /* least number of flow bytes that must fit, compared against what_can_fit_into_node() */
5772 +static int
5773 +enough_space_for_min_flow_fraction(carry_op * op) /* non-zero iff at least MIN_FLOW_FRACTION bytes fit */
5774 +{
5775 +       assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
5776 +
5777 +       return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
5778 +}
5779 +
5780 +/* this returns 0 if left neighbor was obtained successfully, everything up to
5781 +   and including the insertion point was shifted into it, and left neighbor still
5782 +   has free space for a minimal fraction of flow; returns 1 otherwise */
5783 +static int
5784 +make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
5785 +{
5786 +       carry_node *left;
5787 +       znode *orig;
5788 +
5789 +       left = find_left_neighbor(op, doing);
5790 +       if (unlikely(IS_ERR(left))) {
5791 +               warning("vs-899", "make_space_by_shift_left: " "error accessing left neighbor: %li", PTR_ERR(left));
5792 +               return 1;
5793 +       }
5794 +       if (left == NULL)
5795 +               /* left neighbor either does not exist or is unformatted
5796 +                  node */
5797 +               return 1;
5798 +
5799 +       orig = flow_insert_point(op)->node;
5800 +       /* try to shift content of node @orig from its head up to and
5801 +          including insertion point into the left neighbor */
5802 +       carry_shift_data(LEFT_SIDE, flow_insert_point(op),
5803 +                        carry_real(left), doing, todo, 1 /* including insert
5804 +                                                          * point */);
5805 +       if (carry_real(left) != flow_insert_point(op)->node) {
5806 +               /* insertion point did not move */
5807 +               return 1;
5808 +       }
5809 +
5810 +       /* insertion point is set after last item in the node */
5811 +       assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
5812 +
5813 +       if (!enough_space_for_min_flow_fraction(op)) {
5814 +               /* insertion point node does not have enough free space to put
5815 +                  even minimal portion of flow into it, therefore, move
5816 +                  insertion point back to orig node (before first item) */
5817 +               coord_init_before_first_item(flow_insert_point(op), orig);
5818 +               return 1;
5819 +       }
5820 +
5821 +       /* part of flow is to be written to the end of node */
5822 +       op->node = left;
5823 +       return 0;
5824 +}
5825 +
5826 +/* this returns 0 if, after shifting everything to the right of insertion point
5827 +   into the right neighbor (when there is one), insertion point is at the node end
5828 +   and the node has free space for a minimal fraction of flow; returns 1 otherwise */
5829 +static int
5830 +make_space_by_shift_right(carry_op * op, carry_level * doing, carry_level * todo)
5831 +{
5832 +       carry_node *right;
5833 +
5834 +       right = find_right_neighbor(op, doing);
5835 +       if (unlikely(IS_ERR(right))) {
5836 +               warning("nikita-1065", "shift_right_excluding_insert_point: "
5837 +                       "error accessing right neighbor: %li", PTR_ERR(right));
5838 +               return 1;
5839 +       }
5840 +       if (right) {
5841 +               /* shift everything possible on the right of but excluding
5842 +                  insertion coord into the right neighbor */
5843 +               carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5844 +                                carry_real(right), doing, todo, 0 /* not
5845 +                                                                   * including
5846 +                                                                   * insert
5847 +                                                                   * point */);
5848 +       } else {
5849 +               /* right neighbor either does not exist or is unformatted
5850 +                  node */
5851 +               ;
5852 +       }
5853 +       if (coord_is_after_rightmost(flow_insert_point(op))) {
5854 +               if (enough_space_for_min_flow_fraction(op)) {
5855 +                       /* part of flow is to be written to the end of node */
5856 +                       return 0;
5857 +               }
5858 +       }
5859 +
5860 +       /* new node is to be added: insert point is not at the node end, or
5861 +          the node lacks space for even a minimal fraction of flow */
5862 +       return 1;
5863 +}
5864 +
5865 +/* Adds one or two new nodes after insert point node. Returns 0 when insert point is left at the end of its node with room for
5866 +   a minimal flow fraction, or was moved to a new node; -E_NODE_FULL at CARRY_FLOW_NEW_NODES_LIMIT; else error of add_new_znode()/lock_carry_node() */
5867 +static int
5868 +make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
5869 +{
5870 +       int result;
5871 +       znode *node;
5872 +       carry_node *new;
5873 +
5874 +       node = flow_insert_point(op)->node;
5875 +
5876 +       if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5877 +               return RETERR(-E_NODE_FULL);
5878 +       /* add new node after insert point node */
5879 +       new = add_new_znode(node, op->node, doing, todo);
5880 +       if (unlikely(IS_ERR(new))) {
5881 +               return PTR_ERR(new);
5882 +       }
5883 +       result = lock_carry_node(doing, new);
5884 +       zput(carry_real(new));
5885 +       if (unlikely(result)) {
5886 +               return result;
5887 +       }
5888 +       op->u.insert_flow.new_nodes++;
5889 +       if (!coord_is_after_rightmost(flow_insert_point(op))) {
5890 +               carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5891 +                                carry_real(new), doing, todo, 0 /* not
5892 +                                                                 * including
5893 +                                                                 * insert
5894 +                                                                 * point */);
5895 +
5896 +               assert("vs-901", coord_is_after_rightmost(flow_insert_point(op)));
5897 +
5898 +               if (enough_space_for_min_flow_fraction(op)) {
5899 +                       return 0;
5900 +               }
5901 +               if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5902 +                       return RETERR(-E_NODE_FULL);
5903 +
5904 +               /* first new node took the shifted data; add one more new node (again after insert point node) to receive the flow */
5905 +               new = add_new_znode(node, op->node, doing, todo);
5906 +               if (unlikely(IS_ERR(new))) {
5907 +                       return PTR_ERR(new);
5908 +               }
5909 +               result = lock_carry_node(doing, new);
5910 +               zput(carry_real(new));
5911 +               if (unlikely(result)) {
5912 +                       return result;
5913 +               }
5914 +               op->u.insert_flow.new_nodes++;
5915 +       }
5916 +
5917 +       /* move insertion point to the most recently added new node */
5918 +       coord_init_before_first_item(flow_insert_point(op), carry_real(new));
5919 +       op->node = new;
5920 +       return 0;
5921 +}
5922 +
5923 +static int
5924 +make_space_for_flow_insertion(carry_op * op, carry_level * doing, carry_level * todo)
5925 +{
5926 +       __u32 flags = op->u.insert_flow.flags;
5927 +
5928 +       if (enough_space_for_whole_flow(op)) {
5929 +               /* whole flow fits into insert point node as is: nothing has to be moved */
5930 +               return 0;
5931 +       }
5932 +
5933 +       if (!(flags & COPI_DONT_SHIFT_LEFT) && (make_space_by_shift_left(op, doing, todo) == 0)) {
5934 +               /* insert point is shifted to left neighbor of original insert
5935 +                  point node and is set after last unit in that node. It has
5936 +                  enough space to fit at least minimal fraction of flow. */
5937 +               return 0;
5938 +       }
5939 +
5940 +       if (enough_space_for_whole_flow(op)) {
5941 +               /* re-check after the left shift attempt: whole flow fits into insert point node */
5942 +               return 0;
5943 +       }
5944 +
5945 +       if (!(flags & COPI_DONT_SHIFT_RIGHT) && (make_space_by_shift_right(op, doing, todo) == 0)) {
5946 +               /* insert point is still set to the same node, but there is
5947 +                  nothing to the right of insert point. */
5948 +               return 0;
5949 +       }
5950 +
5951 +       if (enough_space_for_whole_flow(op)) {
5952 +               /* re-check after the right shift attempt: whole flow fits into insert point node */
5953 +               return 0;
5954 +       }
5955 +
5956 +       return make_space_by_new_nodes(op, doing, todo);
5957 +}
5958 +
5959 +/* implements COP_INSERT_FLOW operation: inserts flow op->u.insert_flow.flow piece by piece, making space in a node before each piece */
5960 +static int
5961 +carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
5962 +{
5963 +       int result;
5964 +       flow_t *f;
5965 +       coord_t *insert_point;
5966 +       node_plugin *nplug;
5967 +       int something_written;
5968 +       carry_plugin_info info;
5969 +       znode *orig_node;
5970 +       lock_handle *orig_lh;
5971 +
5972 +       f = op->u.insert_flow.flow;
5973 +       result = 0;
5974 +
5975 +       /* this flag was meant to distinguish a need to have carry to propagate
5976 +          leaf level modifications up in the tree when make_space fails not in
5977 +          first iteration of the loop below; in this function it is only assigned, never read */
5978 +       something_written = 0;
5979 +
5980 +       /* carry system needs this to work */
5981 +       info.doing = doing;
5982 +       info.todo = todo;
5983 +
5984 +       orig_node = flow_insert_point(op)->node;
5985 +       orig_lh = doing->tracked;
5986 +
5987 +       while (f->length) {
5988 +               result = make_space_for_flow_insertion(op, doing, todo);
5989 +               if (result)
5990 +                       break;
5991 +
5992 +               insert_point = flow_insert_point(op);
5993 +               nplug = node_plugin_by_node(insert_point->node);
5994 +
5995 +               /* compose item data for insertion/pasting of the next piece of flow */
5996 +               flow_insert_data(op)->data = f->data;
5997 +               flow_insert_data(op)->length = what_can_fit_into_node(op);
5998 +
5999 +               if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
6000 +                       /* insert point is set to item of file we are writing to and we have to append to it */
6001 +                       assert("vs-903", insert_point->between == AFTER_UNIT);
6002 +                       nplug->change_item_size(insert_point, flow_insert_data(op)->length);
6003 +                       flow_insert_data(op)->iplug->b.paste(insert_point, flow_insert_data(op), &info);
6004 +               } else {
6005 +                       /* new item must be inserted; its length also includes item_data_overhead() */
6006 +                       pos_in_node_t new_pos;
6007 +                       flow_insert_data(op)->length += item_data_overhead(op);
6008 +
6009 +                       /* FIXME-VS: node40_create_item changes insert_point for obscure reasons, so
6010 +                          compute position of the new item here and restore it after ->create_item() */
6011 +                       switch (insert_point->between) {
6012 +                       case AFTER_ITEM:
6013 +                               new_pos = insert_point->item_pos + 1;
6014 +                               break;
6015 +                       case EMPTY_NODE:
6016 +                               new_pos = 0;
6017 +                               break;
6018 +                       case BEFORE_ITEM:
6019 +                               assert("vs-905", insert_point->item_pos == 0);
6020 +                               new_pos = 0;
6021 +                               break;
6022 +                       default:
6023 +                               impossible("vs-906", "carry_insert_flow: invalid coord");
6024 +                               new_pos = 0;
6025 +                               break;
6026 +                       }
6027 +
6028 +                       nplug->create_item(insert_point, &f->key, flow_insert_data(op), &info);
6029 +                       coord_set_item_pos(insert_point, new_pos);
6030 +               }
6031 +               coord_init_after_item_end(insert_point);
6032 +               doing->restartable = 0;
6033 +               znode_make_dirty(insert_point->node);
6034 +
6035 +               move_flow_forward(f, (unsigned) flow_insert_data(op)->length);
6036 +               something_written = 1;
6037 +       }
6038 +
6039 +       if (orig_node != flow_insert_point(op)->node) {
6040 +               /* move lock to new insert point. NOTE(review): this overwrites any error left in result by the loop above -- confirm intended */
6041 +               done_lh(orig_lh);
6042 +               init_lh(orig_lh);
6043 +               result = longterm_lock_znode(orig_lh, flow_insert_point(op)->node, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
6044 +       }
6045 +
6046 +       return result;
6047 +}
6048 +
6049 +/* implements COP_DELETE operation
6050 +
6051 +   Remove pointer to @op -> u.delete.child from its parent.
6052 +
6053 +   This function also handles killing of a tree root if last pointer from it
6054 +   was removed. This is complicated by our handling of "twig" level: root on
6055 +   twig level is never killed.
6056 +   Returns 0 on success, find_child_ptr() result if child pointer is not found, otherwise a negative error code.
6057 +*/
6058 +static int
6059 +carry_delete(carry_op * op /* operation to be performed */ ,
6060 +            carry_level * doing UNUSED_ARG     /* current carry
6061 +                                                * level */ ,
6062 +            carry_level * todo /* next carry level */ )
6063 +{
6064 +       int result;
6065 +       coord_t coord;
6066 +       coord_t coord2;
6067 +       znode *parent;
6068 +       znode *child;
6069 +       carry_plugin_info info;
6070 +       reiser4_tree *tree;
6071 +
6072 +       /*
6073 +        * This operation is called to delete internal item pointing to the
6074 +        * child node that was removed by carry from the tree on the previous
6075 +        * tree level.
6076 +        */
6077 +
6078 +       assert("nikita-893", op != NULL);
6079 +       assert("nikita-894", todo != NULL);
6080 +       assert("nikita-895", op->op == COP_DELETE);
6081 +       trace_stamp(TRACE_CARRY);
6082 +       reiser4_stat_level_inc(doing, delete);
6083 +
6084 +       coord_init_zero(&coord);
6085 +       coord_init_zero(&coord2);
6086 +
6087 +       parent = carry_real(op->node);
6088 +       child = op->u.delete.child ?
6089 +               carry_real(op->u.delete.child) : op->node->node;
6090 +       tree = znode_get_tree(child);
6091 +       RLOCK_TREE(tree);
6092 +
6093 +       /*
6094 +        * @parent was determined when carry entered parent level
6095 +        * (lock_carry_level/lock_carry_node). Since then, actual parent of
6096 +        * @child node could change due to other carry operations performed on
6097 +        * the parent level. Check for this.
6098 +        */
6099 +
6100 +       if (znode_parent(child) != parent) {
6101 +               /* NOTE-NIKITA add stat counter for this. */
6102 +               parent = znode_parent(child);
6103 +               assert("nikita-2581", find_carry_node(doing, parent));
6104 +       }
6105 +       RUNLOCK_TREE(tree);
6106 +
6107 +       assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
6108 +
6109 +       /* Twig level horrors: tree should be of height at least 2. So, last
6110 +          pointer from the root at twig level is preserved even if child is
6111 +          empty. This is ugly, but so it was architectured.
6112 +       */
6113 +
6114 +       if (znode_is_root(parent) &&
6115 +           znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
6116 +           node_num_items(parent) == 1) {
6117 +               /* Delimiting key manipulations. */
6118 +               WLOCK_DK(tree);
6119 +               znode_set_ld_key(child, znode_set_ld_key(parent, min_key()));
6120 +               znode_set_rd_key(child, znode_set_rd_key(parent, max_key()));
6121 +               WUNLOCK_DK(tree);
6122 +
6123 +               /* @child escaped imminent death! */
6124 +               ZF_CLR(child, JNODE_HEARD_BANSHEE);
6125 +               return 0;
6126 +       }
6127 +
6128 +       /* convert child pointer to the coord_t */
6129 +       result = find_child_ptr(parent, child, &coord);
6130 +       if (result != NS_FOUND) {
6131 +               warning("nikita-994", "Cannot find child pointer: %i", result);
6132 +               print_znode("child", child);
6133 +               print_znode("parent", parent);
6134 +               print_coord_content("coord", &coord);
6135 +               return result;
6136 +       }
6137 +
6138 +       coord_dup(&coord2, &coord);
6139 +       info.doing = doing;
6140 +       info.todo = todo;
6141 +       {
6142 +               /*
6143 +                * Actually kill internal item: prepare structure with
6144 +                * arguments for ->cut_and_kill() method...
6145 +                */
6146 +
6147 +               struct carry_kill_data kdata;
6148 +               kdata.params.from = &coord;
6149 +               kdata.params.to = &coord2;
6150 +               kdata.params.from_key = NULL;
6151 +               kdata.params.to_key = NULL;
6152 +               kdata.params.smallest_removed = NULL;
6153 +               kdata.flags = op->u.delete.flags;
6154 +               kdata.inode = 0;
6155 +               kdata.left = 0;
6156 +               kdata.right = 0;
6157 +               /* ... and call it. */
6158 +               result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
6159 +                                                                  &info);
6160 +       }
6161 +       doing->restartable = 0;
6162 +
6163 +       /* check whether root should be killed violently */
6164 +       if (znode_is_root(parent) &&
6165 +           /* don't kill roots at and lower than twig level */
6166 +           znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
6167 +           node_num_items(parent) == 1) {
6168 +               result = kill_tree_root(coord.node);
6169 +       }
6170 +
6171 +       return result < 0 ? result : 0;
6172 +}
6173 +
6174 +/* implements COP_CUT operation
6175 +
6176 +   Cuts part or whole content of node.
6177 +   Returns 0 on success (non-negative plugin results are squashed to 0) or the negative error code of the node plugin method.
6178 +*/
6179 +static int
6180 +carry_cut(carry_op * op /* operation to be performed */ ,
6181 +         carry_level * doing   /* current carry level */ ,
6182 +         carry_level * todo /* next carry level */ )
6183 +{
6184 +       int result;
6185 +       carry_plugin_info info;
6186 +       node_plugin *nplug;
6187 +
6188 +       assert("nikita-896", op != NULL);
6189 +       assert("nikita-897", todo != NULL);
6190 +       assert("nikita-898", op->op == COP_CUT);
6191 +       trace_stamp(TRACE_CARRY);
6192 +       reiser4_stat_level_inc(doing, cut);
6193 +
6194 +       info.doing = doing;
6195 +       info.todo = todo;
6196 +
6197 +       nplug = node_plugin_by_node(carry_real(op->node));
6198 +       if (op->u.cut_or_kill.is_cut)
6199 +               result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
6200 +       else
6201 +               result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
6202 +
6203 +       doing->restartable = 0;
6204 +       return result < 0 ? result : 0;
6205 +}
6206 +
6207 +/* helper function for carry_paste() and carry_insert_flow(): returns true if data with @key can be pasted
6208 +   into an existing item at or next to @icoord. May modify @icoord: it is re-pointed at the item to paste into, or normalized to an item boundary */
6209 +static int
6210 +can_paste(coord_t * icoord, const reiser4_key * key, const reiser4_item_data * data)
6211 +{
6212 +       coord_t circa;
6213 +       item_plugin *new_iplug;
6214 +       item_plugin *old_iplug;
6215 +       int result = 0;         /* to keep gcc shut */
6216 +
6217 +       assert("", icoord->between != AT_UNIT);
6218 +
6219 +       /* obviously, one cannot paste when node is empty---there is nothing
6220 +          to paste into. */
6221 +       if (node_is_empty(icoord->node))
6222 +               return 0;
6223 +       /* if insertion point is at the middle of the item, then paste */
6224 +       if (!coord_is_between_items(icoord))
6225 +               return 1;
6226 +       coord_dup(&circa, icoord);
6227 +       circa.between = AT_UNIT;
6228 +
6229 +       old_iplug = item_plugin_by_coord(&circa);
6230 +       new_iplug = data->iplug;
6231 +
6232 +       /* check whether we can paste to the item @icoord is "at" when we
6233 +          ignore ->between field */
6234 +       if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) {
6235 +               result = 1;
6236 +       } else if (icoord->between == BEFORE_UNIT || icoord->between == BEFORE_ITEM) {
6237 +               /* otherwise, try to glue to the item at the left, if any. NOTE(review): key check below uses @icoord while the right-hand branch uses &circa -- confirm */
6238 +               coord_dup(&circa, icoord);
6239 +               if (coord_set_to_left(&circa)) {
6240 +                       result = 0;
6241 +                       coord_init_before_item(icoord);
6242 +               } else {
6243 +                       old_iplug = item_plugin_by_coord(&circa);
6244 +                       result = (old_iplug == new_iplug) && item_can_contain_key(icoord, key, data);
6245 +                       if (result) {
6246 +                               coord_dup(icoord, &circa);
6247 +                               icoord->between = AFTER_UNIT;
6248 +                       }
6249 +               }
6250 +       } else if (icoord->between == AFTER_UNIT || icoord->between == AFTER_ITEM) {
6251 +               coord_dup(&circa, icoord);
6252 +               /* otherwise, try to glue to the item at the right, if any */
6253 +               if (coord_set_to_right(&circa)) {
6254 +                       result = 0;
6255 +                       coord_init_after_item(icoord);
6256 +               } else {
6257 +                       int (*cck) (const coord_t *, const reiser4_key *, const reiser4_item_data *);
6258 +
6259 +                       old_iplug = item_plugin_by_coord(&circa);
6260 +
6261 +                       cck = old_iplug->b.can_contain_key;
6262 +                       if (cck == NULL)
6263 +                               /* item doesn't define ->can_contain_key
6264 +                                  method? So it is not expandable. */
6265 +                               result = 0;
6266 +                       else {
6267 +                               result = (old_iplug == new_iplug) && cck(&circa /*icoord */ , key, data);
6268 +                               if (result) {
6269 +                                       coord_dup(icoord, &circa);
6270 +                                       icoord->between = BEFORE_UNIT;
6271 +                               }
6272 +                       }
6273 +               }
6274 +       } else
6275 +               impossible("nikita-2513", "Nothing works");
6276 +       if (result) {
6277 +               if (icoord->between == BEFORE_ITEM) {
6278 +                       assert("vs-912", icoord->unit_pos == 0);
6279 +                       icoord->between = BEFORE_UNIT;
6280 +               } else if (icoord->between == AFTER_ITEM) {
6281 +                       coord_init_after_item_end(icoord);
6282 +               }
6283 +       }
6284 +       return result;
6285 +}
6286 +
6287 +/* implements COP_PASTE operation
6288 +
6289 +   Paste data into existing item. This is complicated by the fact that after
6290 +   we shifted something to the left or right neighbors trying to free some
6291 +   space, item we were supposed to paste into can be in different node than
6292 +   insertion coord. If so, we are no longer doing paste, but insert: the
6293 +   operation is re-dispatched as COP_INSERT. See comments in insert_paste_common().
6294 +
6295 +*/
6296 +static int
6297 +carry_paste(carry_op * op /* operation to be performed */ ,
6298 +           carry_level * doing UNUSED_ARG      /* current carry
6299 +                                                * level */ ,
6300 +           carry_level * todo /* next carry level */ )
6301 +{
6302 +       znode *node;
6303 +       carry_insert_data cdata;
6304 +       coord_t dcoord;
6305 +       reiser4_item_data data;
6306 +       int result;
6307 +       int real_size;
6308 +       item_plugin *iplug;
6309 +       carry_plugin_info info;
6310 +       coord_t *coord;
6311 +
6312 +       assert("nikita-982", op != NULL);
6313 +       assert("nikita-983", todo != NULL);
6314 +       assert("nikita-984", op->op == COP_PASTE);
6315 +
6316 +       trace_stamp(TRACE_CARRY);
6317 +       reiser4_stat_level_inc(doing, paste);
6318 +
6319 +       coord_init_zero(&dcoord);
6320 +
6321 +       result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
6322 +       if (result != 0)
6323 +               return result;
6324 +
6325 +       coord = op->u.insert.d->coord;
6326 +
6327 +       /* handle case when op -> u.insert.coord doesn't point to the item
6328 +          of required type: restart as insert through the COP_INSERT handler. */
6329 +       if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
6330 +               op->op = COP_INSERT;
6331 +               op->u.insert.type = COPT_PASTE_RESTARTED;
6332 +               reiser4_stat_level_inc(doing, paste_restarted);
6333 +               result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
6334 +
6335 +               return result;
6336 +       }
6337 +
6338 +       node = coord->node;
6339 +       iplug = item_plugin_by_coord(coord);
6340 +       assert("nikita-992", iplug != NULL);
6341 +
6342 +       assert("nikita-985", node != NULL);
6343 +       assert("nikita-986", node_plugin_by_node(node) != NULL);
6344 +
6345 +       assert("nikita-987", space_needed_for_op(node, op) <= znode_free_space(node));
6346 +
6347 +       assert("nikita-1286", coord_is_existing_item(coord));
6348 +
6349 +       /*
6350 +        * if item is expanded as a result of this operation, we should first
6351 +        * change item size, then call ->b.paste item method. If item is
6352 +        * shrunk, it should be done the other way around: first call ->b.paste
6353 +        * method, then reduce item size.
6354 +        */
6355 +
6356 +       real_size = space_needed_for_op(node, op);
6357 +       if (real_size > 0)
6358 +               node->nplug->change_item_size(coord, real_size);
6359 +
6360 +       doing->restartable = 0;
6361 +       info.doing = doing;
6362 +       info.todo = todo;
6363 +
6364 +       result = iplug->b.paste(coord, op->u.insert.d->data, &info);
6365 +
6366 +       if (real_size < 0)
6367 +               node->nplug->change_item_size(coord, real_size);
6368 +
6369 +       /* if we pasted at the beginning of the item, update item's key. */
6370 +       if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
6371 +               node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
6372 +
6373 +       znode_make_dirty(node);
6374 +       return result;
6375 +}
6376 +
6377 +/* handle carry COP_EXTENT operation: make room at the leaf level, then post COP_INSERT of the extent item to the parent level. */
6378 +static int
6379 +carry_extent(carry_op * op /* operation to perform */ ,
6380 +            carry_level * doing        /* queue of operations @op
6381 +                                        * is part of */ ,
6382 +            carry_level * todo /* queue where new operations
6383 +                                * are accumulated */ )
6384 +{
6385 +       znode *node;
6386 +       carry_insert_data cdata;
6387 +       coord_t coord;
6388 +       reiser4_item_data data;
6389 +       carry_op *delete_dummy;
6390 +       carry_op *insert_extent;
6391 +       int result;
6392 +       carry_plugin_info info;
6393 +
6394 +       assert("nikita-1751", op != NULL);
6395 +       assert("nikita-1752", todo != NULL);
6396 +       assert("nikita-1753", op->op == COP_EXTENT);
6397 +
6398 +       trace_stamp(TRACE_CARRY);
6399 +       reiser4_stat_level_inc(doing, extent);
6400 +
6401 +       /* extent insertion overview:
6402 +
6403 +          extents live on the TWIG LEVEL, which is level one above the leaf
6404 +          one. This complicates extent insertion logic somewhat: it may
6405 +          happen (and is going to happen all the time) that in logical key
6406 +          ordering extent has to be placed between items I1 and I2, located
6407 +          at the leaf level, but I1 and I2 are in the same formatted leaf
6408 +          node N1. To insert extent one has to
6409 +
6410 +           (1) reach node N1 and shift data between N1, its neighbors and
6411 +           possibly newly allocated nodes until I1 and I2 fall into different
6412 +           nodes. Since I1 and I2 are still neighboring items in logical key
6413 +           order, they will necessarily be the outermost items in their
6414 +           respective nodes.
6415 +
6416 +           (2) After this new extent item is inserted into node on the twig
6417 +           level.
6418 +
6419 +          Fortunately this process can reuse almost all code from standard
6420 +          insertion procedure (viz. make_space() and insert_paste_common()),
6421 +          due to the following observation: make_space() only shifts data up
6422 +          to and excluding or including insertion point. It never
6423 +          "over-moves" through insertion point. Thus, one can use
6424 +          make_space() to perform step (1). All that is required for this is to
6425 +          instruct free_space_shortage() to keep make_space() shifting data
6426 +          until insertion point is at the node border.
6427 +
6428 +       */
6429 +
6430 +       /* perform common functionality of insert and paste. */
6431 +       result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
6432 +       if (result != 0)
6433 +               return result;
6434 +
6435 +       node = op->u.extent.d->coord->node;
6436 +       assert("nikita-1754", node != NULL);
6437 +       assert("nikita-1755", node_plugin_by_node(node) != NULL);
6438 +       assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
6439 +
6440 +       /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
6441 +          extent fits between items. */
6442 +
6443 +       info.doing = doing;
6444 +       info.todo = todo;
6445 +
6446 +       /* there is another complication due to placement of extents on the
6447 +          twig level: extents are "rigid" in the sense that key-range
6448 +          occupied by extent cannot grow indefinitely to the right as it is
6449 +          for the formatted leaf nodes. Because of this when search finds two
6450 +          adjacent extents on the twig level, it has to "drill" to the leaf
6451 +          level, creating new node. Here we post COP_DELETE to remove this (empty) node.
6452 +       */
6453 +       if (node_is_empty(node)) {
6454 +               delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
6455 +               if (IS_ERR(delete_dummy))
6456 +                       return PTR_ERR(delete_dummy);
6457 +               delete_dummy->u.delete.child = NULL;
6458 +               delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
6459 +               ZF_SET(node, JNODE_HEARD_BANSHEE);
6460 +       }
6461 +
6462 +       /* proceed with inserting extent item into parent. We are definitely
6463 +          inserting rather than pasting if we get that far. */
6464 +       insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
6465 +       if (IS_ERR(insert_extent))
6466 +               /* @delete_dummy will be automatically destroyed on the level
6467 +                  exiting  */
6468 +               return PTR_ERR(insert_extent);
6469 +       /* NOTE-NIKITA insertion by key is simplest option here. Another
6470 +          possibility is to insert on the left or right of already existing
6471 +          item.
6472 +       */
6473 +       insert_extent->u.insert.type = COPT_KEY;
6474 +       insert_extent->u.insert.d = op->u.extent.d;
6475 +       assert("nikita-1719", op->u.extent.d->key != NULL);
6476 +       insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
6477 +       insert_extent->u.insert.flags = znode_get_tree(node)->carry.new_extent_flags;
6478 +
6479 +       /*
6480 +        * if carry was asked to track lock handle we should actually track
6481 +        * lock handle on the twig node rather than on the leaf where
6482 +        * operation was started from. Transfer tracked lock handle to @todo.
6483 +        */
6484 +       if (doing->track_type) {
6485 +               assert("nikita-3242", doing->tracked != NULL);
6486 +               assert("nikita-3244", todo->tracked == NULL);
6487 +               todo->tracked = doing->tracked;
6488 +               todo->track_type = CARRY_TRACK_NODE;
6489 +               doing->tracked = NULL;
6490 +               doing->track_type = 0;
6491 +       }
6492 +
6493 +       return 0;
6494 +}
6495 +
6496 +/* update key in @parent between pointers to @left and @right.
6497 +
6498 +   Find coords of @left and @right and update delimiting key between them.
6499 +   This is helper function called by carry_update(). Finds position of
6500 +   internal item involved and updates its key through the node plugin of @parent.
6501 +   Returns 0 with *error_msg == NULL on success; on failure returns non-zero and points *error_msg at a description.
6502 +*/
6503 +static int
6504 +update_delimiting_key(znode * parent   /* node key is updated
6505 +                                        * in */ ,
6506 +                     znode * left /* child of @parent */ ,
6507 +                     znode * right /* child of @parent */ ,
6508 +                     carry_level * doing       /* current carry
6509 +                                                * level */ ,
6510 +                     carry_level * todo        /* parent carry
6511 +                                                * level */ ,
6512 +                     const char **error_msg    /* place to
6513 +                                                * store error
6514 +                                                * message */ )
6515 +{
6516 +       coord_t left_pos;
6517 +       coord_t right_pos;
6518 +       int result;
6519 +       reiser4_key ldkey;
6520 +       carry_plugin_info info;
6521 +
6522 +       assert("nikita-1177", right != NULL);
6523 +       /* find position of the right child in a parent */
6524 +       result = find_child_ptr(parent, right, &right_pos);
6525 +       if (result != NS_FOUND) {
6526 +               *error_msg = "Cannot find position of right child";
6527 +               return result;
6528 +       }
6529 +
6530 +       if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
6531 +               /* find position of the left child in a parent */
6532 +               result = find_child_ptr(parent, left, &left_pos);
6533 +               if (result != NS_FOUND) {
6534 +                       *error_msg = "Cannot find position of left child";
6535 +                       return result;
6536 +               }
6537 +               assert("nikita-1355", left_pos.node != NULL);
6538 +       } else
6539 +               left_pos.node = NULL;
6540 +
6541 +       /* check that they are separated by exactly one key and are basically
6542 +          sane (debugging builds only; left_pos.node == NULL means "no left child to check") */
6543 +       if (REISER4_DEBUG) {
6544 +               if ((left_pos.node != NULL)
6545 +                   && !coord_is_existing_unit(&left_pos)) {
6546 +                       *error_msg = "Left child is bastard";
6547 +                       return RETERR(-EIO);
6548 +               }
6549 +               if (!coord_is_existing_unit(&right_pos)) {
6550 +                       *error_msg = "Right child is bastard";
6551 +                       return RETERR(-EIO);
6552 +               }
6553 +               if (left_pos.node != NULL &&
6554 +                   !coord_are_neighbors(&left_pos, &right_pos)) {
6555 +                       *error_msg = "Children are not direct siblings";
6556 +                       return RETERR(-EIO);
6557 +               }
6558 +       }
6559 +       *error_msg = NULL;
6560 +
6561 +       info.doing = doing;
6562 +       info.todo = todo;
6563 +
6564 +       /*
6565 +        * If child node is not empty, new key of internal item is a key of
6566 +        * leftmost item in the child node. If the child is empty (marked
6567 +        * JNODE_HEARD_BANSHEE), take its right delimiting key as a new key of
6568 +        * the internal item. Precise key in the latter case is not important
6569 +        * per se, because the child (and the internal item) are going to be killed shortly anyway, but we
6570 +        * have to preserve correct order of keys in the parent node.
6571 +        */
6572 +
6573 +       if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
6574 +               leftmost_key_in_node(right, &ldkey);
6575 +       else
6576 +               UNDER_RW_VOID(dk, znode_get_tree(parent), read,
6577 +                             ldkey = *znode_get_rd_key(right));
6578 +       node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
6579 +       doing->restartable = 0;
6580 +       znode_make_dirty(parent);
6581 +       return 0;
6582 +}
6583 +
6584 +/* implements COP_UPDATE operation
6585 +
6586 +   Update delimiting keys: refresh the key of the internal item pointing to the child referenced by @op->node.
6587 +   Returns 0 on success; on failure (-EIO if the parent cannot be found) logs a warning and prints the znodes.
6588 +*/
6589 +static int
6590 +carry_update(carry_op * op /* operation to be performed */ ,
6591 +            carry_level * doing /* current carry level */ ,
6592 +            carry_level * todo /* next carry level */ )
6593 +{
6594 +       int result;
6595 +       carry_node *missing UNUSED_ARG;
6596 +       znode *left;
6597 +       znode *right;
6598 +       carry_node *lchild;
6599 +       carry_node *rchild;
6600 +       const char *error_msg;
6601 +       reiser4_tree *tree;
6602 +
6603 +       /*
6604 +        * This operation is called to update key of internal item. This is
6605 +        * necessary when carry shifted or cut data on the child
6606 +        * level. Arguments of this operation are:
6607 +        *
6608 +        *     @right --- child node. Operation should update key of internal
6609 +        *     item pointing to @right.
6610 +        *
6611 +        *     @left --- left neighbor of @right. This parameter is optional.
6612 +        */
6613 +
6614 +       assert("nikita-902", op != NULL);
6615 +       assert("nikita-903", todo != NULL);
6616 +       assert("nikita-904", op->op == COP_UPDATE);
6617 +       trace_stamp(TRACE_CARRY);
6618 +       reiser4_stat_level_inc(doing, update);
6619 +
6620 +       lchild = op->u.update.left;
6621 +       rchild = op->node;
6622 +
6623 +       if (lchild != NULL) {
6624 +               assert("nikita-1001", lchild->parent);
6625 +               assert("nikita-1003", !lchild->left);
6626 +               left = carry_real(lchild);
6627 +       } else
6628 +               left = NULL;
6629 +
6630 +       tree = znode_get_tree(rchild->node);
6631 +       RLOCK_TREE(tree);
6632 +       right = znode_parent(rchild->node);
6633 +       if (REISER4_STATS) {
6634 +               znode *old_right;
6635 +               if (rchild != NULL) { /* rchild->node was already dereferenced above, so this check cannot fail here */
6636 +                       assert("nikita-1000", rchild->parent);
6637 +                       assert("nikita-1002", !rchild->left);
6638 +                       old_right = carry_real(rchild);
6639 +               } else
6640 +                       old_right = NULL;
6641 +               if (znode_parent(rchild->node) != old_right)
6642 +                       /* parent node was split, and pointer to @rchild was
6643 +                          inserted/moved into new node. Wonders of balkancing
6644 +                          (sic.).
6645 +                       */
6646 +                       reiser4_stat_level_inc(doing, half_split_race);
6647 +       }
6648 +       RUNLOCK_TREE(tree);
6649 +
6650 +       if (right != NULL) {
6651 +               result = update_delimiting_key(right,
6652 +                                              lchild ? lchild->node : NULL,
6653 +                                              rchild->node,
6654 +                                              doing, todo, &error_msg);
6655 +       } else {
6656 +               error_msg = "Cannot find node to update key in";
6657 +               result = RETERR(-EIO);
6658 +       }
6659 +       /* operation will be reposted to the next level by the
6660 +          ->update_item_key() method of node plugin, if necessary. */
6661 +
6662 +       if (result != 0) {
6663 +               warning("nikita-999", "Error updating delimiting key: %s (%i)", error_msg ? : "", result);
6664 +               print_znode("left", left);
6665 +               print_znode("right", right);
6666 +               print_znode("lchild", lchild ? lchild->node : NULL);
6667 +               print_znode("rchild", rchild->node);
6668 +       }
6669 +       return result;
6670 +}
6671 +
6672 +/* move items from @node during carry. Always returns 0: the result of ->shift() is only checked by assertion. */
6673 +static int
6674 +carry_shift_data(sideof side /* in what direction to move data */ ,
6675 +                coord_t * insert_coord /* coord where new item
6676 +                                          * is to be inserted */ ,
6677 +                znode * node /* node which data are moved from */ ,
6678 +                carry_level * doing /* active carry queue */ ,
6679 +                carry_level * todo     /* carry queue where new
6680 +                                        * operations are to be put
6681 +                                        * in */ ,
6682 +                unsigned int including_insert_coord_p  /* true if
6683 +                                                        * @insertion_coord
6684 +                                                        * can be moved */ )
6685 +{
6686 +       int result;
6687 +       znode *source;
6688 +       carry_plugin_info info;
6689 +       node_plugin *nplug;
6690 +
6691 +       source = insert_coord->node; /* NOTE(review): named "source" although @node is documented as the node data are moved from - confirm direction */
6692 +
6693 +       info.doing = doing;
6694 +       info.todo = todo;
6695 +
6696 +       nplug = node_plugin_by_node(node);
6697 +       result = nplug->shift(insert_coord, node,
6698 +                             (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
6699 +                             (int) including_insert_coord_p, &info);
6700 +       /* the only error ->shift() method of node plugin can return is
6701 +          -ENOMEM due to carry node/operation allocation. */
6702 +       assert("nikita-915", result >= 0 || result == -ENOMEM);
6703 +       if (result > 0) {
6704 +               /*
6705 +                * if some number of bytes was actually shifted, mark nodes
6706 +                * dirty, and carry level as non-restartable.
6707 +                */
6708 +               doing->restartable = 0;
6709 +               znode_make_dirty(source);
6710 +               znode_make_dirty(node);
6711 +       }
6712 +
6713 +       assert("nikita-2077", coord_check(insert_coord));
6714 +       return 0;
6715 +}
6716 +
6717 +typedef carry_node *(*carry_iterator) (carry_node * node);
6718 +static carry_node *find_dir_carry(carry_node * node, carry_level * level, carry_iterator iterator);
6719 +
6720 +/* look for the left neighbor of given carry node in a carry queue.
6721 +
6722 +   This is used by find_left_neighbor(), but I am not sure that this
6723 +   really gives any advantage. More statistics required.
6724 +   Returns whatever find_dir_carry() returns when stepping with pool_level_list_prev (NULL if none found).
6725 +*/
6726 +reiser4_internal carry_node *
6727 +find_left_carry(carry_node * node      /* node to find left neighbor
6728 +                                        * of */ ,
6729 +               carry_level * level /* level to scan */ )
6730 +{
6731 +       return find_dir_carry(node, level, (carry_iterator) pool_level_list_prev);
6732 +}
6733 +
6734 +/* look for the right neighbor of given carry node in a
6735 +   carry queue.
6736 +
6737 +   This is used by find_right_neighbor(), but I am not sure that this
6738 +   really gives any advantage. More statistics required.
6739 +   Returns whatever find_dir_carry() returns when stepping with pool_level_list_next (NULL if none found).
6740 +*/
6741 +reiser4_internal carry_node *
6742 +find_right_carry(carry_node * node     /* node to find right neighbor
6743 +                                          * of */ ,
6744 +                carry_level * level /* level to scan */ )
6745 +{
6746 +       return find_dir_carry(node, level, (carry_iterator) pool_level_list_next);
6747 +}
6748 +
6749 +/* look for the left or right neighbor of given carry node in a carry
6750 +   queue.
6751 +   Returns the first carry node whose carry_real() differs from that of @node, or NULL at the end of the list.
6752 +   Helper function used by find_{left|right}_carry().
6753 +*/
6754 +static carry_node *
6755 +find_dir_carry(carry_node * node       /* node to start scanning
6756 +                                        * from */ ,
6757 +              carry_level * level /* level to scan */ ,
6758 +              carry_iterator iterator  /* operation to
6759 +                                        * move to the next
6760 +                                        * node */ )
6761 +{
6762 +       carry_node *neighbor;
6763 +
6764 +       assert("nikita-1059", node != NULL);
6765 +       assert("nikita-1060", level != NULL);
6766 +
6767 +       /* scan list of carry nodes on this level dir-ward, skipping all
6768 +          carry nodes referencing the same znode. */
6769 +       neighbor = node;
6770 +       while (1) {
6771 +               neighbor = iterator(neighbor);
6772 +               if (pool_level_list_end(&level->nodes, &neighbor->header))
6773 +                       return NULL;
6774 +               if (carry_real(neighbor) != carry_real(node))
6775 +                       return neighbor;
6776 +       }
6777 +}
6778 +
6779 +/*
6780 + * Memory reservation estimation.
6781 + *
6782 + * Carry process proceeds through tree levels upwards. Carry assumes that it
6783 + * takes tree in consistent state (e.g., that search tree invariants hold),
6784 + * and leaves tree consistent after it finishes. This means that when some
6785 + * error occurs carry cannot simply return if there are pending carry
6786 + * operations. Generic solution for this problem is carry-undo either as
6787 + * transaction manager feature (requiring checkpoints and isolation), or
6788 + * through some carry specific mechanism.
6789 + *
6790 + * Our current approach is to panic if carry hits an error while tree is
6791 + * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
6792 + * this "memory reservation" mechanism was added.
6793 + *
6794 + * Memory reservation is implemented by perthread-pages.diff patch from
6795 + * core-patches. Its API is defined in <linux/gfp.h>
6796 + *
6797 + *     int  perthread_pages_reserve(int nrpages, int gfp);
6798 + *     void perthread_pages_release(int nrpages);
6799 + *     int  perthread_pages_count(void);
6800 + *
6801 + * carry estimates its worst case memory requirements at the entry, reserves
6802 + * enough memory, and releases unused pages before returning.
6803 + *
6804 + * Code below estimates worst case memory requirements for a given carry
6805 + * queue. This is done by summing worst case memory requirements for each
6806 + * operation in the queue.
6807 + *
6808 + */
6809 +
6810 +/*
6811 + * Memory requirements of many operations depend on the tree
6812 + * height. For example, item insertion requires new node to be inserted at
6813 + * each tree level in the worst case. What tree height should be used for
6814 + * estimation? Current tree height is wrong, because tree height can change
6815 + * between the time when estimation was done and the time when operation is
6816 + * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT)
6817 + * is also not desirable, because it would lead to the huge over-estimation
6818 + * all the time. Plausible solution is "capped tree height": if current tree
6819 + * height is less than some TREE_HEIGHT_CAP constant, capped tree height is
6820 + * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is
6821 + * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely
6822 + * to be increased even more during short interval of time.
6823 + */
6824 +#define TREE_HEIGHT_CAP (5)
6825 +
6826 +/* return capped tree height for the @tree. See comment above. */
6827 +static int
6828 +cap_tree_height(reiser4_tree * tree)
6829 +{
6830 +       return max_t(int, tree->height, TREE_HEIGHT_CAP);
6831 +}
6832 +
6833 +/* return capped tree height for the current tree. */
6834 +static int capped_height(void)
6835 +{
6836 +       return cap_tree_height(current_tree);
6837 +}
6838 +
6839 +/* return number of pages required to store given number of bytes */
6840 +static int bytes_to_pages(int bytes)
6841 +{
6842 +       return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
6843 +}
6844 +
6845 +/* how many pages are required to allocate znodes during item insertion. */
6846 +static int
6847 +carry_estimate_znodes(void)
6848 +{
6849 +       /*
6850 +        * Note that we have a problem here: there is no way to
6851 +        * reserve pages specifically for the given slab. This means that
6852 +        * these pages can be hijacked for some other end.
6853 +        */
6854 +
6855 +       /* in the worst case we need 3 new znodes on each tree level */
6856 +       return bytes_to_pages(capped_height() * sizeof(znode) * 3);
6857 +}
6858 +
6859 +/*
6860 + * how many pages are required to load bitmaps. One bitmap per level.
6861 + */
6862 +static int
6863 +carry_estimate_bitmaps(void)
6864 +{
6865 +       if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
6866 +               int bytes;
6867 +
6868 +               bytes = capped_height() *
6869 +                       (0 +   /* bnode should be added, but it is private to
6870 +                               * bitmap.c, skip for now. */
6871 +                        2 * sizeof(jnode));      /* working and commit jnodes */
6872 +               return bytes_to_pages(bytes) + 2; /* and their contents */
6873 +       } else
6874 +               /* bitmaps were pre-loaded during mount */
6875 +               return 0;
6876 +}
6877 +
6878 +/* worst case item insertion memory requirements */
6879 +static int
6880 +carry_estimate_insert(carry_op * op, carry_level * level)
6881 +{
6882 +       return
6883 +               carry_estimate_bitmaps() +
6884 +               carry_estimate_znodes() +
6885 +               1 + /* new atom */
6886 +               capped_height() + /* new block on each level */
6887 +               1 + /* and possibly extra new block at the leaf level */
6888 +               3; /* loading of leaves into memory */
6889 +}
6890 +
6891 +/* worst case item deletion memory requirements */
6892 +static int
6893 +carry_estimate_delete(carry_op * op, carry_level * level)
6894 +{
6895 +       return
6896 +               carry_estimate_bitmaps() +
6897 +               carry_estimate_znodes() +
6898 +               1 + /* new atom */
6899 +               3; /* loading of leaves into memory */
6900 +}
6901 +
6902 +/* worst case tree cut memory requirements */
6903 +static int
6904 +carry_estimate_cut(carry_op * op, carry_level * level)
6905 +{
6906 +       return
6907 +               carry_estimate_bitmaps() +
6908 +               carry_estimate_znodes() +
6909 +               1 + /* new atom */
6910 +               3; /* loading of leaves into memory */
6911 +}
6912 +
6913 +/* worst case memory requirements of pasting into item */
6914 +static int
6915 +carry_estimate_paste(carry_op * op, carry_level * level)
6916 +{
6917 +       return
6918 +               carry_estimate_bitmaps() +
6919 +               carry_estimate_znodes() +
6920 +               1 + /* new atom */
6921 +               capped_height() + /* new block on each level */
6922 +               1 + /* and possibly extra new block at the leaf level */
6923 +               3; /* loading of leaves into memory */
6924 +}
6925 +
6926 +/* worst case memory requirements of extent insertion */
6927 +static int
6928 +carry_estimate_extent(carry_op * op, carry_level * level)
6929 +{
6930 +       return
6931 +               carry_estimate_insert(op, level) + /* insert extent */
6932 +               carry_estimate_delete(op, level);  /* kill leaf */
6933 +}
6934 +
6935 +/* worst case memory requirements of key update */
6936 +static int
6937 +carry_estimate_update(carry_op * op, carry_level * level)
6938 +{
6939 +       return 0;
6940 +}
6941 +
6942 +/* worst case memory requirements of flow insertion */
6943 +static int
6944 +carry_estimate_insert_flow(carry_op * op, carry_level * level)
6945 +{
6946 +       int newnodes;
6947 +
6948 +       newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
6949 +                      CARRY_FLOW_NEW_NODES_LIMIT);
6950 +       /*
6951 +        * roughly estimate insert_flow as a sequence of insertions.
6952 +        */
6953 +       return newnodes * carry_estimate_insert(op, level);
6954 +}
6955 +
6956 +/* This is dispatch table for carry operations. It can be trivially
6957 +   abstracted into useful plugin: tunable balancing policy is a good
6958 +   thing. */
6959 +reiser4_internal carry_op_handler op_dispatch_table[COP_LAST_OP] = {
6960 +       [COP_INSERT] = {
6961 +               .handler = carry_insert,
6962 +               .estimate = carry_estimate_insert
6963 +       },
6964 +       [COP_DELETE] = {
6965 +               .handler = carry_delete,
6966 +               .estimate = carry_estimate_delete
6967 +       },
6968 +       [COP_CUT] = {
6969 +               .handler = carry_cut,
6970 +               .estimate = carry_estimate_cut
6971 +       },
6972 +       [COP_PASTE] = {
6973 +               .handler = carry_paste,
6974 +               .estimate = carry_estimate_paste
6975 +       },
6976 +       [COP_EXTENT] = {
6977 +               .handler = carry_extent,
6978 +               .estimate = carry_estimate_extent
6979 +       },
6980 +       [COP_UPDATE] = {
6981 +               .handler = carry_update,
6982 +               .estimate = carry_estimate_update
6983 +       },
6984 +       [COP_INSERT_FLOW] = {
6985 +               .handler = carry_insert_flow,
6986 +               .estimate = carry_estimate_insert_flow
6987 +       }
6988 +};
6989 +
6990 +/* Make Linus happy.
6991 +   Local variables:
6992 +   c-indentation-style: "K&R"
6993 +   mode-name: "LC"
6994 +   c-basic-offset: 8
6995 +   tab-width: 8
6996 +   fill-column: 120
6997 +   scroll-step: 1
6998 +   End:
6999 +*/
7000 diff -rupN linux-2.6.8-rc3/fs/reiser4/carry_ops.h linux-2.6.8-rc3-a/fs/reiser4/carry_ops.h
7001 --- linux-2.6.8-rc3/fs/reiser4/carry_ops.h      1970-01-01 03:00:00.000000000 +0300
7002 +++ linux-2.6.8-rc3-a/fs/reiser4/carry_ops.h    2004-08-05 21:20:53.331608130 +0400
7003 @@ -0,0 +1,41 @@
7004 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7005 +
7006 +/* implementation of carry operations. See carry_ops.c for details. */
7007 +
7008 +#if !defined( __CARRY_OPS_H__ )
7009 +#define __CARRY_OPS_H__
7010 +
7011 +#include "forward.h"
7012 +#include "znode.h"
7013 +#include "carry.h"
7014 +
7015 +/* carry operation handlers */
7016 +typedef struct carry_op_handler {
7017 +       /* perform operation */
7018 +       int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
7019 +       /* estimate memory requirements for @op */
7020 +       int (*estimate) (carry_op * op, carry_level * level);
7021 +} carry_op_handler;
7022 +
7023 +/* This is dispatch table for carry operations. It can be trivially
7024 +   abstracted into useful plugin: tunable balancing policy is a good
7025 +   thing. */
7026 +extern carry_op_handler op_dispatch_table[COP_LAST_OP];
7027 +
7028 +unsigned int space_needed(const znode * node, const coord_t * coord, const reiser4_item_data * data, int inserting);
7029 +extern carry_node *find_left_carry(carry_node * node, carry_level * level);
7030 +extern carry_node *find_right_carry(carry_node * node, carry_level * level);
7031 +
7032 +/* __CARRY_OPS_H__ */
7033 +#endif
7034 +
7035 +/* Make Linus happy.
7036 +   Local variables:
7037 +   c-indentation-style: "K&R"
7038 +   mode-name: "LC"
7039 +   c-basic-offset: 8
7040 +   tab-width: 8
7041 +   fill-column: 120
7042 +   scroll-step: 1
7043 +   End:
7044 +*/
7045 diff -rupN linux-2.6.8-rc3/fs/reiser4/cluster.c linux-2.6.8-rc3-a/fs/reiser4/cluster.c
7046 --- linux-2.6.8-rc3/fs/reiser4/cluster.c        1970-01-01 03:00:00.000000000 +0300
7047 +++ linux-2.6.8-rc3-a/fs/reiser4/cluster.c      2004-08-05 21:20:53.337606865 +0400
7048 @@ -0,0 +1,71 @@
7049 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7050 +
7051 +/* Contains cluster operations for cryptcompress object plugin (see
7052 +   http://www.namesys.com/cryptcompress_design.txt for details). */
7053 +
7054 +/*         Concepts of clustering. Definition of cluster size.
7055 +          Data clusters, page clusters, disk clusters.
7056 +
7057 +
7058 +   In order to compress plain text we first should split it into chunks.
7059 +   Then we process each chunk independently by the following function:
7060 +
7061 +   void alg(char *input_ptr, int input_length, char *output_ptr, int *output_length);
7062 +
7063 +   where:
7064 +   input_ptr is a pointer to the first byte of input chunk (that contains plain text),
7065 +   input_length is the length of the input chunk,
7066 +   output_ptr is a pointer to the first byte of output chunk (that contains processed text),
7067 +   *output_length is the length of the output chunk.
7068 +
7069 +   the length of output chunk depends both on input_length and on the content of
7070 +   input chunk.  input_length (which can be assigned an arbitrary value) affects the
7071 +   compression quality (the larger input_length, the better the compression quality).
7072 +   For each cryptcompress file we assign special attribute - cluster size:
7073 +
7074 +   Cluster size is a file attribute, which determines the maximal size
7075 +   of input chunk that we use for compression.
7076 +
7077 +   So if we wanna compress a 10K-file with a cluster size of 4K, we split this file
7078 +   into three chunks (first and second - 4K, third - 2K). Those chunks are
7079 +   clusters in the space of file offsets (data clusters).
7080 +
7081 +   Cluster sizes are represented as (PAGE_CACHE_SIZE << shift), where
7082 +   shift = 0, 1, 2, ...  You'll note that this representation
7083 +   affects the allowed values for cluster size.  This is stored in
7084 +   disk stat-data (CLUSTER_STAT; layout is in reiser4_cluster_stat, see
7085 +   plugin/item/static_stat.h for details).
7086 +   Note that working with
7087 +   cluster_size > PAGE_SIZE (when cluster_shift > 0, and cluster contains more
7088 +   than one page) is suboptimal because before compression we should assemble
7089 +   all cluster pages into one flow (this means superfluous memcpy during
7090 +   read/write). So the better way to increase cluster size (and therefore
7091 +   compression quality) is making PAGE_SIZE larger (for instance by page
7092 +   clustering stuff of William Lee). But if you need PAGE_SIZE < cluster_size,
7093 +   then use the page clustering offered by reiser4.
7094 +
7095 +   The inode mapping of a cryptcompress file contains pages filled by plain text.
7096 +   Cluster size also defines clustering in address space. For example,
7097 +   101K-file with cluster size 16K (cluster shift = 2), which can be mapped
7098 +   into 26 pages, has 7 "page clusters": first six clusters contain 4 pages each
7099 +   and the last cluster contains 2 pages (for the file tail).
7100 +
7101 +   We split each output (compressed) chunk into special items to provide
7102 +   tight packing of data on disk (currently only ctails hold compressed data).
7103 +   This set of items we call a "disk cluster".
7104 +
7105 +   Each cluster is defined (like pages are) by its index (e.g. offset,
7106 +   but the unit is cluster size instead of PAGE_SIZE). Key offset of
7107 +   the first unit of the first item of each disk cluster (we call this a
7108 +   "key of disk cluster") is a multiple of the cluster index.
7109 +
7110 +   All read/write/truncate operations are performed upon clusters.
7111 +   For example, if we wanna read 40K of a cryptcompress file with cluster size 16K
7112 +   from offset = 20K, we first need to read three clusters (of indexes 1, 2, 3). This
7113 +   means that all main methods of cryptcompress object plugin call appropriate
7114 +   cluster operation.
7115 +
7116 +   For the same index we use one structure (type reiser4_cluster_t) to
7117 +   represent all data/page/disk clusters.  (EDWARD-FIXME-HANS: are you
7118 +   sure that is good style? and where is the code that goes with this comment....;-) )
7119 +*/
7120 diff -rupN linux-2.6.8-rc3/fs/reiser4/cluster.h linux-2.6.8-rc3-a/fs/reiser4/cluster.h
7121 --- linux-2.6.8-rc3/fs/reiser4/cluster.h        1970-01-01 03:00:00.000000000 +0300
7122 +++ linux-2.6.8-rc3-a/fs/reiser4/cluster.h      2004-08-05 21:20:53.471578607 +0400
7123 @@ -0,0 +1,182 @@
7124 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7125 +
7126 +/* This file contains page/cluster index translators and offset modulators
7127 +   See http://www.namesys.com/cryptcompress_design.html for details */
7128 +
7129 +#if !defined( __FS_REISER4_CLUSTER_H__ )
7130 +#define __FS_REISER4_CLUSTER_H__
7131 +
7132 +static inline loff_t min_count(loff_t a, loff_t b)
7133 +{
7134 +       return (a < b ? a : b);
7135 +}
7136 +
7137 +static inline __u8 inode_cluster_shift (struct inode * inode)
7138 +{
7139 +       assert("edward-92", inode != NULL);
7140 +       assert("edward-93", reiser4_inode_data(inode) != NULL);
7141 +       assert("edward-94", inode_get_flag(inode, REISER4_CLUSTER_KNOWN));
7142 +
7143 +       return reiser4_inode_data(inode)->cluster_shift;
7144 +}
7145 +
7146 +/* returns number of pages in the cluster */
7147 +static inline int inode_cluster_pages (struct inode * inode)
7148 +{
7149 +       return (1 << inode_cluster_shift(inode));
7150 +}
7151 +
7152 +static inline size_t inode_cluster_size (struct inode * inode)
7153 +{
7154 +       assert("edward-96", inode != NULL);
7155 +
7156 +       return (PAGE_CACHE_SIZE << inode_cluster_shift(inode));
7157 +}
7158 +
7159 +static inline unsigned long
7160 +pg_to_clust(unsigned long idx, struct inode * inode)
7161 +{
7162 +       return idx >> inode_cluster_shift(inode);
7163 +}
7164 +
7165 +static inline unsigned long
7166 +clust_to_pg(unsigned long idx, struct inode * inode)
7167 +{
7168 +       return idx << inode_cluster_shift(inode);
7169 +}
7170 +
7171 +static inline unsigned long
7172 +pg_to_clust_to_pg(unsigned long idx, struct inode * inode)
7173 +{
7174 +       return clust_to_pg(pg_to_clust(idx, inode), inode);
7175 +}
7176 +
7177 +static inline unsigned long
7178 +off_to_pg(loff_t off)
7179 +{
7180 +       return (off >> PAGE_CACHE_SHIFT);
7181 +}
7182 +
7183 +static inline loff_t
7184 +pg_to_off(unsigned long idx)
7185 +{
7186 +       return ((loff_t)(idx) << PAGE_CACHE_SHIFT);
7187 +}
7188 +
7189 +static inline unsigned long
7190 +off_to_clust(loff_t off, struct inode * inode)
7191 +{
7192 +       return pg_to_clust(off_to_pg(off), inode);
7193 +}
7194 +
7195 +static inline loff_t
7196 +clust_to_off(unsigned long idx, struct inode * inode)
7197 +{
7198 +       return pg_to_off(clust_to_pg(idx, inode));
7199 +}
7200 +
7201 +static inline loff_t
7202 +off_to_clust_to_off(loff_t off, struct inode * inode)
7203 +{
7204 +       return clust_to_off(off_to_clust(off, inode), inode);
7205 +}
7206 +
7207 +static inline unsigned long
7208 +off_to_clust_to_pg(loff_t off, struct inode * inode)
7209 +{
7210 +       return clust_to_pg(off_to_clust(off, inode), inode);
7211 +}
7212 +
7213 +static inline unsigned
7214 +off_to_pgoff(loff_t off)
7215 +{
7216 +       return off & (PAGE_CACHE_SIZE - 1);
7217 +}
7218 +
7219 +static inline unsigned
7220 +off_to_cloff(loff_t off, struct inode * inode)
7221 +{
7222 +       return off & ((loff_t)(inode_cluster_size(inode)) - 1);
7223 +}
7224 +
7225 +static inline unsigned
7226 +pg_to_off_to_cloff(unsigned long idx, struct inode * inode)
7227 +{
7228 +       return off_to_cloff(pg_to_off(idx), inode);
7229 +}
7230 +
7231 +/* if @size != 0, returns index of the page
7232 +   which contains the last byte of the file; returns 0 if @size == 0 */
7233 +static inline pgoff_t
7234 +size_to_pg(loff_t size)
7235 +{
7236 +       return (size ? off_to_pg(size - 1) : 0);
7237 +}
7238 +
7239 +/* minimal index of the page which doesn't contain
7240 +   file data */
7241 +static inline pgoff_t
7242 +size_to_next_pg(loff_t size)
7243 +{
7244 +       return (size ? off_to_pg(size - 1) + 1 : 0);
7245 +}
7246 +
7247 +static inline unsigned
7248 +off_to_pgcount(loff_t off, unsigned long idx)
7249 +{
7250 +       if (idx > off_to_pg(off))
7251 +               return 0;
7252 +       if (idx < off_to_pg(off))
7253 +               return PAGE_CACHE_SIZE;
7254 +       return off_to_pgoff(off);
7255 +}
7256 +
7257 +static inline unsigned
7258 +off_to_count(loff_t off, unsigned long idx, struct inode * inode)
7259 +{
7260 +       if (idx > off_to_clust(off, inode))
7261 +               return 0;
7262 +       if (idx < off_to_clust(off, inode))
7263 +               return inode_cluster_size(inode);
7264 +       return off_to_cloff(off, inode);
7265 +}
7266 +
7267 +static inline unsigned
7268 +fsize_to_count(reiser4_cluster_t * clust, struct inode * inode)
7269 +{
7270 +       assert("edward-288", clust != NULL);
7271 +       assert("edward-289", inode != NULL);
7272 +
7273 +       return off_to_count(inode->i_size, clust->index, inode);
7274 +}
7275 +
7276 +static inline int
7277 +alloc_clust_pages(reiser4_cluster_t * clust, struct inode * inode )
7278 +{
7279 +       assert("edward-791", clust != NULL);
7280 +       assert("edward-792", inode != NULL);
7281 +       clust->pages = reiser4_kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode), GFP_KERNEL);
7282 +       if (!clust->pages)
7283 +               return -ENOMEM;
7284 +       return 0;
7285 +}
7286 +
7287 +static inline void
7288 +free_clust_pages(reiser4_cluster_t * clust)
7289 +{
7290 +       reiser4_kfree(clust->pages);
7291 +}
7292 +
7293 +#endif /* __FS_REISER4_CLUSTER_H__ */
7294 +
7295 +
7296 +/* Make Linus happy.
7297 +   Local variables:
7298 +   c-indentation-style: "K&R"
7299 +   mode-name: "LC"
7300 +   c-basic-offset: 8
7301 +   tab-width: 8
7302 +   fill-column: 120
7303 +   scroll-step: 1
7304 +   End:
7305 +*/
7306 diff -rupN linux-2.6.8-rc3/fs/reiser4/context.c linux-2.6.8-rc3-a/fs/reiser4/context.c
7307 --- linux-2.6.8-rc3/fs/reiser4/context.c        1970-01-01 03:00:00.000000000 +0300
7308 +++ linux-2.6.8-rc3-a/fs/reiser4/context.c      2004-08-05 21:20:53.479576920 +0400
7309 @@ -0,0 +1,373 @@
7310 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7311 +
7312 +/* Manipulation of reiser4_context */
7313 +
7314 +/*
7315 + * global context used during system call. Variable of this type is allocated
7316 + * on the stack at the beginning of the reiser4 part of the system call and
7317 + * pointer to it is stored in the current->fs_context. This allows us to avoid
7318 + * passing pointer to current transaction and current lockstack (both in
7319 + * one-to-one mapping with threads) all over the call chain.
7320 + *
7321 + * It's kind of like those global variables the prof used to tell you not to
7322 + * use in CS1, except thread specific.;-) Nikita, this was a good idea.
7323 + *
7324 + * In some situations it is desirable to have ability to enter reiser4_context
7325 + * more than once for the same thread (nested contexts). For example, there
7326 + * are some functions that can be called either directly from VFS/VM or from
7327 + * already active reiser4 context (->writepage, for example).
7328 + *
7329 + * In such situations "child" context acts like dummy: all activity is
7330 + * actually performed in the top level context, and get_current_context()
7331 + * always returns top level context. Of course, init_context()/done_context()
7332 + * have to be properly nested any way.
7333 + *
7334 + * Note that there is an important difference between the way reiser4 uses
7335 + * ->fs_context and the way other file systems use it. Other file systems
7336 + * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
7337 + * (this is why ->fs_context was initially called ->journal_info). This means,
7338 + * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
7339 + * to the file system, they assume that some transaction is already underway,
7340 + * and usually bail out, because starting nested transaction would most likely
7341 + * lead to the deadlock. This gives false positives with reiser4, because we
7342 + * set ->fs_context before starting transaction.
7343 + */
7344 +
7345 +#include "debug.h"
7346 +#include "super.h"
7347 +#include "context.h"
7348 +
7349 +#include <linux/writeback.h> /* balance_dirty_pages() */
7350 +
7351 +#if REISER4_DEBUG_CONTEXTS
7352 +/* List of all currently active contexts, used for debugging purposes.  */
7353 +context_list_head active_contexts;
7354 +/* lock protecting access to active_contexts. */
7355 +spinlock_t active_contexts_lock;
7356 +/* debugging: under active_contexts_lock, assert that each context visited on active_contexts has ->magic == context_magic */
7357 +void
7358 +check_contexts(void)
7359 +{
7360 +       reiser4_context *ctx;
7361 +
7362 +       spin_lock(&active_contexts_lock);
7363 +       for_all_type_safe_list(context, &active_contexts, ctx) {
7364 +               assert("vs-$BIGNUM", ctx->magic == context_magic);
7365 +       }
7366 +       spin_unlock(&active_contexts_lock);
7367 +}
7368 +/* REISER4_DEBUG_CONTEXTS */
7369 +#endif
7370 +/* single global snapshot overwritten by each get_context_ok() call: task, context and recorded return addresses */
7371 +struct {
7372 +       void *task;
7373 +       void *context;
7374 +       void *path[16];
7375 +} context_ok;
7376 +
7377 +
7378 +/* debugging: store current task, @ctx and up to 16 return addresses of the call chain in context_ok; stops at the first frame whose address compares greater than @ctx */
7379 +reiser4_internal void get_context_ok(reiser4_context *ctx)
7380 +{
7381 +       int i;
7382 +       void *addr = NULL, *frame = NULL;
7383 +
7384 +#define CTX_FRAME(nr)                                          \
7385 +       case (nr):                                              \
7386 +               addr  = __builtin_return_address((nr));         \
7387 +                frame = __builtin_frame_address(nr);           \
7388 +               break
7389 +/* NOTE(review): GCC documents nonzero levels of these builtins as potentially unreliable -- confirm this is used as a debugging aid only */
7390 +       memset(&context_ok, 0, sizeof(context_ok));
7391 +
7392 +       context_ok.task = current;
7393 +       context_ok.context = ctx;
7394 +       for (i = 0; i < 16; i ++) {
7395 +               switch(i) {
7396 +                       CTX_FRAME(0);
7397 +                       CTX_FRAME(1);
7398 +                       CTX_FRAME(2);
7399 +                       CTX_FRAME(3);
7400 +                       CTX_FRAME(4);
7401 +                       CTX_FRAME(5);
7402 +                       CTX_FRAME(6);
7403 +                       CTX_FRAME(7);
7404 +                       CTX_FRAME(8);
7405 +                       CTX_FRAME(9);
7406 +                       CTX_FRAME(10);
7407 +                       CTX_FRAME(11);
7408 +                       CTX_FRAME(12);
7409 +                       CTX_FRAME(13);
7410 +                       CTX_FRAME(14);
7411 +                       CTX_FRAME(15);
7412 +               default:
7413 +                       impossible("", "");
7414 +               }
7415 +               if (frame > (void *)ctx)
7416 +                       break;
7417 +               context_ok.path[i] = addr;
7418 +       }
7419 +#undef CTX_FRAME
7420 +}
7421 +
7422 +
7423 +/* initialise context and bind it to the current thread
7424 +   (@context is zeroed first; the function always returns 0).
7425 +   This function should be called at the beginning of reiser4 part of
7426 +   syscall.
7427 +*/
7428 +reiser4_internal int
7429 +init_context(reiser4_context * context /* pointer to the reiser4 context
7430 +                                        * being initialised */ ,
7431 +            struct super_block *super  /* super block we are going to
7432 +                                        * work with */)
7433 +{
7434 +       assert("nikita-2662", !in_interrupt() && !in_irq());
7435 +       assert("nikita-3356", context != NULL);
7436 +       assert("nikita-3357", super != NULL);
7437 +       assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
7438 +
7439 +       xmemset(context, 0, sizeof *context);
7440 +       /* nested entry on the same super block: @context only records its parent, nothing else is set up */
7441 +       if (is_in_reiser4_context()) {
7442 +               reiser4_context *parent;
7443 +
7444 +               parent = (reiser4_context *) current->journal_info;
7445 +               /* NOTE-NIKITA this is dubious */
7446 +               if (parent->super == super) {
7447 +                       context->parent = parent;
7448 +#if (REISER4_DEBUG)
7449 +                       ++context->parent->nr_children;
7450 +#endif
7451 +                       return 0;
7452 +               }
7453 +       }
7454 +       /* top-level entry: remember previous ->journal_info in ->outer and install @context there */
7455 +       context->super = super;
7456 +       context->magic = context_magic;
7457 +       context->outer = current->journal_info;
7458 +       current->journal_info = (void *) context;
7459 +
7460 +       init_lock_stack(&context->stack);
7461 +
7462 +       txn_begin(context);
7463 +       /* a top-level context is its own parent */
7464 +       context->parent = context;
7465 +       tap_list_init(&context->taps);
7466 +#if REISER4_DEBUG
7467 +#if REISER4_DEBUG_CONTEXTS
7468 +       context_list_clean(context);    /* to satisfy assertion */
7469 +       spin_lock(&active_contexts_lock);
7470 +       context_list_check(&active_contexts);
7471 +       context_list_push_front(&active_contexts, context);
7472 +       /*check_contexts();*/
7473 +       spin_unlock(&active_contexts_lock);
7474 +#endif
7475 +       context->task = current;
7476 +#endif
7477 +       grab_space_enable();
7478 +       return 0;
7479 +}
7480 +
7481 +/* return the reiser4_context that embeds lock stack @owner as its ->stack field (container_of cast) */
7482 +reiser4_internal reiser4_context *
7483 +get_context_by_lock_stack(lock_stack * owner)
7484 +{
7485 +       return container_of(owner, reiser4_context, stack);
7486 +}
7487 +
7488 +/* true if there is already _any_ reiser4 context for the current thread, that is, current->journal_info is not NULL and its ->magic equals context_magic */
7489 +reiser4_internal int
7490 +is_in_reiser4_context(void)
7491 +{
7492 +       reiser4_context *ctx;
7493 +
7494 +       ctx = current->journal_info;
7495 +       return
7496 +               ctx != NULL &&
7497 +               ((unsigned long) ctx->magic) == context_magic;
7498 +}
7499 +
7500 +/*
7501 + * call balance_dirty_pages_ratelimited() on behalf of @context, if appropriate.
7502 + *
7503 + * File system is expected to call balance_dirty_pages_ratelimited() whenever
7504 + * it dirties a page. reiser4 does this for unformatted nodes (that is, during
7505 + * write---this covers vast majority of all dirty traffic), but we cannot do
7506 + * this immediately when formatted node is dirtied, because long term lock is
7507 + * usually held at that time. To work around this, dirtying of formatted node
7508 + * simply increases ->nr_marked_dirty counter in the current reiser4
7509 + * context. When we are about to leave this context,
7510 + * balance_dirty_pages_ratelimited() is called, if necessary.
7511 + *
7512 + * This introduces another problem: sometimes we do not want to run
7513 + * balance_dirty_pages_ratelimited() when leaving a context, for example
7514 + * because some important lock (like ->i_sem on the parent directory) is
7515 + * held. To achieve this, ->nobalance flag can be set in the current context
7516 + * (the flag is tested by the caller, reiser4_exit_context(), not here). */
7517 +static void
7518 +balance_dirty_pages_at(reiser4_context * context)
7519 +{
7520 +       reiser4_super_info_data * sbinfo = get_super_private(context->super);
7521 +
7522 +       /*
7523 +        * call balance_dirty_pages_ratelimited() to process formatted nodes
7524 +        * dirtied during this system call.
7525 +        */
7526 +       if (context->nr_marked_dirty != 0 &&   /* were any nodes dirtied? */
7527 +           /* aren't we called early during mount? (->fake not set yet) */
7528 +           sbinfo->fake &&
7529 +           /* don't call balance dirty pages from ->writepage(): it's
7530 +            * deadlock prone */
7531 +           !(current->flags & PF_MEMALLOC) &&
7532 +           /* and don't stall pdflush */
7533 +           !current_is_pdflush())
7534 +               balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
7535 +}
7536 +
7537 +/*
7538 + * exit reiser4 context. For a top-level context only: call balance_dirty_pages_at()
7539 + * (unless ->nobalance is set) and txn_end(). Then call done_context() for book-keeping.
7540 + */
7541 +reiser4_internal void reiser4_exit_context(reiser4_context * context)
7542 +{
7543 +       assert("nikita-3021", schedulable());
7544 +       /* context == context->parent identifies a top-level (non-nested) context */
7545 +       if (context == context->parent) {
7546 +               if (!context->nobalance)
7547 +                       balance_dirty_pages_at(context);
7548 +               txn_end(context);
7549 +       }
7550 +       done_context(context);
7551 +}
7552 +
7553 +/* release resources associated with context.
7554 +
7555 +   This function should be called at the end of "session" with reiser4,
7556 +   typically just before leaving reiser4 driver back to VFS.
7557 +
7558 +   This is good place to put some debugging consistency checks, like that
7559 +   thread released all locks and closed transcrash etc.
7560 +
7561 +*/
7562 +reiser4_internal void
7563 +done_context(reiser4_context * context /* context being released */)
7564 +{
7565 +       reiser4_context *parent;
7566 +       assert("nikita-860", context != NULL);
7567 +
7568 +       parent = context->parent;
7569 +       assert("nikita-2174", parent != NULL);
7570 +       assert("nikita-2093", parent == parent->parent);
7571 +       assert("nikita-859", parent->magic == context_magic);
7572 +       assert("vs-646", (reiser4_context *) current->journal_info == parent);
7573 +       assert("zam-686", !in_interrupt() && !in_irq());
7574 +
7575 +       /* only do anything when leaving top-level reiser4 context. All nested
7576 +        * contexts are just dummies. */
7577 +       if (parent == context) {
7578 +               assert("jmacd-673", parent->trans == NULL);
7579 +               assert("jmacd-1002", lock_stack_isclean(&parent->stack));
7580 +               assert("nikita-1936", no_counters_are_held());
7581 +               assert("nikita-3403", !delayed_inode_updates(context->dirty));
7582 +               assert("nikita-2626", tap_list_empty(taps_list()));
7583 +               assert("zam-1004", get_super_private(context->super)->delete_sema_owner != current);
7584 +
7585 +               /* release all grabbed but as yet unused blocks */
7586 +               if (context->grabbed_blocks != 0)
7587 +                       all_grabbed2free();
7588 +
7589 +               /*
7590 +                * synchronize against longterm_unlock_znode():
7591 +                * wake_up_requestor() wakes up requestors without holding
7592 +                * zlock (otherwise they will immediately bump into that lock
7593 +                * after wake up on another CPU). To work around (rare)
7594 +                * situation where requestor has been woken up asynchronously
7595 +                * and managed to run until completion (and destroy its
7596 +                * context and lock stack) before wake_up_requestor() called
7597 +                * wake_up() on it, wake_up_requestor() synchronize on lock
7598 +                * stack spin lock. It has actually been observed that spin
7599 +                * lock _was_ locked at this point, because
7600 +                * wake_up_requestor() took interrupt.
7601 +                */
7602 +               spin_lock_stack(&context->stack);
7603 +               spin_unlock_stack(&context->stack);
7604 +
7605 +#if REISER4_DEBUG_CONTEXTS
7606 +               /* remove from active contexts */
7607 +               spin_lock(&active_contexts_lock);
7608 +               /*check_contexts();*/
7609 +               context_list_remove(parent);
7610 +               spin_unlock(&active_contexts_lock);
7611 +#endif
7612 +               assert("zam-684", context->nr_children == 0);
7613 +               /* restore original ->fs_context (current->journal_info) value saved in ->outer */
7614 +               current->journal_info = context->outer;
7615 +       } else {
7616 +#if REISER4_DEBUG
7617 +               parent->nr_children--;
7618 +               assert("zam-685", parent->nr_children >= 0);
7619 +#endif
7620 +       }
7621 +}
7622 +
7623 +/* Initialize the list of all active contexts and its spin lock (only if REISER4_DEBUG_CONTEXTS); always returns 0 */
7624 +reiser4_internal int
7625 +init_context_mgr(void)
7626 +{
7627 +#if REISER4_DEBUG_CONTEXTS
7628 +       spin_lock_init(&active_contexts_lock);
7629 +       context_list_init(&active_contexts);
7630 +#endif
7631 +       return 0;
7632 +}
7633 +
7634 +#if REISER4_DEBUG_OUTPUT
7635 +/* debugging function: output reiser4 context contents in the human readable
7636 + * form, each line introduced by @prefix or a fixed label; @context may be NULL */
7637 +reiser4_internal void
7638 +print_context(const char *prefix, reiser4_context * context)
7639 +{
7640 +       if (context == NULL) {
7641 +               printk("%s: null context\n", prefix);
7642 +               return;
7643 +       }
7644 +#if REISER4_TRACE
7645 +       printk("%s: trace_flags: %x\n", prefix, context->trace_flags);
7646 +#endif
7647 +       print_lock_counters("\tlocks", &context->locks); /* NOTE(review): ->locks exists only under REISER4_DEBUG -- confirm this builds with REISER4_DEBUG_OUTPUT alone */
7648 +#if REISER4_DEBUG
7649 +       printk("pid: %i, comm: %s\n", context->task->pid, context->task->comm);
7650 +#endif
7651 +       print_lock_stack("\tlock stack", &context->stack);
7652 +       info_atom("\tatom", context->trans_in_ctx.atom);
7653 +}
7654 +
7655 +#if REISER4_DEBUG_CONTEXTS
7656 +/* debugging: call print_context() for each context on active_contexts while holding active_contexts_lock */
7657 +void
7658 +print_contexts(void)
7659 +{
7660 +       reiser4_context *context;
7661 +
7662 +       spin_lock(&active_contexts_lock);
7663 +
7664 +       for_all_type_safe_list(context, &active_contexts, context) {
7665 +               print_context("context", context);
7666 +       }
7667 +
7668 +       spin_unlock(&active_contexts_lock);
7669 +}
7670 +#endif
7671 +#endif
7672 +
7673 +/* Make Linus happy.
7674 +   Local variables:
7675 +   c-indentation-style: "K&R"
7676 +   mode-name: "LC"
7677 +   c-basic-offset: 8
7678 +   tab-width: 8
7679 +   fill-column: 120
7680 +   scroll-step: 1
7681 +   End:
7682 +*/
7683 diff -rupN linux-2.6.8-rc3/fs/reiser4/context.h linux-2.6.8-rc3-a/fs/reiser4/context.h
7684 --- linux-2.6.8-rc3/fs/reiser4/context.h        1970-01-01 03:00:00.000000000 +0300
7685 +++ linux-2.6.8-rc3-a/fs/reiser4/context.h      2004-08-05 21:20:53.199635966 +0400
7686 @@ -0,0 +1,315 @@
7687 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
7688 + * reiser4/README */
7689 +
7690 +/* Reiser4 context. See context.c for details. */
7691 +
7692 +#if !defined( __REISER4_CONTEXT_H__ )
7693 +#define __REISER4_CONTEXT_H__
7694 +
7695 +#include "forward.h"
7696 +#include "debug.h"
7697 +#include "spin_macros.h"
7698 +#include "dformat.h"
7699 +#include "type_safe_list.h"
7700 +#include "tap.h"
7701 +#include "lock.h"
7702 +
7703 +#include <linux/types.h>       /* for __u??  */
7704 +#include <linux/fs.h>          /* for struct super_block  */
7705 +#include <linux/spinlock.h>
7706 +#include <linux/sched.h>       /* for struct task_struct */
7707 +
7708 +/* list of all active contexts (debugging only) */
7709 +#if REISER4_DEBUG_CONTEXTS
7710 +TYPE_SAFE_LIST_DECLARE(context);
7711 +#endif
7712 +
7713 +ON_DEBUG(TYPE_SAFE_LIST_DECLARE(flushers);)
7714 +
7715 +#if REISER4_DEBUG
7716 +
7717 +/*
7718 + * Stat-data update tracking.
7719 + *
7720 + * Some reiser4 functions (reiser4_{del,add}_nlink()) take an additional
7721 + * parameter indicating whether stat-data update should be performed. This is
7722 + * because sometimes fields of the same inode are modified several times
7723 + * during a single system call, and updating stat-data (which implies tree lookup and,
7724 + * sometimes, tree balancing) on each inode modification is too expensive. To
7725 + * avoid unnecessary stat-data updates, we pass flag to not update it during
7726 + * inode field updates, and update it manually at the end of the system call.
7727 + *
7728 + * This introduces a possibility of "missed stat data update" when final
7729 + * stat-data update is not performed in some code path. To detect and track
7730 + * down such situations following code was developed.
7731 + *
7732 + * dirty_inode_info is an array of slots. Each slot keeps information about
7733 + * "delayed stat data update", that is about a call to a function modifying
7734 + * inode field that was instructed to not update stat data. Direct call to
7735 + * reiser4_update_sd() clears corresponding slot. On leaving reiser4 context
7736 + * all slots are scanned and information about still not forced updates is
7737 + * printed.
7738 + */
7739 +
7740 +/* how many delayed stat data update slots to remember (0: the slot array below is empty) */
7741 +#define TRACKED_DELAYED_UPDATE (0)
7742 +/* array of TRACKED_DELAYED_UPDATE slots, one per remembered delayed update */
7743 +typedef struct {
7744 +       ino_t ino;      /* inode number of object with delayed stat data
7745 +                        * update */
7746 +       int   delayed;  /* 1 if update is delayed, 0 if update was forced */
7747 +       void *stack[4]; /* stack back-trace of the call chain where update was
7748 +                        * delayed */
7749 +} dirty_inode_info[TRACKED_DELAYED_UPDATE];
7750 +
7751 +extern void mark_inode_update(struct inode *object, int immediate);
7752 +extern int  delayed_inode_updates(dirty_inode_info info);
7753 +
7754 +#else
7755 +/* !REISER4_DEBUG: tracking is compiled out, type is an empty struct and both calls expand to noop */
7756 +typedef struct {} dirty_inode_info;
7757 +
7758 +#define mark_inode_update(object, immediate) noop
7759 +#define delayed_inode_updates(info) noop
7760 +
7761 +#endif
7762 +
7763 +/* reiser4 per-thread context; pointer to the top-level one is kept in current->journal_info */
7764 +struct reiser4_context {
7765 +       /* magic constant. For identification of reiser4 contexts. */
7766 +       __u32 magic;
7767 +
7768 +       /* current lock stack. See lock.[ch]. This is where list of all
7769 +          locks taken by current thread is kept. This is also used in
7770 +          deadlock detection. */
7771 +       lock_stack stack;
7772 +
7773 +       /* current transcrash. */
7774 +       txn_handle *trans;
7775 +       /* transaction handle embedded into reiser4_context. ->trans points
7776 +        * here by default. */
7777 +       txn_handle trans_in_ctx;
7778 +
7779 +       /* super block we are working with.  To get the current tree
7780 +          use &get_super_private (reiser4_get_current_sb ())->tree. */
7781 +       struct super_block *super;
7782 +
7783 +       /* parent fs activation: value current->journal_info had before this context was installed */
7784 +       struct fs_activation *outer;
7785 +
7786 +       /* per-thread grabbed (for further allocation) blocks counter */
7787 +       reiser4_block_nr grabbed_blocks;
7788 +
7789 +       /* parent context; a top-level context points to itself */
7790 +       reiser4_context *parent;
7791 +
7792 +       /* list of taps currently monitored. See tap.c */
7793 +       tap_list_head taps;
7794 +
7795 +       /* grabbing space is enabled */
7796 +       int grab_enabled  :1;
7797 +       /* should be set when we are writing dirty nodes to disk in jnode_flush or
7798 +        * reiser4_write_logs() */
7799 +       int writeout_mode :1;
7800 +       /* true, if current thread is an ent thread */
7801 +       int entd          :1;
7802 +       /* true, if balance_dirty_pages() should not be run when leaving this
7803 +        * context. This is used to avoid lengthy balance_dirty_pages()
7804 +        * operation when holding some important resource, like directory
7805 +        * ->i_sem */
7806 +       int nobalance     :1;
7807 +       /* NOTE(review): flags above are 1-bit plain int fields; if the compiler treats them as signed a set flag reads back as -1, so test for non-zero only */
7808 +       /* count non-trivial jnode_set_dirty() calls */
7809 +       unsigned long nr_marked_dirty;
7810 +#if REISER4_DEBUG
7811 +       /* A link of all active contexts. NOTE(review): context_list_link is declared under REISER4_DEBUG_CONTEXTS, this field under REISER4_DEBUG -- confirm they are always enabled together */
7812 +       context_list_link contexts_link;
7813 +       /* debugging information about reiser4 locks held by the current
7814 +        * thread */
7815 +       lock_counters_info locks;
7816 +       int nr_children;        /* number of child contexts */
7817 +       struct task_struct *task; /* so we can easily find owner of the stack */
7818 +
7819 +       /*
7820 +        * disk space grabbing debugging support
7821 +        */
7822 +       /* how many disk blocks were grabbed by the first call to
7823 +        * reiser4_grab_space() in this context */
7824 +       reiser4_block_nr grabbed_initially;
7825 +       /* stack back-trace of the first call to reiser4_grab_space() in this
7826 +        * context */
7827 +       backtrace_path   grabbed_at;
7828 +
7829 +       /* list of all threads doing flush currently */
7830 +       flushers_list_link  flushers_link;
7831 +       /* information about last error encountered by reiser4 */
7832 +       err_site err;
7833 +       /* information about delayed stat data updates. See above. */
7834 +       dirty_inode_info dirty;
7835 +#endif
7836 +
7837 +#if REISER4_TRACE
7838 +       /* per-thread tracing flags. Use reiser4_trace_flags enum to set
7839 +          bits in it. */
7840 +       __u32 trace_flags;
7841 +#endif
7842 +#if REISER4_DEBUG_NODE
7843 +       /*
7844 +        * don't perform node consistency checks while this is greater than
7845 +        * zero. Used during operations that temporarily violate node
7846 +        * consistency.
7847 +        */
7848 +       int disable_node_check;
7849 +#endif
7850 +};
7851 +
7852 +#if REISER4_DEBUG_CONTEXTS
7853 +TYPE_SAFE_LIST_DEFINE(context, reiser4_context, contexts_link);
7854 +#endif
7855 +#if REISER4_DEBUG
7856 +TYPE_SAFE_LIST_DEFINE(flushers, reiser4_context, flushers_link);
7857 +#endif
7858 +
7859 +extern reiser4_context *get_context_by_lock_stack(lock_stack *);
7860 +
7861 +/* Debugging helps. */
7862 +extern int init_context_mgr(void);
7863 +#if REISER4_DEBUG_OUTPUT
7864 +extern void print_context(const char *prefix, reiser4_context * ctx);
7865 +#else
7866 +#define print_context(p,c) noop
7867 +#endif
7868 +
7869 +#if REISER4_DEBUG_CONTEXTS && REISER4_DEBUG_OUTPUT
7870 +extern void print_contexts(void);
7871 +#else
7872 +#define print_contexts() noop
7873 +#endif
7874 +
7875 +#if REISER4_DEBUG_CONTEXTS
7876 +extern void check_contexts(void);
7877 +#else
7878 +#define check_contexts() noop
7879 +#endif
7880 +
7881 +#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
7882 +#define current_blocksize reiser4_get_current_sb()->s_blocksize
7883 +#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
7884 +
7885 +extern int init_context(reiser4_context * context, struct super_block *super);
7886 +extern void done_context(reiser4_context * context);
7887 +
7888 +/* magic constant we store in reiser4_context allocated at the stack. Used to
7889 +   catch accesses to stale or uninitialized contexts. */
7890 +#define context_magic ((__u32) 0x4b1b5d0b)
7891 +
7892 +extern int is_in_reiser4_context(void);
7893 +
7894 +/* debugging: record the call chain of the current thread for @ctx, see context.c */
7895 +
7896 +void get_context_ok(reiser4_context *);
7897 +
7898 +/*
7899 + * return reiser4_context for the thread @tsk, i.e. @tsk->journal_info, which is asserted to carry context_magic
7900 + */
7901 +static inline reiser4_context *
7902 +get_context(const struct task_struct *tsk)
7903 +{
7904 +       assert("vs-1682", ((reiser4_context *) tsk->journal_info)->magic == context_magic);
7905 +       return (reiser4_context *) tsk->journal_info;
7906 +}
7907 +
7908 +/*
7909 + * return reiser4 context of the current thread, or NULL if is_in_reiser4_context() reports there is none.
7910 + */
7911 +static inline reiser4_context *
7912 +get_current_context_check(void)
7913 +{
7914 +       if (is_in_reiser4_context())
7915 +               return get_context(current);
7916 +       else
7917 +               return NULL;
7918 +}
7919 +
7920 +static inline reiser4_context * get_current_context(void);/* __attribute__((const));*/
7921 +
7922 +/* return context associated with current thread; unlike get_current_context_check() does not test is_in_reiser4_context() first */
7923 +static inline reiser4_context *
7924 +get_current_context(void)
7925 +{
7926 +       return get_context(current);
7927 +}
7928 +
7929 +/*
7930 + * non-zero if current thread is in the write-out mode (->writeout_mode of the
7931 + * current context). Thread enters write-out mode during jnode_flush and reiser4_write_logs().
7932 + */
7933 +static inline int is_writeout_mode(void)
7934 +{
7935 +       return get_current_context()->writeout_mode;
7936 +}
7937 +
7938 +/*
7939 + * enter write-out mode; the current context must not be in it already (asserted)
7940 + */
7941 +static inline void writeout_mode_enable(void)
7942 +{
7943 +       assert("zam-941", !get_current_context()->writeout_mode);
7944 +       get_current_context()->writeout_mode = 1;
7945 +}
7946 +
7947 +/*
7948 + * leave write-out mode; the current context must be in it (asserted)
7949 + */
7950 +static inline void writeout_mode_disable(void)
7951 +{
7952 +       assert("zam-942", get_current_context()->writeout_mode);
7953 +       get_current_context()->writeout_mode = 0;
7954 +}
7955 +/* set ->grab_enabled in the context of the current thread */
7956 +static inline void grab_space_enable(void)
7957 +{
7958 +       get_current_context()->grab_enabled = 1;
7959 +}
7960 +/* clear ->grab_enabled in the context of the current thread */
7961 +static inline void grab_space_disable(void)
7962 +{
7963 +       get_current_context()->grab_enabled = 0;
7964 +}
7965 +/* set ->grab_enabled in the context of the current thread: enabled iff @enabled is non-zero */
7966 +static inline void grab_space_set_enabled (int enabled)
7967 +{      /* normalize with !!: the field is 1 bit wide, a raw assignment would keep only the low bit (2 would read as "off") */
7968 +       get_current_context()->grab_enabled = !!enabled;
7969 +}
7970 +/* non-zero if ->grab_enabled is set in @ctx */
7971 +static inline int is_grab_enabled(reiser4_context *ctx)
7972 +{
7973 +       return ctx->grab_enabled;
7974 +}
7975 +
7976 +/* mark transaction handle of the top-level context of @context (its ->parent) as
7977 + * TXNH_DONT_COMMIT, so that no commit or flush would be performed when it is
7978 + * closed, and set ->nobalance there. This is necessary when handle has to be closed
7979 + * under some coarse semaphore, like i_sem of directory. Commit will be performed by ktxnmgrd. */
7980 +static inline void context_set_commit_async(reiser4_context * context)
7981 +{
7982 +       context = context->parent;
7983 +       context->nobalance = 1;
7984 +       context->trans->flags |= TXNH_DONT_COMMIT;
7985 +}
7987 +extern void reiser4_exit_context(reiser4_context * context);
7988 +
7989 +/* __REISER4_CONTEXT_H__ */
7990 +#endif
7991 +
7992 +/* Make Linus happy.
7993 +   Local variables:
7994 +   c-indentation-style: "K&R"
7995 +   mode-name: "LC"
7996 +   c-basic-offset: 8
7997 +   tab-width: 8
7998 +   fill-column: 120
7999 +   scroll-step: 1
8000 +   End:
8001 +*/
8002 diff -rupN linux-2.6.8-rc3/fs/reiser4/coord.c linux-2.6.8-rc3-a/fs/reiser4/coord.c
8003 --- linux-2.6.8-rc3/fs/reiser4/coord.c  1970-01-01 03:00:00.000000000 +0300
8004 +++ linux-2.6.8-rc3-a/fs/reiser4/coord.c        2004-08-05 21:20:53.301614457 +0400
8005 @@ -0,0 +1,1001 @@
8006 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8007 +
8008 +#include "forward.h"
8009 +#include "debug.h"
8010 +#include "dformat.h"
8011 +#include "tree.h"
8012 +#include "plugin/item/item.h"
8013 +#include "znode.h"
8014 +#include "coord.h"
8015 +
8016 +/* Internal constructor: store @node (const cast away), @item_pos, @unit_pos and @between in @coord; debugging builds also zero plug_v and body_v. */
8017 +static inline void
8018 +coord_init_values(coord_t *coord, const znode *node, pos_in_node_t item_pos,
8019 +                 pos_in_node_t unit_pos, between_enum between)
8020 +{
8021 +       coord->node = (znode *) node;
8022 +       coord_set_item_pos(coord, item_pos);
8023 +       coord->unit_pos = unit_pos;
8024 +       coord->between = between;
8025 +       ON_DEBUG(coord->plug_v = 0);
8026 +       ON_DEBUG(coord->body_v = 0);
8027 +
8028 +       /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */
8029 +}
8030 +
8031 +/* after shifting of node content, coord previously set properly may become
8032 +   invalid, try to "normalize" it: a BEFORE_* position past the end is turned into the matching AFTER_* one. */
8033 +reiser4_internal void
8034 +coord_normalize(coord_t *coord)
8035 +{
8036 +       znode *node;
8037 +
8038 +       node = coord->node;
8039 +       assert("vs-683", node);
8040 +
8041 +       coord_clear_iplug(coord);
8042 +       /* NOTE(review): the coord_num_units() test below runs before the last branch, yet it asserts coord_is_existing_item(), which is false when item_pos == coord_num_items() -- confirm the last branch is reachable */
8043 +       if (node_is_empty(node)) {
8044 +               coord_init_first_unit(coord, node);
8045 +       } else if ((coord->between == AFTER_ITEM) || (coord->between == AFTER_UNIT)) {
8046 +               return;
8047 +       } else if (coord->item_pos == coord_num_items(coord) && coord->between == BEFORE_ITEM) {
8048 +               coord_dec_item_pos(coord);
8049 +               coord->between = AFTER_ITEM;
8050 +       } else if (coord->unit_pos == coord_num_units(coord) && coord->between == BEFORE_UNIT) {
8051 +               coord->unit_pos--;
8052 +               coord->between = AFTER_UNIT;
8053 +       } else if (coord->item_pos == coord_num_items(coord) && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
8054 +               coord_dec_item_pos(coord);
8055 +               coord->unit_pos = 0;
8056 +               coord->between = AFTER_ITEM;
8057 +       }
8058 +}
8059 +
8060 +/* Copy coordinate @old_coord into @coord; @old_coord must pass coord_check() (asserted). */
8061 +reiser4_internal void
8062 +coord_dup(coord_t * coord, const coord_t * old_coord)
8063 +{
8064 +       assert("jmacd-9800", coord_check(old_coord));
8065 +       coord_dup_nocheck(coord, old_coord);
8066 +}
8067 +
8068 +/* Copy a coordinate field by field without coord_check(). Useful when old_coord->node
8069 +   is not loaded, as in cbk_tree_lookup -> connect_znode -> connect_one_side */
8070 +reiser4_internal void
8071 +coord_dup_nocheck(coord_t * coord, const coord_t * old_coord)
8072 +{
8073 +       coord->node = old_coord->node;
8074 +       coord_set_item_pos(coord, old_coord->item_pos);
8075 +       coord->unit_pos = old_coord->unit_pos;
8076 +       coord->between = old_coord->between;
8077 +       coord->iplugid = old_coord->iplugid;
8078 +       ON_DEBUG(coord->plug_v = old_coord->plug_v);
8079 +       ON_DEBUG(coord->body_v = old_coord->body_v);
8080 +}
8081 +
8082 +/* Initialize an invalid coordinate on @node: item and unit positions 0, ->between set to INVALID_COORD. */
8083 +reiser4_internal void
8084 +coord_init_invalid(coord_t * coord, const znode * node)
8085 +{
8086 +       coord_init_values(coord, node, 0, 0, INVALID_COORD);
8087 +}
8088 +/* Point @coord at unit 0 of item 0 (AT_UNIT) without the empty-node test and coord_check() assertion of coord_init_first_unit(). */
8089 +reiser4_internal void
8090 +coord_init_first_unit_nocheck(coord_t * coord, const znode * node)
8091 +{
8092 +       coord_init_values(coord, node, 0, 0, AT_UNIT);
8093 +}
8094 +
8095 +/* Initialize @coord to point at the first unit of the first item of @node.  If the node is
8096 +   empty, it is positioned at the EMPTY_NODE. The result must pass coord_check() (asserted). */
8097 +reiser4_internal void
8098 +coord_init_first_unit(coord_t * coord, const znode * node)
8099 +{
8100 +       int is_empty = node_is_empty(node);
8101 +
8102 +       coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
8103 +
8104 +       assert("jmacd-9801", coord_check(coord));
8105 +}
8106 +
8107 +/* Initialize @coord to point at the last unit of the last item of @node.  If the node is
8108 +   empty, it is positioned at the EMPTY_NODE (item and unit positions 0). */
8109 +reiser4_internal void
8110 +coord_init_last_unit(coord_t * coord, const znode * node)
8111 +{
8112 +       int is_empty = node_is_empty(node);
8113 +
8114 +       coord_init_values(coord, node, (is_empty ? 0 : node_num_items(node) - 1), 0, (is_empty ? EMPTY_NODE : AT_UNIT));
8115 +       if (!is_empty)
8116 +               coord->unit_pos = coord_last_unit_pos(coord);
8117 +       assert("jmacd-9802", coord_check(coord));
8118 +}
8119 +
8120 +/* Initialize a coordinate to before the first item (stored as BEFORE_UNIT at item 0,
8121 +   unit 0, not BEFORE_ITEM).  If the node is empty, it is positioned at the EMPTY_NODE. */
8122 +reiser4_internal void
8123 +coord_init_before_first_item(coord_t * coord, const znode * node)
8124 +{
8125 +       int is_empty = node_is_empty(node);
8126 +
8127 +       coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : BEFORE_UNIT));
8128 +
8129 +       assert("jmacd-9803", coord_check(coord));
8130 +}
8131 +
8132 +/* Initialize a coordinate to after the last item (AFTER_ITEM at the last item, unit 0).
8133 +   If the node is empty, it is positioned at the EMPTY_NODE. */
8134 +reiser4_internal void
8135 +coord_init_after_last_item(coord_t * coord, const znode * node)
8136 +{
8137 +       int is_empty = node_is_empty(node);
8138 +
8139 +       coord_init_values(coord, node,
8140 +                         (is_empty ? 0 : node_num_items(node) - 1), 0, (is_empty ? EMPTY_NODE : AFTER_ITEM));
8141 +
8142 +       assert("jmacd-9804", coord_check(coord));
8143 +}
8144 +
8145 +/* Initialize a coordinate to after last unit in the item (AFTER_UNIT at
8146 +   coord_last_unit_pos()). Coord must be set already to existing item; node and item position are kept */
8147 +reiser4_internal void
8148 +coord_init_after_item_end(coord_t * coord)
8149 +{
8150 +       coord->between = AFTER_UNIT;
8151 +       coord->unit_pos = coord_last_unit_pos(coord);
8152 +}
8153 +
8154 +/* Initialize a coordinate to before the item (unit 0, BEFORE_ITEM); node and item position are kept. Coord must be set already to existing item */
8155 +reiser4_internal void
8156 +coord_init_before_item(coord_t * coord)
8157 +{
8158 +       coord->unit_pos = 0;
8159 +       coord->between = BEFORE_ITEM;
8160 +}
8161 +
8162 +/* Initialize a coordinate to after the item (unit 0, AFTER_ITEM); node and item position are kept. Coord must be set already to existing item */
8163 +reiser4_internal void
8164 +coord_init_after_item(coord_t * coord)
8165 +{
8166 +       coord->unit_pos = 0;
8167 +       coord->between = AFTER_ITEM;
8168 +}
8169 +
8170 +/* Fill the whole @coord structure with zero bytes. Used in places where init_coord
8171 +   was used and it was not clear how actually */
8172 +reiser4_internal void
8173 +coord_init_zero(coord_t * coord)
8174 +{
8175 +       xmemset(coord, 0, sizeof (*coord));
8176 +}
8177 +
8178 +/* Return the number of units at the present item, as reported by the item plugin's ->b.nr_units().  Asserts coord_is_existing_item(). */
8179 +reiser4_internal unsigned
8180 +coord_num_units(const coord_t * coord)
8181 +{
8182 +       assert("jmacd-9806", coord_is_existing_item(coord));
8183 +
8184 +       return item_plugin_by_coord(coord)->b.nr_units(coord);
8185 +}
8186 +
8187 +/* Returns true if the coord was initializewd by coord_init_invalid (). */
8188 +/* Audited by: green(2002.06.15) */
8189 +reiser4_internal int
8190 +coord_is_invalid(const coord_t * coord)
8191 +{
8192 +       return coord->between == INVALID_COORD;
8193 +}
8194 +
8195 +/* Returns true if the coordinate is positioned at an existing item, not before or after
8196 +   an item.  It may be placed at, before, or after any unit within the item, whether
8197 +   existing or not. */
8198 +reiser4_internal int
8199 +coord_is_existing_item(const coord_t * coord)
8200 +{
8201 +       switch (coord->between) {
8202 +       case EMPTY_NODE:
8203 +       case BEFORE_ITEM:
8204 +       case AFTER_ITEM:
8205 +       case INVALID_COORD:
8206 +               return 0;
8207 +
8208 +       case BEFORE_UNIT:
8209 +       case AT_UNIT:
8210 +       case AFTER_UNIT:
8211 +               return coord->item_pos < coord_num_items(coord);
8212 +       }
8213 +
8214 +       IF_TRACE(TRACE_COORDS, print_coord("unreachable", coord, 0));
8215 +       impossible("jmacd-9900", "unreachable coord: %p", coord);
8216 +       return 0;
8217 +}
8218 +
8219 +/* Returns true if the coordinate is positioned at an existing unit, not before or after a
8220 +   unit. */
8221 +/* Audited by: green(2002.06.15) */
8222 +reiser4_internal int
8223 +coord_is_existing_unit(const coord_t * coord)
8224 +{
8225 +       switch (coord->between) {
8226 +       case EMPTY_NODE:
8227 +       case BEFORE_UNIT:
8228 +       case AFTER_UNIT:
8229 +       case BEFORE_ITEM:
8230 +       case AFTER_ITEM:
8231 +       case INVALID_COORD:
8232 +               return 0;
8233 +
8234 +       case AT_UNIT:
8235 +               return (coord->item_pos < coord_num_items(coord) && coord->unit_pos < coord_num_units(coord));
8236 +       }
8237 +
8238 +       impossible("jmacd-9902", "unreachable");
8239 +       return 0;
8240 +}
8241 +
8242 +/* Returns true if the coordinate is positioned at the first unit of the first item.  Not
8243 +   true for empty nodes nor coordinates positioned before the first item. */
8244 +/* Audited by: green(2002.06.15) */
8245 +reiser4_internal int
8246 +coord_is_leftmost_unit(const coord_t * coord)
8247 +{
8248 +       return (coord->between == AT_UNIT && coord->item_pos == 0 && coord->unit_pos == 0);
8249 +}
8250 +
#if REISER4_DEBUG
/* Consistency check used by assertions only.  Returns 1 when @coord looks
   valid and 0 otherwise:
   - a coord without a node is invalid;
   - a coord whose node is above the root is accepted without further checks;
   - EMPTY_NODE requires an empty node and item_pos == unit_pos == 0;
   - BEFORE_ITEM/AFTER_ITEM require unit_pos == 0 and an in-range item_pos;
   - unit-relative positions require an in-range item_pos and, when the item
     plugin is cached in the coord, an in-range unit_pos.  BEFORE_UNIT and
     AFTER_UNIT are additionally accepted at (0, 0) of an empty node. */
int
coord_check(const coord_t * coord)
{
	if (coord->node == NULL)
		return 0;

	if (znode_above_root(coord->node))
		return 1;

	switch (coord->between) {
	default:
	case INVALID_COORD:
		return 0;

	case EMPTY_NODE:
		return node_is_empty(coord->node) &&
			coord->item_pos == 0 && coord->unit_pos == 0;

	case AFTER_ITEM:
	case BEFORE_ITEM:
		/* before/after item should not set unit_pos; there is no point
		   in looking at units for these positions, so only the item
		   index is range-checked */
		return coord->unit_pos == 0 &&
			coord->item_pos < node_num_items(coord->node);

	case BEFORE_UNIT:
	case AFTER_UNIT:
		if (node_is_empty(coord->node) && (coord->item_pos == 0) && (coord->unit_pos == 0))
			return 1;
		/* FALLTHROUGH */
	case AT_UNIT:
		break;
	}

	/* only unit-relative positions get here */
	if (coord->item_pos >= node_num_items(coord->node))
		return 0;

	if (coord_is_iplug_set(coord) &&
	    coord->unit_pos > item_plugin_by_coord(coord)->b.nr_units(coord) - 1)
		return 0;

	return 1;
}
#endif
8303 +
8304 +/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev.
8305 +   Returns 1 if the new position is does not exist. */
8306 +static int
8307 +coord_adjust_items(coord_t * coord, unsigned items, int is_next)
8308 +{
8309 +       /* If the node is invalid, leave it. */
8310 +       if (coord->between == INVALID_COORD) {
8311 +               return 1;
8312 +       }
8313 +
8314 +       /* If the node is empty, set it appropriately. */
8315 +       if (items == 0) {
8316 +               coord->between = EMPTY_NODE;
8317 +               coord_set_item_pos(coord, 0);
8318 +               coord->unit_pos = 0;
8319 +               return 1;
8320 +       }
8321 +
8322 +       /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
8323 +       if (coord->between == EMPTY_NODE) {
8324 +               coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
8325 +               coord_set_item_pos(coord, 0);
8326 +               coord->unit_pos = 0;
8327 +               return 0;
8328 +       }
8329 +
8330 +       /* If the item_pos is out-of-range, set it appropriatly. */
8331 +       if (coord->item_pos >= items) {
8332 +               coord->between = AFTER_ITEM;
8333 +               coord_set_item_pos(coord, items - 1);
8334 +               coord->unit_pos = 0;
8335 +               /* If is_next, return 1 (can't go any further). */
8336 +               return is_next;
8337 +       }
8338 +
8339 +       return 0;
8340 +}
8341 +
8342 +/* Advances the coordinate by one unit to the right.  If empty, no change.  If
8343 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is an
8344 +   existing unit. */
8345 +reiser4_internal int
8346 +coord_next_unit(coord_t * coord)
8347 +{
8348 +       unsigned items = coord_num_items(coord);
8349 +
8350 +       if (coord_adjust_items(coord, items, 1) == 1) {
8351 +               return 1;
8352 +       }
8353 +
8354 +       switch (coord->between) {
8355 +       case BEFORE_UNIT:
8356 +               /* Now it is positioned at the same unit. */
8357 +               coord->between = AT_UNIT;
8358 +               return 0;
8359 +
8360 +       case AFTER_UNIT:
8361 +       case AT_UNIT:
8362 +               /* If it was at or after a unit and there are more units in this item,
8363 +                  advance to the next one. */
8364 +               if (coord->unit_pos < coord_last_unit_pos(coord)) {
8365 +                       coord->unit_pos += 1;
8366 +                       coord->between = AT_UNIT;
8367 +                       return 0;
8368 +               }
8369 +
8370 +               /* Otherwise, it is crossing an item boundary and treated as if it was
8371 +                  after the current item. */
8372 +               coord->between = AFTER_ITEM;
8373 +               coord->unit_pos = 0;
8374 +               /* FALLTHROUGH */
8375 +
8376 +       case AFTER_ITEM:
8377 +               /* Check for end-of-node. */
8378 +               if (coord->item_pos == items - 1) {
8379 +                       return 1;
8380 +               }
8381 +
8382 +               coord_inc_item_pos(coord);
8383 +               coord->unit_pos = 0;
8384 +               coord->between = AT_UNIT;
8385 +               return 0;
8386 +
8387 +       case BEFORE_ITEM:
8388 +               /* The adjust_items checks ensure that we are valid here. */
8389 +               coord->unit_pos = 0;
8390 +               coord->between = AT_UNIT;
8391 +               return 0;
8392 +
8393 +       case INVALID_COORD:
8394 +       case EMPTY_NODE:
8395 +               /* Handled in coord_adjust_items(). */
8396 +               break;
8397 +       }
8398 +
8399 +       impossible("jmacd-9902", "unreachable");
8400 +       return 0;
8401 +}
8402 +
8403 +/* Advances the coordinate by one item to the right.  If empty, no change.  If
8404 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is
8405 +   an existing item. */
8406 +reiser4_internal int
8407 +coord_next_item(coord_t * coord)
8408 +{
8409 +       unsigned items = coord_num_items(coord);
8410 +
8411 +       if (coord_adjust_items(coord, items, 1) == 1) {
8412 +               return 1;
8413 +       }
8414 +
8415 +       switch (coord->between) {
8416 +       case AFTER_UNIT:
8417 +       case AT_UNIT:
8418 +       case BEFORE_UNIT:
8419 +       case AFTER_ITEM:
8420 +               /* Check for end-of-node. */
8421 +               if (coord->item_pos == items - 1) {
8422 +                       coord->between = AFTER_ITEM;
8423 +                       coord->unit_pos = 0;
8424 +                       coord_clear_iplug(coord);
8425 +                       return 1;
8426 +               }
8427 +
8428 +               /* Anywhere in an item, go to the next one. */
8429 +               coord->between = AT_UNIT;
8430 +               coord_inc_item_pos(coord);
8431 +               coord->unit_pos = 0;
8432 +               return 0;
8433 +
8434 +       case BEFORE_ITEM:
8435 +               /* The out-of-range check ensures that we are valid here. */
8436 +               coord->unit_pos = 0;
8437 +               coord->between = AT_UNIT;
8438 +               return 0;
8439 +       case INVALID_COORD:
8440 +       case EMPTY_NODE:
8441 +               /* Handled in coord_adjust_items(). */
8442 +               break;
8443 +       }
8444 +
8445 +       impossible("jmacd-9903", "unreachable");
8446 +       return 0;
8447 +}
8448 +
8449 +/* Advances the coordinate by one unit to the left.  If empty, no change.  If
8450 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
8451 +   is an existing unit. */
8452 +reiser4_internal int
8453 +coord_prev_unit(coord_t * coord)
8454 +{
8455 +       unsigned items = coord_num_items(coord);
8456 +
8457 +       if (coord_adjust_items(coord, items, 0) == 1) {
8458 +               return 1;
8459 +       }
8460 +
8461 +       switch (coord->between) {
8462 +       case AT_UNIT:
8463 +       case BEFORE_UNIT:
8464 +               if (coord->unit_pos > 0) {
8465 +                       coord->unit_pos -= 1;
8466 +                       coord->between = AT_UNIT;
8467 +                       return 0;
8468 +               }
8469 +
8470 +               if (coord->item_pos == 0) {
8471 +                       coord->between = BEFORE_ITEM;
8472 +                       return 1;
8473 +               }
8474 +
8475 +               coord_dec_item_pos(coord);
8476 +               coord->unit_pos = coord_last_unit_pos(coord);
8477 +               coord->between = AT_UNIT;
8478 +               return 0;
8479 +
8480 +       case AFTER_UNIT:
8481 +               /* What if unit_pos is out-of-range? */
8482 +               assert("jmacd-5442", coord->unit_pos <= coord_last_unit_pos(coord));
8483 +               coord->between = AT_UNIT;
8484 +               return 0;
8485 +
8486 +       case BEFORE_ITEM:
8487 +               if (coord->item_pos == 0) {
8488 +                       return 1;
8489 +               }
8490 +
8491 +               coord_dec_item_pos(coord);
8492 +               /* FALLTHROUGH */
8493 +
8494 +       case AFTER_ITEM:
8495 +               coord->between = AT_UNIT;
8496 +               coord->unit_pos = coord_last_unit_pos(coord);
8497 +               return 0;
8498 +
8499 +       case INVALID_COORD:
8500 +       case EMPTY_NODE:
8501 +               break;
8502 +       }
8503 +
8504 +       impossible("jmacd-9904", "unreachable");
8505 +       return 0;
8506 +}
8507 +
8508 +/* Advances the coordinate by one item to the left.  If empty, no change.  If
8509 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
8510 +   is an existing item. */
8511 +reiser4_internal int
8512 +coord_prev_item(coord_t * coord)
8513 +{
8514 +       unsigned items = coord_num_items(coord);
8515 +
8516 +       if (coord_adjust_items(coord, items, 0) == 1) {
8517 +               return 1;
8518 +       }
8519 +
8520 +       switch (coord->between) {
8521 +       case AT_UNIT:
8522 +       case AFTER_UNIT:
8523 +       case BEFORE_UNIT:
8524 +       case BEFORE_ITEM:
8525 +
8526 +               if (coord->item_pos == 0) {
8527 +                       coord->between = BEFORE_ITEM;
8528 +                       coord->unit_pos = 0;
8529 +                       return 1;
8530 +               }
8531 +
8532 +               coord_dec_item_pos(coord);
8533 +               coord->unit_pos = 0;
8534 +               coord->between = AT_UNIT;
8535 +               return 0;
8536 +
8537 +       case AFTER_ITEM:
8538 +               coord->between = AT_UNIT;
8539 +               coord->unit_pos = 0;
8540 +               return 0;
8541 +
8542 +       case INVALID_COORD:
8543 +       case EMPTY_NODE:
8544 +               break;
8545 +       }
8546 +
8547 +       impossible("jmacd-9905", "unreachable");
8548 +       return 0;
8549 +}
8550 +
8551 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
8552 +reiser4_internal void
8553 +coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir)
8554 +{
8555 +       assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
8556 +       if (dir == LEFT_SIDE) {
8557 +               coord_init_first_unit(coord, node);
8558 +       } else {
8559 +               coord_init_last_unit(coord, node);
8560 +       }
8561 +}
8562 +
8563 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
8564 +   argument. */
8565 +/* Audited by: green(2002.06.15) */
8566 +reiser4_internal int
8567 +coord_is_after_sideof_unit(coord_t * coord, sideof dir)
8568 +{
8569 +       assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
8570 +       if (dir == LEFT_SIDE) {
8571 +               return coord_is_before_leftmost(coord);
8572 +       } else {
8573 +               return coord_is_after_rightmost(coord);
8574 +       }
8575 +}
8576 +
8577 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
8578 +/* Audited by: green(2002.06.15) */
8579 +reiser4_internal int
8580 +coord_sideof_unit(coord_t * coord, sideof dir)
8581 +{
8582 +       assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
8583 +       if (dir == LEFT_SIDE) {
8584 +               return coord_prev_unit(coord);
8585 +       } else {
8586 +               return coord_next_unit(coord);
8587 +       }
8588 +}
8589 +
8590 +#if REISER4_DEBUG
8591 +#define DEBUG_COORD_FIELDS (sizeof(c1->plug_v) + sizeof(c1->body_v))
8592 +#else
8593 +#define DEBUG_COORD_FIELDS (0)
8594 +#endif
8595 +
8596 +reiser4_internal int
8597 +coords_equal(const coord_t * c1, const coord_t * c2)
8598 +{
8599 +       assert("nikita-2840", c1 != NULL);
8600 +       assert("nikita-2841", c2 != NULL);
8601 +
8602 +       /* assertion to track changes in coord_t */
8603 +       cassert(sizeof(*c1) == sizeof(c1->node) +
8604 +               sizeof(c1->item_pos) +
8605 +               sizeof(c1->unit_pos) +
8606 +               sizeof(c1->iplugid) +
8607 +               sizeof(c1->between) +
8608 +               sizeof(c1->pad) +
8609 +               sizeof(c1->offset) +
8610 +               DEBUG_COORD_FIELDS);
8611 +       return
8612 +               c1->node == c2->node &&
8613 +               c1->item_pos == c2->item_pos &&
8614 +               c1->unit_pos == c2->unit_pos &&
8615 +               c1->between == c2->between;
8616 +}
8617 +
8618 +/* Returns true if two coordinates are consider equal.  Coordinates that are between units
8619 +   or items are considered equal. */
8620 +/* Audited by: green(2002.06.15) */
8621 +reiser4_internal int
8622 +coord_eq(const coord_t * c1, const coord_t * c2)
8623 +{
8624 +       assert("nikita-1807", c1 != NULL);
8625 +       assert("nikita-1808", c2 != NULL);
8626 +
8627 +       if (coords_equal(c1, c2)) {
8628 +               return 1;
8629 +       }
8630 +       if (c1->node != c2->node) {
8631 +               return 0;
8632 +       }
8633 +
8634 +       switch (c1->between) {
8635 +       case INVALID_COORD:
8636 +       case EMPTY_NODE:
8637 +       case AT_UNIT:
8638 +               return 0;
8639 +
8640 +       case BEFORE_UNIT:
8641 +               /* c2 must be after the previous unit. */
8642 +               return (c1->item_pos == c2->item_pos && c2->between == AFTER_UNIT && c2->unit_pos == c1->unit_pos - 1);
8643 +
8644 +       case AFTER_UNIT:
8645 +               /* c2 must be before the next unit. */
8646 +               return (c1->item_pos == c2->item_pos && c2->between == BEFORE_UNIT && c2->unit_pos == c1->unit_pos + 1);
8647 +
8648 +       case BEFORE_ITEM:
8649 +               /* c2 must be after the previous item. */
8650 +               return (c1->item_pos == c2->item_pos - 1 && c2->between == AFTER_ITEM);
8651 +
8652 +       case AFTER_ITEM:
8653 +               /* c2 must be before the next item. */
8654 +               return (c1->item_pos == c2->item_pos + 1 && c2->between == BEFORE_ITEM);
8655 +       }
8656 +
8657 +       impossible("jmacd-9906", "unreachable");
8658 +       return 0;
8659 +}
8660 +
8661 +/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if coord_is_after_leftmost
8662 +   return NCOORD_ON_THE_LEFT, otherwise return NCOORD_INSIDE. */
8663 +/* Audited by: green(2002.06.15) */
8664 +reiser4_internal coord_wrt_node coord_wrt(const coord_t * coord)
8665 +{
8666 +       if (coord_is_before_leftmost(coord)) {
8667 +               return COORD_ON_THE_LEFT;
8668 +       }
8669 +
8670 +       if (coord_is_after_rightmost(coord)) {
8671 +               return COORD_ON_THE_RIGHT;
8672 +       }
8673 +
8674 +       return COORD_INSIDE;
8675 +}
8676 +
8677 +/* Returns true if the coordinate is positioned after the last item or after the last unit
8678 +   of the last item or it is an empty node. */
8679 +/* Audited by: green(2002.06.15) */
8680 +reiser4_internal int
8681 +coord_is_after_rightmost(const coord_t * coord)
8682 +{
8683 +       assert("jmacd-7313", coord_check(coord));
8684 +
8685 +       switch (coord->between) {
8686 +       case INVALID_COORD:
8687 +       case AT_UNIT:
8688 +       case BEFORE_UNIT:
8689 +       case BEFORE_ITEM:
8690 +               return 0;
8691 +
8692 +       case EMPTY_NODE:
8693 +               return 1;
8694 +
8695 +       case AFTER_ITEM:
8696 +               return (coord->item_pos == node_num_items(coord->node) - 1);
8697 +
8698 +       case AFTER_UNIT:
8699 +               return ((coord->item_pos == node_num_items(coord->node) - 1) &&
8700 +                       coord->unit_pos == coord_last_unit_pos(coord));
8701 +       }
8702 +
8703 +       impossible("jmacd-9908", "unreachable");
8704 +       return 0;
8705 +}
8706 +
8707 +/* Returns true if the coordinate is positioned before the first item or it is an empty
8708 +   node. */
8709 +reiser4_internal int
8710 +coord_is_before_leftmost(const coord_t * coord)
8711 +{
8712 +       /* FIXME-VS: coord_check requires node to be loaded whereas it is not
8713 +          necessary to check if coord is set before leftmost
8714 +          assert ("jmacd-7313", coord_check (coord)); */
8715 +       switch (coord->between) {
8716 +       case INVALID_COORD:
8717 +       case AT_UNIT:
8718 +       case AFTER_ITEM:
8719 +       case AFTER_UNIT:
8720 +               return 0;
8721 +
8722 +       case EMPTY_NODE:
8723 +               return 1;
8724 +
8725 +       case BEFORE_ITEM:
8726 +       case BEFORE_UNIT:
8727 +               return (coord->item_pos == 0) && (coord->unit_pos == 0);
8728 +       }
8729 +
8730 +       impossible("jmacd-9908", "unreachable");
8731 +       return 0;
8732 +}
8733 +
8734 +/* Returns true if the coordinate is positioned after a item, before a item, after the
8735 +   last unit of an item, before the first unit of an item, or at an empty node. */
8736 +/* Audited by: green(2002.06.15) */
8737 +reiser4_internal int
8738 +coord_is_between_items(const coord_t * coord)
8739 +{
8740 +       assert("jmacd-7313", coord_check(coord));
8741 +
8742 +       switch (coord->between) {
8743 +       case INVALID_COORD:
8744 +       case AT_UNIT:
8745 +               return 0;
8746 +
8747 +       case AFTER_ITEM:
8748 +       case BEFORE_ITEM:
8749 +       case EMPTY_NODE:
8750 +               return 1;
8751 +
8752 +       case BEFORE_UNIT:
8753 +               return coord->unit_pos == 0;
8754 +
8755 +       case AFTER_UNIT:
8756 +               return coord->unit_pos == coord_last_unit_pos(coord);
8757 +       }
8758 +
8759 +       impossible("jmacd-9908", "unreachable");
8760 +       return 0;
8761 +}
8762 +
8763 +/* Returns true if the coordinates are positioned at adjacent units, regardless of
8764 +   before-after or item boundaries. */
8765 +reiser4_internal int
8766 +coord_are_neighbors(coord_t * c1, coord_t * c2)
8767 +{
8768 +       coord_t *left;
8769 +       coord_t *right;
8770 +
8771 +       assert("nikita-1241", c1 != NULL);
8772 +       assert("nikita-1242", c2 != NULL);
8773 +       assert("nikita-1243", c1->node == c2->node);
8774 +       assert("nikita-1244", coord_is_existing_unit(c1));
8775 +       assert("nikita-1245", coord_is_existing_unit(c2));
8776 +
8777 +       left = right = 0;
8778 +       switch (coord_compare(c1, c2)) {
8779 +       case COORD_CMP_ON_LEFT:
8780 +               left = c1;
8781 +               right = c2;
8782 +               break;
8783 +       case COORD_CMP_ON_RIGHT:
8784 +               left = c2;
8785 +               right = c1;
8786 +               break;
8787 +       case COORD_CMP_SAME:
8788 +               return 0;
8789 +       default:
8790 +               wrong_return_value("nikita-1246", "compare_coords()");
8791 +       }
8792 +       assert("vs-731", left && right);
8793 +       if (left->item_pos == right->item_pos) {
8794 +               return left->unit_pos + 1 == right->unit_pos;
8795 +       } else if (left->item_pos + 1 == right->item_pos) {
8796 +               return (left->unit_pos == coord_last_unit_pos(left)) && (right->unit_pos == 0);
8797 +       } else {
8798 +               return 0;
8799 +       }
8800 +}
8801 +
8802 +/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
8803 +   COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2.  */
8804 +/* Audited by: green(2002.06.15) */
8805 +reiser4_internal coord_cmp coord_compare(coord_t * c1, coord_t * c2)
8806 +{
8807 +       assert("vs-209", c1->node == c2->node);
8808 +       assert("vs-194", coord_is_existing_unit(c1)
8809 +              && coord_is_existing_unit(c2));
8810 +
8811 +       if (c1->item_pos > c2->item_pos)
8812 +               return COORD_CMP_ON_RIGHT;
8813 +       if (c1->item_pos < c2->item_pos)
8814 +               return COORD_CMP_ON_LEFT;
8815 +       if (c1->unit_pos > c2->unit_pos)
8816 +               return COORD_CMP_ON_RIGHT;
8817 +       if (c1->unit_pos < c2->unit_pos)
8818 +               return COORD_CMP_ON_LEFT;
8819 +       return COORD_CMP_SAME;
8820 +}
8821 +
8822 +/* If the coordinate is between items, shifts it to the right.  Returns 0 on success and
8823 +   non-zero if there is no position to the right. */
8824 +reiser4_internal int
8825 +coord_set_to_right(coord_t * coord)
8826 +{
8827 +       unsigned items = coord_num_items(coord);
8828 +
8829 +       if (coord_adjust_items(coord, items, 1) == 1) {
8830 +               return 1;
8831 +       }
8832 +
8833 +       switch (coord->between) {
8834 +       case AT_UNIT:
8835 +               return 0;
8836 +
8837 +       case BEFORE_ITEM:
8838 +       case BEFORE_UNIT:
8839 +               coord->between = AT_UNIT;
8840 +               return 0;
8841 +
8842 +       case AFTER_UNIT:
8843 +               if (coord->unit_pos < coord_last_unit_pos(coord)) {
8844 +                       coord->unit_pos += 1;
8845 +                       coord->between = AT_UNIT;
8846 +                       return 0;
8847 +               } else {
8848 +
8849 +                       coord->unit_pos = 0;
8850 +
8851 +                       if (coord->item_pos == items - 1) {
8852 +                               coord->between = AFTER_ITEM;
8853 +                               return 1;
8854 +                       }
8855 +
8856 +                       coord_inc_item_pos(coord);
8857 +                       coord->between = AT_UNIT;
8858 +                       return 0;
8859 +               }
8860 +
8861 +       case AFTER_ITEM:
8862 +               if (coord->item_pos == items - 1) {
8863 +                       return 1;
8864 +               }
8865 +
8866 +               coord_inc_item_pos(coord);
8867 +               coord->unit_pos = 0;
8868 +               coord->between = AT_UNIT;
8869 +               return 0;
8870 +
8871 +       case EMPTY_NODE:
8872 +               return 1;
8873 +
8874 +       case INVALID_COORD:
8875 +               break;
8876 +       }
8877 +
8878 +       impossible("jmacd-9920", "unreachable");
8879 +       return 0;
8880 +}
8881 +
8882 +/* If the coordinate is between items, shifts it to the left.  Returns 0 on success and
8883 +   non-zero if there is no position to the left. */
8884 +reiser4_internal int
8885 +coord_set_to_left(coord_t * coord)
8886 +{
8887 +       unsigned items = coord_num_items(coord);
8888 +
8889 +       if (coord_adjust_items(coord, items, 0) == 1) {
8890 +               return 1;
8891 +       }
8892 +
8893 +       switch (coord->between) {
8894 +       case AT_UNIT:
8895 +               return 0;
8896 +
8897 +       case AFTER_UNIT:
8898 +               coord->between = AT_UNIT;
8899 +               return 0;
8900 +
8901 +       case AFTER_ITEM:
8902 +               coord->between = AT_UNIT;
8903 +               coord->unit_pos = coord_last_unit_pos(coord);
8904 +               return 0;
8905 +
8906 +       case BEFORE_UNIT:
8907 +               if (coord->unit_pos > 0) {
8908 +                       coord->unit_pos -= 1;
8909 +                       coord->between = AT_UNIT;
8910 +                       return 0;
8911 +               } else {
8912 +
8913 +                       if (coord->item_pos == 0) {
8914 +                               coord->between = BEFORE_ITEM;
8915 +                               return 1;
8916 +                       }
8917 +
8918 +                       coord->unit_pos = coord_last_unit_pos(coord);
8919 +                       coord_dec_item_pos(coord);
8920 +                       coord->between = AT_UNIT;
8921 +                       return 0;
8922 +               }
8923 +
8924 +       case BEFORE_ITEM:
8925 +               if (coord->item_pos == 0) {
8926 +                       return 1;
8927 +               }
8928 +
8929 +               coord_dec_item_pos(coord);
8930 +               coord->unit_pos = coord_last_unit_pos(coord);
8931 +               coord->between = AT_UNIT;
8932 +               return 0;
8933 +
8934 +       case EMPTY_NODE:
8935 +               return 1;
8936 +
8937 +       case INVALID_COORD:
8938 +               break;
8939 +       }
8940 +
8941 +       impossible("jmacd-9920", "unreachable");
8942 +       return 0;
8943 +}
8944 +
8945 +reiser4_internal const char *
8946 +coord_tween_tostring(between_enum n)
8947 +{
8948 +       switch (n) {
8949 +       case BEFORE_UNIT:
8950 +               return "before unit";
8951 +       case BEFORE_ITEM:
8952 +               return "before item";
8953 +       case AT_UNIT:
8954 +               return "at unit";
8955 +       case AFTER_UNIT:
8956 +               return "after unit";
8957 +       case AFTER_ITEM:
8958 +               return "after item";
8959 +       case EMPTY_NODE:
8960 +               return "empty node";
8961 +       case INVALID_COORD:
8962 +               return "invalid";
8963 +       default:{
8964 +                       static char buf[30];
8965 +
8966 +                       sprintf(buf, "unknown: %i", n);
8967 +                       return buf;
8968 +               }
8969 +       }
8970 +}
8971 +
8972 +reiser4_internal void
8973 +print_coord(const char *mes, const coord_t * coord, int node)
8974 +{
8975 +       if (coord == NULL) {
8976 +               printk("%s: null\n", mes);
8977 +               return;
8978 +       }
8979 +       printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
8980 +              mes, coord->item_pos, coord->unit_pos, coord_tween_tostring(coord->between), coord->iplugid);
8981 +       if (node)
8982 +               print_znode("\tnode", coord->node);
8983 +}
8984 +
8985 +reiser4_internal int
8986 +item_utmost_child_real_block(const coord_t * coord, sideof side, reiser4_block_nr * blk)
8987 +{
8988 +       return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord, side, blk);
8989 +}
8990 +
8991 +reiser4_internal int
8992 +item_utmost_child(const coord_t * coord, sideof side, jnode ** child)
8993 +{
8994 +       return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
8995 +}
8996 +
8997 +/*
8998 +   Local variables:
8999 +   c-indentation-style: "K&R"
9000 +   mode-name: "LC"
9001 +   c-basic-offset: 8
9002 +   tab-width: 8
9003 +   fill-column: 120
9004 +   scroll-step: 1
9005 +   End:
9006 +*/
9007 diff -rupN linux-2.6.8-rc3/fs/reiser4/coord.h linux-2.6.8-rc3-a/fs/reiser4/coord.h
9008 --- linux-2.6.8-rc3/fs/reiser4/coord.h  1970-01-01 03:00:00.000000000 +0300
9009 +++ linux-2.6.8-rc3-a/fs/reiser4/coord.h        2004-08-05 21:20:53.304613824 +0400
9010 @@ -0,0 +1,341 @@
9011 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
9012 +
9013 +/* Coords */
9014 +
9015 +#if !defined( __REISER4_COORD_H__ )
9016 +#define __REISER4_COORD_H__
9017 +
9018 +#include "forward.h"
9019 +#include "debug.h"
9020 +#include "dformat.h"
9021 +
/* Insertions happen between coords in the tree, so a coord needs a way to say
   on which side of a unit or item it lies.  Numeric values are spelled out
   because code depends on them (see BEFORE_UNIT). */
typedef enum {
	BEFORE_UNIT = 0,	/* Note: we/init_coord depends on this value being zero. */
	AT_UNIT = 1,
	AFTER_UNIT = 2,
	BEFORE_ITEM = 3,
	AFTER_ITEM = 4,
	INVALID_COORD = 5,
	EMPTY_NODE = 6
} between_enum;
9033 +
/* Where a coord lies with respect to its node: beyond the left edge, inside,
   or beyond the right edge. */
typedef enum {
	COORD_ON_THE_LEFT = -1,
	COORD_INSIDE = 0,
	COORD_ON_THE_RIGHT = +1
} coord_wrt_node;
9040 +
/* Result of comparing the positions of two coords within one node. */
typedef enum {
	COORD_CMP_ON_LEFT = -1,
	COORD_CMP_SAME = 0,
	COORD_CMP_ON_RIGHT = +1
} coord_cmp;
9044 +
9045 +struct coord {
9046 +       /* node in a tree */
9047 +       /*  0 */ znode *node;
9048 +
9049 +       /* position of item within node */
9050 +       /*  4 */ pos_in_node_t item_pos;
9051 +       /* position of unit within item */
9052 +       /*  6 */ pos_in_node_t unit_pos;
9053 +       /* optimization: plugin of item is stored in coord_t. Until this was
9054 +          implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid
9055 +          is invalidated (set to 0xff) on each modification of ->item_pos,
9056 +          and all such modifications are funneled through coord_*_item_pos()
9057 +          functions below.
9058 +       */
9059 +       /*  8 */ char iplugid;
9060 +       /* position of coord w.r.t. to neighboring items and/or units.
9061 +          Values are taken from &between_enum above.
9062 +       */
9063 +       /*  9 */ char between;
9064 +       /* padding. It will be added by the compiler anyway to conform to the
9065 +        * C language alignment requirements. We keep it here to be on the
9066 +        * safe side and to have a clear picture of the memory layout of this
9067 +        * structure. */
9068 +       /* 10 */ __u16 pad;
9069 +       /* 12 */ int offset;
9070 +#if REISER4_DEBUG
9071 +       unsigned long plug_v;
9072 +       unsigned long body_v;
9073 +#endif
9074 +};
9075 +
9076 +#define INVALID_PLUGID  ((char)0xff)   /* value of ->iplugid when no item plugin id is stored */
9077 +#define INVALID_OFFSET (-1)            /* value of ->offset after coord_clear_iplug() */
9078 +
9079 +/* reset ->iplugid and ->offset of @coord to their INVALID_* values */
9080 +static inline void coord_clear_iplug(coord_t * coord)
9081 +{
9082 +       assert("nikita-2835", coord != NULL);
9083 +       coord->offset  = INVALID_OFFSET;
9084 +       coord->iplugid = INVALID_PLUGID;
9085 +}
9086 +
9087 +/* non-zero iff ->iplugid of @coord holds something other than INVALID_PLUGID */
9088 +static inline int coord_is_iplug_set(const coord_t * coord)
9089 +{
9090 +       assert("nikita-2836", coord != NULL);
9091 +       return INVALID_PLUGID != coord->iplugid;
9092 +}
9093 +
9094 +/* set ->item_pos of @coord to @pos and invalidate the stored item plugin id */
9095 +static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos)
9096 +{
9097 +       assert("nikita-2478", coord != NULL);
9098 +       coord_clear_iplug(coord);
9099 +       coord->item_pos = pos;
9100 +}
9101 +
9102 +/* decrement ->item_pos of @coord by one and invalidate the stored item plugin id */
9103 +static inline void coord_dec_item_pos(coord_t * coord)
9104 +{
9105 +       assert("nikita-2480", coord != NULL);
9106 +       coord->item_pos -= 1;
9107 +       coord_clear_iplug(coord);
9108 +}
9109 +
9110 +/* increment ->item_pos of @coord by one and invalidate the stored item plugin id */
9111 +static inline void coord_inc_item_pos(coord_t * coord)
9112 +{
9113 +       assert("nikita-2481", coord != NULL);
9114 +       coord->item_pos += 1;
9115 +       coord_clear_iplug(coord);
9116 +}
9117 +
9118 +/* add (possibly negative) @delta to ->item_pos of @coord and invalidate the stored item plugin id */
9119 +static inline void coord_add_item_pos(coord_t * coord, int delta)
9120 +{
9121 +       assert("nikita-2482", coord != NULL);
9122 +       coord->item_pos = coord->item_pos + delta;
9123 +       coord_clear_iplug(coord);
9124 +}
9125 +
9126 +/* set ->item_pos of @coord to the all-ones unsigned short and invalidate the stored item plugin id */
9127 +static inline void coord_invalid_item_pos(coord_t * coord)
9128 +{
9129 +       assert("nikita-2832", coord != NULL);
9130 +       coord->item_pos = (unsigned short) ~0U;
9131 +       coord_clear_iplug(coord);
9132 +}
9133 +
9134 +/* Return the side opposite to @side: RIGHT_SIDE for LEFT_SIDE, and LEFT_SIDE
9135 +   for any other value. */
9136 +static inline sideof sideof_reverse(sideof side)
9137 +{
9138 +       return side != LEFT_SIDE ? LEFT_SIDE : RIGHT_SIDE;
9139 +}
9140 +
9141 +/* NOTE: There is a somewhat odd mixture of the following opposed terms:
9142 +
9143 +   "first" and "last"
9144 +   "next" and "prev"
9145 +   "before" and "after"
9146 +   "leftmost" and "rightmost"
9147 +
9148 +   But I think the chosen names are decent the way they are.
9149 +*/
9150 +
9151 +/* COORD INITIALIZERS */
9152 +
9153 +/* Initialize an invalid coordinate. */
9154 +extern void coord_init_invalid(coord_t * coord, const znode * node);
9155 +
9156 +extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node);
9157 +
9158 +/* Initialize a coordinate to point at the first unit of the first item.  If the node is
9159 +   empty, it is positioned at the EMPTY_NODE. */
9160 +extern void coord_init_first_unit(coord_t * coord, const znode * node);
9161 +
9162 +/* Initialize a coordinate to point at the last unit of the last item.  If the node is
9163 +   empty, it is positioned at the EMPTY_NODE. */
9164 +extern void coord_init_last_unit(coord_t * coord, const znode * node);
9165 +
9166 +/* Initialize a coordinate to before the first item.  If the node is empty, it is
9167 +   positioned at the EMPTY_NODE. */
9168 +extern void coord_init_before_first_item(coord_t * coord, const znode * node);
9169 +
9170 +/* Initialize a coordinate to after the last item.  If the node is empty, it is positioned
9171 +   at the EMPTY_NODE. */
9172 +extern void coord_init_after_last_item(coord_t * coord, const znode * node);
9173 +
9174 +/* Initialize a coordinate to after last unit in the item. Coord must be set
9175 +   already to existing item */
9176 +void coord_init_after_item_end(coord_t * coord);
9177 +
9178 +/* Initialize a coordinate to before the item. Coord must be set already to existing item */
9179 +void coord_init_before_item(coord_t *);
9180 +/* Initialize a coordinate to after the item. Coord must be set already to existing item */
9181 +void coord_init_after_item(coord_t *);
9182 +
9183 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
9184 +extern void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir);
9185 +
9186 +/* Initialize a coordinate with zeros. Used in places where init_coord was used and
9187 +   it was not clear how the coordinate should actually be initialized.
9188 +   FIXME-VS: added by vs (2002, june, 8) */
9189 +extern void coord_init_zero(coord_t * coord);
9190 +
9191 +/* COORD METHODS */
9192 +
9193 +/* after shifting of node content, coord previously set properly may become
9194 +   invalid, try to "normalize" it. */
9195 +void coord_normalize(coord_t * coord);
9196 +
9197 +/* Copy a coordinate. */
9198 +extern void coord_dup(coord_t * coord, const coord_t * old_coord);
9199 +
9200 +/* Copy a coordinate without check. */
9201 +void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
9202 +
9203 +unsigned coord_num_units(const coord_t * coord);
9204 +
9205 +/* Return the position of the last valid unit in the present item, i.e.
9206 +   coord_num_units() - 1 (which wraps around if coord_num_units() returns 0). */
9207 +static inline unsigned coord_last_unit_pos(const coord_t * coord)
9208 +{
9209 +       unsigned nr_units = coord_num_units(coord);
9210 +       return nr_units - 1;
9211 +}
9212 +
9213 +#if REISER4_DEBUG
9214 +/* For assertions only, checks for a valid coordinate. */
9215 +extern int coord_check(const coord_t * coord);
9216 +
9217 +extern unsigned long znode_times_locked(const znode *z);
9218 +
9219 +static inline void coord_update_v(coord_t * coord)
9220 +{      /* stamp both debugging version fields with znode_times_locked() of ->node */
9221 +       coord->body_v = znode_times_locked(coord->node);
9222 +       coord->plug_v = coord->body_v;
9223 +}
9224 +#endif
9225 +
9226 +extern int coords_equal(const coord_t * c1, const coord_t * c2);
9227 +
9228 +/* Returns true if two coordinates are considered equal.  Coordinates that are between units
9229 +   or items are considered equal. */
9230 +extern int coord_eq(const coord_t * c1, const coord_t * c2);
9231 +
9232 +extern void print_coord(const char *mes, const coord_t * coord, int print_node);
9233 +
9234 +/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
9235 +   return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
9236 +extern coord_wrt_node coord_wrt(const coord_t * coord);
9237 +
9238 +/* Returns true if the coordinates are positioned at adjacent units, regardless of
9239 +   before-after or item boundaries. */
9240 +extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
9241 +
9242 +/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
9243 +   COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2.  */
9244 +extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
9245 +
9246 +/* COORD PREDICATES */
9247 +
9248 +/* Returns true if the coord was initialized by coord_init_invalid(). */
9249 +extern int coord_is_invalid(const coord_t * coord);
9250 +
9251 +/* Returns true if the coordinate is positioned at an existing item, not before or after
9252 +   an item.  It may be placed at, before, or after any unit within the item, whether
9253 +   existing or not.  If this is true you can call methods of the item plugin.  */
9254 +extern int coord_is_existing_item(const coord_t * coord);
9255 +
9256 +/* Returns true if the coordinate is positioned after an item, before an item, after the
9257 +   last unit of an item, before the first unit of an item, or at an empty node. */
9258 +extern int coord_is_between_items(const coord_t * coord);
9259 +
9260 +/* Returns true if the coordinate is positioned at an existing unit, not before or after a
9261 +   unit. */
9262 +extern int coord_is_existing_unit(const coord_t * coord);
9263 +
9264 +/* Returns true if the coordinate is positioned at an empty node. */
9265 +extern int coord_is_empty(const coord_t * coord);
9266 +
9267 +/* Returns true if the coordinate is positioned at the first unit of the first item.  Not
9268 +   true for empty nodes nor coordinates positioned before the first item. */
9269 +extern int coord_is_leftmost_unit(const coord_t * coord);
9270 +
9271 +/* Returns true if the coordinate is positioned after the last item or after the last unit
9272 +   of the last item or it is an empty node. */
9273 +extern int coord_is_after_rightmost(const coord_t * coord);
9274 +
9275 +/* Returns true if the coordinate is positioned before the first item or it is an empty
9276 +   node. */
9277 +extern int coord_is_before_leftmost(const coord_t * coord);
9278 +
9279 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
9280 +   argument. */
9281 +extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
9282 +
9283 +/* COORD MODIFIERS */
9284 +
9285 +/* Advances the coordinate by one unit to the right.  If empty, no change.  If
9286 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is
9287 +   an existing unit. */
9288 +extern int coord_next_unit(coord_t * coord);
9289 +
9290 +/* Advances the coordinate by one item to the right.  If empty, no change.  If
9291 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is
9292 +   an existing item. */
9293 +extern int coord_next_item(coord_t * coord);
9294 +
9295 +/* Advances the coordinate by one unit to the left.  If empty, no change.  If
9296 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
9297 +   is an existing unit. */
9298 +extern int coord_prev_unit(coord_t * coord);
9299 +
9300 +/* Advances the coordinate by one item to the left.  If empty, no change.  If
9301 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
9302 +   is an existing item. */
9303 +extern int coord_prev_item(coord_t * coord);
9304 +
9305 +/* If the coordinate is between items, shifts it to the right.  Returns 0 on success and
9306 +   non-zero if there is no position to the right. */
9307 +extern int coord_set_to_right(coord_t * coord);
9308 +
9309 +/* If the coordinate is between items, shifts it to the left.  Returns 0 on success and
9310 +   non-zero if there is no position to the left. */
9311 +extern int coord_set_to_left(coord_t * coord);
9312 +
9313 +/* If the coordinate is at an existing unit, set to after that unit.  Returns 0 on success
9314 +   and non-zero if the unit did not exist. */
9315 +extern int coord_set_after_unit(coord_t * coord);
9316 +
9317 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
9318 +extern int coord_sideof_unit(coord_t * coord, sideof dir);
9319 +
9320 +/* iterate over all units in @node */
9321 +#define for_all_units( coord, node )                                   \
9322 +       for( coord_init_before_first_item( ( coord ), ( node ) ) ;      \
9323 +            coord_next_unit( coord ) == 0 ; )
9324 +
9325 +/* iterate over all items in @node */
9326 +#define for_all_items( coord, node )                                   \
9327 +       for( coord_init_before_first_item( ( coord ), ( node ) ) ;      \
9328 +            coord_next_item( coord ) == 0 ; )
9329 +
9330 +#if REISER4_DEBUG_OUTPUT
9331 +extern const char *coord_tween_tostring(between_enum n);
9332 +#endif
9333 +
9334 +/* COORD/ITEM METHODS */
9335 +
9336 +extern int item_utmost_child_real_block(const coord_t * coord, sideof side, reiser4_block_nr * blk);
9337 +extern int item_utmost_child(const coord_t * coord, sideof side, jnode ** child);
9338 +
9339 +/* __REISER4_COORD_H__ */
9340 +#endif
9341 +
9342 +/* Make Linus happy.
9343 +   Local variables:
9344 +   c-indentation-style: "K&R"
9345 +   mode-name: "LC"
9346 +   c-basic-offset: 8
9347 +   tab-width: 8
9348 +   fill-column: 120
9349 +   scroll-step: 1
9350 +   End:
9351 +*/
9352 diff -rupN linux-2.6.8-rc3/fs/reiser4/crypt.c linux-2.6.8-rc3-a/fs/reiser4/crypt.c
9353 --- linux-2.6.8-rc3/fs/reiser4/crypt.c  1970-01-01 03:00:00.000000000 +0300
9354 +++ linux-2.6.8-rc3-a/fs/reiser4/crypt.c        2004-08-05 21:20:52.762728122 +0400
9355 @@ -0,0 +1,92 @@
9356 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
9357 +/* Crypto-plugins for reiser4 cryptcompress objects */
9358 +
9359 +#include "debug.h"
9360 +#include "plugin/plugin.h"
9361 +#include "plugin/cryptcompress.h"
9362 +#include <linux/types.h>
9363 +#include <linux/random.h>
9364 +
9365 +#define MAX_CRYPTO_BLOCKSIZE 128
9366 +#define NONE_EXPKEY_WORDS 8
9367 +#define NONE_BLOCKSIZE 8
9368 +
9369 +/*
9370 +  Default align() method of the crypto-plugin (look for description of this method
9371 +  in plugin/plugin.h)
9372 +
9373 +1) creates the aligning armored format of the input flow before encryption.
9374 +   "armored" means that padding is filled by private data (for example,
9375 +   pseudo-random sequence of bytes is not private data).
9376 +2) returns length of appended padding
9377 +
9378 +   [ flow | aligning_padding ]
9379 +            ^
9380 +            |
9381 +         @pad
9382 +*/
9383 +UNUSED_ARG static int
9384 +align_cluster_common(__u8 *pad /* pointer to the first byte of aligning format */,
9385 +                    int flow_size /* size of non-aligned flow */,
9386 +                    int blocksize /* crypto-block size */)
9387 +{
9388 +       int pad_size;   /* number of random padding bytes written at @pad; also the return value */
9389 +
9390 +       assert("edward-01", pad != NULL);
9391 +       assert("edward-02", flow_size != 0);
9392 +       assert("edward-03", blocksize > 0 && blocksize <= MAX_CRYPTO_BLOCKSIZE);
9393 +       /* pad up to the next multiple of @blocksize; an already aligned flow gets a whole block */
9394 +       pad_size = blocksize - (flow_size % blocksize);
9395 +       get_random_bytes (pad, pad_size);
9396 +       return pad_size;
9397 +}
9398 +
9399 +/* common scale method (see plugin/plugin.h for the description of this method)
9400 +   for all symmetric algorithms which don't scale anything: @src_off is returned unchanged
9401 +*/
9402 +static loff_t scale_common(struct inode * inode UNUSED_ARG,
9403 +                          size_t blocksize UNUSED_ARG /* crypto block size, which is returned
9404 +                                                         by blocksize method of crypto plugin */,
9405 +                          loff_t src_off /* offset to scale */)
9406 +{
9407 +       return src_off; /* identity: @inode and @blocksize are not consulted */
9408 +}
9409 +
9410 +REGISTER_NONE_ALG(crypt, CRYPTO)
9411 +
9412 +/* EDWARD-FIXME-HANS: why is this not in the plugin directory? */
9413 +
9414 +/* crypto plugins, indexed by crypto plugin id */
9415 +crypto_plugin crypto_plugins[LAST_CRYPTO_ID] = {
9416 +       [NONE_CRYPTO_ID] = {
9417 +               .h = {
9418 +                       .type_id = REISER4_CRYPTO_PLUGIN_TYPE,
9419 +                       .id = NONE_CRYPTO_ID,
9420 +                       .pops = NULL,
9421 +                       /* Specify this crypto plugin if you want your
9422 +                          files not to be crypto transformed */
9423 +                       .label = "none",
9424 +                       .desc = "absence of crypto transform",
9425 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
9426 +               },
9427 +               .alloc = alloc_none_crypt,
9428 +               .free = free_none_crypt,
9429 +               .nr_keywords = NONE_EXPKEY_WORDS,
9430 +               .scale = scale_common,
9431 +               .align_cluster = NULL,  /* align_cluster_common() above is not installed here */
9432 +               .setkey = NULL,
9433 +               .encrypt = NULL,
9434 +               .decrypt = NULL
9435 +       }
9436 +};
9437 +
9438 +/* Make Linus happy.
9439 +   Local variables:
9440 +   c-indentation-style: "K&R"
9441 +   mode-name: "LC"
9442 +   c-basic-offset: 8
9443 +   tab-width: 8
9444 +   fill-column: 120
9445 +   scroll-step: 1
9446 +   End:
9447 +*/
9448 diff -rupN linux-2.6.8-rc3/fs/reiser4/debug.c linux-2.6.8-rc3-a/fs/reiser4/debug.c
9449 --- linux-2.6.8-rc3/fs/reiser4/debug.c  1970-01-01 03:00:00.000000000 +0300
9450 +++ linux-2.6.8-rc3-a/fs/reiser4/debug.c        2004-08-05 21:20:52.994679197 +0400
9451 @@ -0,0 +1,735 @@
9452 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9453 + * reiser4/README */
9454 +
9455 +/* Debugging facilities. */
9456 +
9457 +/*
9458 + * This file contains generic debugging functions used by reiser4. Roughly
9459 + * following:
9460 + *
9461 + *     panicking: reiser4_do_panic(), reiser4_print_prefix().
9462 + *
9463 + *     locking: schedulable(), lock_counters(), print_lock_counters(),
9464 + *     no_counters_are_held(), commit_check_locks()
9465 + *
9466 + *     {debug,trace,log}_flags: reiser4_are_all_debugged(),
9467 + *     reiser4_is_debugged(), get_current_trace_flags(),
9468 + *     get_current_log_flags().
9469 + *
9470 + *     kmalloc/kfree leak detection: reiser4_kmalloc(), reiser4_kfree(),
9471 + *     reiser4_kfree_in_sb().
9472 + *
9473 + *     error code monitoring (see comment before RETERR macro): return_err(),
9474 + *     report_err().
9475 + *
9476 + *     stack back-tracing: fill_backtrace()
9477 + *
9478 + *     miscellaneous: preempt_point(), call_on_each_assert(), debugtrap().
9479 + *
9480 + */
9481 +
9482 +#include "kattr.h"
9483 +#include "reiser4.h"
9484 +#include "context.h"
9485 +#include "super.h"
9486 +#include "txnmgr.h"
9487 +#include "znode.h"
9488 +
9489 +#include <linux/sysfs.h>
9490 +#include <linux/slab.h>
9491 +#include <linux/types.h>
9492 +#include <linux/fs.h>
9493 +#include <linux/spinlock.h>
9494 +#include <linux/kallsyms.h>
9495 +#include <linux/vmalloc.h>
9496 +#include <linux/ctype.h>
9497 +#include <linux/sysctl.h>
9498 +
9499 +extern void cond_resched(void);
9500 +
9501 +/*
9502 + * global buffer where message given to reiser4_panic is formatted.
9503 + */
9504 +static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
9505 +
9506 +/*
9507 + * lock protecting consistency of panic_buf under concurrent panics
9508 + */
9509 +static spinlock_t panic_guard = SPIN_LOCK_UNLOCKED;
9510 +
9511 +/* Your best friend. Call it on each occasion.  This is called by
9512 +    fs/reiser4/debug.h:reiser4_panic(). Prints the formatted message, dumps reiser4 context state (if any), then BUG()s. */
9513 +reiser4_internal void
9514 +reiser4_do_panic(const char *format /* format string */ , ... /* rest */)
9515 +{
9516 +       static int in_panic = 0;        /* becomes 1 on the first call and is never reset */
9517 +       va_list args;
9518 +
9519 +       /*
9520 +        * check for recursive panic: only the first call formats and prints the message. NOTE(review): in_panic is tested and set before panic_guard is taken.
9521 +        */
9522 +       if (in_panic == 0) {
9523 +               in_panic = 1;
9524 +
9525 +               spin_lock(&panic_guard);
9526 +               va_start(args, format);
9527 +               vsnprintf(panic_buf, sizeof(panic_buf), format, args);
9528 +               va_end(args);
9529 +               printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
9530 +               spin_unlock(&panic_guard);
9531 +
9532 +               /*
9533 +                * if kernel debugger is configured---drop in. Early dropping
9534 +                * into kgdb is not always convenient, because panic message
9535 +                * is not yet printed most of the times. But:
9536 +                *
9537 +                *     (1) message can be extracted from printk_buf[]
9538 +                *     (declared static inside of printk()), and
9539 +                *
9540 +                *     (2) sometimes serial/kgdb combo dies while printing
9541 +                *     long panic message, so it's more prudent to break into
9542 +                *     debugger earlier.
9543 +                *
9544 +                */
9545 +               DEBUGON(1);
9546 +
9547 +               if (get_current_context_check() != NULL) {
9548 +                       struct super_block *super;
9549 +                       reiser4_context *ctx;
9550 +
9551 +                       /*
9552 +                        * if we are within reiser4 context, print its contents:
9553 +                        */
9554 +
9555 +                       /* lock counters... */
9556 +                       print_lock_counters("pins held", lock_counters());
9557 +                       /* other active contexts... */
9558 +                       print_contexts();
9559 +                       ctx = get_current_context();
9560 +                       super = ctx->super;
9561 +                       if (get_super_private(super) != NULL &&
9562 +                           reiser4_is_debugged(super, REISER4_VERBOSE_PANIC))
9563 +                               /* znodes... */
9564 +                               print_znodes("znodes", current_tree);
9565 +#if REISER4_DEBUG_CONTEXTS
9566 +                       {
9567 +                               extern spinlock_t active_contexts_lock;
9568 +
9569 +                               /*
9570 +                                * remove context from the list of active
9571 +                                * contexts. This is a precaution: current is
9572 +                                * going to die, and leaving the context on
9573 +                                * the list would corrupt the list.
9574 +                                * NOTE(review): ctx->parent is what gets removed.
9575 +                                */
9576 +                               spin_lock(&active_contexts_lock);
9577 +                               context_list_remove(ctx->parent);
9578 +                               spin_unlock(&active_contexts_lock);
9579 +                       }
9580 +#endif
9581 +               }
9582 +       }
9583 +       BUG();  /* reached on the first call as well as on recursive calls */
9584 +       /* to make gcc happy about noreturn attribute */
9585 +       panic("%s", panic_buf);
9586 +}
9587 +
9588 +/* Print the common prefix of a reiser4 message: @level, task name and pid
9589 +   ("interrupt" and 0 when in_interrupt() or in_irq() is true), then @function,
9590 +   @file, @lineno and @mid. If @reperr is non-zero, report_err() is called. */
9591 +reiser4_internal void reiser4_print_prefix(const char *level, int reperr,
9592 +               const char *mid, const char *function, const char *file, int lineno)
9593 +{
9594 +       const char *task_name = "interrupt";
9595 +       int task_pid = 0;
9596 +
9597 +       /* current is only consulted outside of interrupt handling */
9598 +       if (!unlikely(in_interrupt() || in_irq())) {
9599 +               task_name = current->comm;
9600 +               task_pid = current->pid;
9601 +       }
9602 +       printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
9603 +              level, task_name, task_pid, function, file, lineno, mid);
9604 +       if (reperr != 0)
9605 +               report_err();
9606 +}
9607 +
9608 +/* Preemption point: this should be called periodically during long running
9609 +   operations (carry, allocate, and squeeze are best examples) */
9610 +reiser4_internal int
9611 +preempt_point(void)
9612 +{
9613 +       assert("nikita-3008", schedulable());   /* must not be called with reiser4 spin locks held */
9614 +       cond_resched();                         /* give the scheduler a chance to run something else */
9615 +       return signal_pending(current);         /* caller gets the signal_pending() result for current */
9616 +}
9617 +
9618 +#if REISER4_DEBUG
9619 +
9620 +/* check that no spinlocks are held */
9621 +int schedulable(void)
9622 +{
9623 +       /* lock counters live in the reiser4 context, so they are checked only inside one */
9624 +       if (get_current_context_check() != NULL && !LOCK_CNT_NIL(spin_locked)) {
9625 +               print_lock_counters("in atomic", lock_counters());
9626 +               return 0;
9627 +       }
9628 +       /* no reiser4 spin locks are accounted as held: defer to the generic check */
9629 +       might_sleep();
9630 +       return 1;
9631 +}
9632 +#endif
9633 +
9634 +#if REISER4_DEBUG_SPIN_LOCKS
9635 +/* Debugging aid: return struct where information about locks taken by current
9636 +   thread is accumulated. This can be used to formulate lock ordering
9637 +   constraints and various assertions.
9638 +
9639 +*/
9640 +lock_counters_info *
9641 +lock_counters(void)
9642 +{
9643 +       reiser4_context *ctx = get_current_context();   /* reiser4 context of the current thread */
9644 +       assert("jmacd-1123", ctx != NULL);              /* must be called from within a reiser4 context */
9645 +       return &ctx->locks;                             /* the counters are embedded in the context */
9646 +}
9647 +
9648 +#if REISER4_DEBUG_OUTPUT
9649 +/*
9650 + * printk() the lock counters accumulated in @info in human readable form; the output starts with @prefix.
9651 + */
9652 +void
9653 +print_lock_counters(const char *prefix, const lock_counters_info * info)
9654 +{
9655 +       printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
9656 +              "jload: %i, "
9657 +              "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
9658 +              "ktxnmgrd: %i, fq: %i, reiser4_sb: %i\n"
9659 +              "inode: %i, "
9660 +              "cbk_cache: %i (r:%i,w:%i), "
9661 +              "epoch: %i, eflush: %i, "
9662 +              "zlock: %i (r:%i, w:%i)\n"
9663 +              "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
9664 +              "d: %i, x: %i, t: %i\n", prefix,
9665 +              info->spin_locked_jnode,
9666 +              info->rw_locked_tree, info->read_locked_tree,
9667 +              info->write_locked_tree,
9668 +              /* the arguments follow the order of the conversions in the format above */
9669 +              info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
9670 +
9671 +              info->spin_locked_jload,
9672 +              info->spin_locked_txnh,
9673 +              info->spin_locked_atom, info->spin_locked_stack,
9674 +              info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
9675 +              info->spin_locked_fq, info->spin_locked_super,
9676 +              info->spin_locked_inode_object,
9677 +
9678 +              info->rw_locked_cbk_cache,
9679 +              info->read_locked_cbk_cache,
9680 +              info->write_locked_cbk_cache,
9681 +
9682 +              info->spin_locked_epoch,
9683 +              info->spin_locked_super_eflush,
9684 +
9685 +              info->rw_locked_zlock,
9686 +              info->read_locked_zlock,
9687 +              info->write_locked_zlock,
9688 +
9689 +              info->spin_locked,
9690 +              info->long_term_locked_znode,
9691 +              info->inode_sem_r, info->inode_sem_w,
9692 +              info->d_refs, info->x_refs, info->t_refs);
9693 +}
9694 +
9695 +/*
9696 + * return true, iff no locks are held.
9697 + */
9698 +int
9699 +no_counters_are_held(void)
9700 +{
9701 +       const lock_counters_info *held = lock_counters();
9702 +
9703 +       /* any non-zero counter listed below makes the result 0 */
9704 +       return
9705 +               !(held->rw_locked_zlock ||
9706 +                 held->read_locked_zlock ||
9707 +                 held->write_locked_zlock ||
9708 +                 held->spin_locked_jnode ||
9709 +                 held->rw_locked_tree ||
9710 +                 held->read_locked_tree ||
9711 +                 held->write_locked_tree ||
9712 +                 held->rw_locked_dk ||
9713 +                 held->read_locked_dk ||
9714 +                 held->write_locked_dk ||
9715 +                 held->spin_locked_txnh ||
9716 +                 held->spin_locked_atom ||
9717 +                 held->spin_locked_stack ||
9718 +                 held->spin_locked_txnmgr ||
9719 +                 held->spin_locked_inode_object ||
9720 +                 held->spin_locked ||
9721 +                 held->long_term_locked_znode ||
9722 +                 held->inode_sem_r ||
9723 +                 held->inode_sem_w);
9724 +}
9725 +
9726 +/*
9727 + * return true, iff transaction commit can be done under locks held by the
9728 + * current thread.
9729 + */
9730 +int
9731 +commit_check_locks(void)
9732 +{
9733 +       lock_counters_info *held = lock_counters();
9734 +       int saved_sem_r = held->inode_sem_r;
9735 +       int saved_sem_w = held->inode_sem_w;
9736 +       int nothing_else_held;
9737 +
9738 +       /*
9739 +        * inode's read/write semaphore is the only reiser4 lock that can be
9740 +        * held during commit: hide its counters, check that all remaining
9741 +        * counters are zero, and then put the saved values back.
9742 +        */
9743 +       held->inode_sem_r = 0;
9744 +       held->inode_sem_w = 0;
9745 +
9746 +       nothing_else_held = no_counters_are_held();
9747 +
9748 +       held->inode_sem_w = saved_sem_w;
9749 +       held->inode_sem_r = saved_sem_r;
9750 +
9751 +       return nothing_else_held;
9752 +}
9753 +
9754 +/* REISER4_DEBUG_OUTPUT */
9755 +#endif
9756 +
9757 +/* REISER4_DEBUG_SPIN_LOCKS */
9758 +#endif
9759 +
9760 +/*
9761 + * check that all bits specified by @flags are set in ->debug_flags of the
9762 + * super block.
9763 + */
9764 +/* non-zero iff every bit of @flags is also set in ->debug_flags of @super */
9765 +reiser4_internal int reiser4_are_all_debugged(struct super_block *super, __u32 flags)
9766 +{
9767 +       return flags == (get_super_private(super)->debug_flags & flags);
9768 +}
9769 +
9770 +/*
9771 + * check that at least one of the bits specified by @flag is set in
9772 + * ->debug_flags of the super block.
9773 + */
9774 +/* returns the bits of @flag that are set in ->debug_flags of @super (0 if none) */
9775 +reiser4_internal int reiser4_is_debugged(struct super_block *super, __u32 flag)
9776 +{
9777 +       return flag & get_super_private(super)->debug_flags;
9778 +}
9779 +
9780 +#if REISER4_TRACE
9781 +/* tracing setup: global trace flags stored in global variable plus
9782 +   per-thread trace flags plus per-fs trace flags.
9783 +   */
9784 +__u32 get_current_trace_flags(void)
9785 +{
9786 +       reiser4_context *ctx = get_current_context_check();
9787 +
9788 +       /* without a reiser4 context there are no flags to report */
9789 +       if (ctx == NULL)
9790 +               return 0;
9791 +
9792 +       /*
9793 +        * union of the flags of the context and of its super block
9794 +        */
9795 +       return ctx->trace_flags | get_super_private(ctx->super)->trace_flags;
9796 +}
9797 +#endif
9798 +
9799 +#if REISER4_LOG
9800 +
9801 +/* log flags are stored in super block */
9802 +__u32 get_current_log_flags(void)
9803 +{
9804 +       reiser4_context *ctx = get_current_context_check();
9805 +
9806 +       /* without a reiser4 context there are no flags to report */
9807 +       if (ctx == NULL)
9808 +               return 0;
9809 +
9810 +       /* ->log_flags of the super block the current context belongs to */
9811 +       return get_super_private(ctx->super)->log_flags;
9812 +}
9813 +
9814 +/* oid of the file whose page events are to be logged */
9815 +__u32 get_current_oid_to_log(void)
9816 +{
9817 +       reiser4_context *ctx = get_current_context_check();
9818 +
9819 +       /* without a reiser4 context 0 is returned */
9820 +       if (ctx == NULL)
9821 +               return 0;
9822 +
9823 +       /* ->oid_to_log of the super block the current context belongs to */
9824 +       return get_super_private(ctx->super)->oid_to_log;
9825 +}
9826 +
9827 +#endif
9828 +
9829 +/* allocate memory. This calls kmalloc(), performs some additional checks, and
9830 +   keeps track of how much memory was allocated on behalf of current super
9831 +   block. */
9832 +reiser4_internal void *
9833 +reiser4_kmalloc(size_t size /* number of bytes to allocate */ ,
9834 +               int gfp_flag /* allocation flag */ )
9835 +{
9836 +       void *result;   /* what kmalloc() returned; passed back to the caller as is, NULL included */
9837 +
9838 +       assert("nikita-3009", ergo(gfp_flag & __GFP_WAIT, schedulable()));
9839 +
9840 +       result = kmalloc(size, gfp_flag);
9841 +       if (REISER4_DEBUG && result != NULL) {
9842 +               unsigned int usedsize;  /* size reported by ksize(), checked below to be >= @size */
9843 +               reiser4_super_info_data *sbinfo;
9844 +
9845 +               usedsize = ksize(result);
9846 +
9847 +               sbinfo = get_current_super_private();
9848 +
9849 +               assert("nikita-3459", usedsize >= size);
9850 +               assert("nikita-1407", sbinfo != NULL);
9851 +               reiser4_spin_lock_sb(sbinfo);   /* the counter is updated under the super block spin lock */
9852 +               ON_DEBUG(sbinfo->kmalloc_allocated += usedsize);
9853 +               reiser4_spin_unlock_sb(sbinfo);
9854 +       }
9855 +       return result;
9856 +}
9857 +
9858 +/* release memory allocated by reiser4_kmalloc() and update counter. */
9859 +reiser4_internal void
9860 +reiser4_kfree(void *area /* memory to free; must not be NULL */)
9861 +{
9862 +       assert("nikita-1410", area != NULL);
9863 +       reiser4_kfree_in_sb(area, reiser4_get_current_sb());    /* account against the current super block */
9864 +}
9865 +
9866 +/* release memory allocated by reiser4_kmalloc() for the specified
9867 + * super-block. This is useful when memory is released outside of reiser4
9868 + * context */
9869 +reiser4_internal void
9870 +reiser4_kfree_in_sb(void *area /* memory to free */, struct super_block *sb)
9871 +{
9872 +       assert("nikita-2729", area != NULL);
9873 +       if (REISER4_DEBUG) {
9874 +               unsigned int size;      /* size reported by ksize() for @area */
9875 +               reiser4_super_info_data *sbinfo;
9876 +
9877 +               size = ksize(area);
9878 +
9879 +               sbinfo = get_super_private(sb);
9880 +
9881 +               reiser4_spin_lock_sb(sbinfo);   /* the counter is updated under the super block spin lock */
9882 +               assert("nikita-2730", sbinfo->kmalloc_allocated >= (int) size);
9883 +               ON_DEBUG(sbinfo->kmalloc_allocated -= size);
9884 +               reiser4_spin_unlock_sb(sbinfo);
9885 +       }
9886 +       kfree(area);    /* done in all configurations; the accounting above is debug-only */
9887 +}
9888 +
9889 +
9890 +#if defined(CONFIG_REISER4_NOOPT)
9891 +void __you_cannot_kmalloc_that_much(void)      /* only built when CONFIG_REISER4_NOOPT is defined */
9892 +{      /* NOTE(review): presumably satisfies a symbol that kmalloc() references in unoptimized builds -- confirm */
9893 +       BUG();
9894 +}
9895 +#endif
9896 +
9897 +#if REISER4_DEBUG
9898 +
9899 +/*
9900 + * record "error site" (@code, @file, @line and a back-trace) in the current
9901 + * reiser4 context. See comment before RETERR macro in debug.h for details.
9902 + */
9903 +void
9904 +return_err(int code, const char *file, int line)
9905 +{
9906 +       if (code < 0 && is_in_reiser4_context()) { /* ignore code >= 0 and calls outside of reiser4 context */
9907 +               reiser4_context *ctx = get_current_context();
9908 +
9909 +               if (ctx != NULL) {
9910 +                       fill_backtrace(&ctx->err.path,
9911 +                                      REISER4_BACKTRACE_DEPTH, 0);
9912 +                       ctx->err.code = code; /* overwrites whatever site was recorded before */
9913 +                       ctx->err.file = file;
9914 +                       ctx->err.line = line;
9915 +               }
9916 +       }
9917 +}
9918 +
9919 +/*
9920 + * print (via printk) the error information recorded by return_err(), if any.
9921 + */
9922 +void
9923 +report_err(void)
9924 +{
9925 +       reiser4_context *ctx = get_current_context_check();
9926 +
9927 +       if (ctx != NULL) {
9928 +               if (ctx->err.code != 0) { /* code 0: nothing to report */
9929 +#ifdef CONFIG_FRAME_POINTER
9930 +                       int i;
9931 +                       for (i = 0; i < REISER4_BACKTRACE_DEPTH ; ++ i) /* every slot is printed, filled or not */
9932 +                               printk("0x%p ", ctx->err.path.trace[i]);
9933 +                       printk("\n");
9934 +#endif
9935 +                       printk("code: %i at %s:%i\n",
9936 +                              ctx->err.code, ctx->err.file, ctx->err.line);
9937 +               }
9938 +       }
9939 +}
9940 +
9941 +#ifdef CONFIG_FRAME_POINTER
9942 +
9943 +extern int kswapd(void *);
9944 +
9945 +#include <linux/personality.h>
9946 +#include "ktxnmgrd.h"
9947 +#include "repacker.h"
9948 +
9949 +/*
9950 + * true iff @addr is strictly between @start and @end (both bounds excluded)
9951 + */
9952 +static int is_addr_in(void *addr, void *start, void *end)
9953 +{
9954 +       return start < addr && addr < end;
9955 +}
9956 +
9957 +/*
9958 + * stack back-tracing. Also see comments before REISER4_BACKTRACE_DEPTH in
9959 + * debug.h.
9960 + *
9961 + * Stack back-trace is collected through __builtin_return_address() gcc
9962 + * builtin, which requires kernel to be compiled with frame pointers
9963 + * (CONFIG_FRAME_POINTER). Unfortunately, __builtin_return_address() doesn't
9964 + * provide means to detect when bottom of the stack is reached, and just
9965 + * crashes when trying to access non-existent frame.
9966 + *
9967 + * is_last_frame() function works around this (also see more advanced version
9968 + * in the proc-sleep patch that requires modification of core kernel code).
9969 + *
9970 + * This function checks for common cases trying to detect that last stack
9971 + * frame was reached. Returns 1 when @addr looks like the last frame, else 0.
9972 + */
9973 +static int is_last_frame(void *addr)
9974 +{
9975 +       if (addr == NULL)
9976 +               return 1;
9977 +       if (is_addr_in(addr, kswapd, wakeup_kswapd)) /* NOTE(review): assumes each symbol pair brackets the thread function in text -- confirm */
9978 +               return 1;
9979 +       else if (is_addr_in(addr, reiser4_repacker, repacker_d))
9980 +               return 1;
9981 +       else if (is_addr_in(addr, init_ktxnmgrd_context, ktxnmgrd_kick))
9982 +               return 1;
9983 +       else if (is_addr_in(addr, init_entd_context, done_entd_context))
9984 +               return 1;
9985 +       else if (!kernel_text_address((unsigned long)addr))
9986 +               return 1;
9987 +       else
9988 +               return 0;
9989 +}
9990 +
9991 +/*
9992 + * fill @path with up to @depth return addresses, starting at __builtin_return_address(@shift + 2); rest stays zeroed.
9993 + */
9994 +reiser4_internal void
9995 +fill_backtrace(backtrace_path *path, int depth, int shift)
9996 +{
9997 +       int i;
9998 +       void *addr;
9999 +
10000 +       cassert(REISER4_BACKTRACE_DEPTH == 4);
10001 +       assert("nikita-3229", shift < 6);
10002 +
10003 +       /* long live Duff! */
10004 +
10005 +#define FRAME(nr)                                              \
10006 +       case (nr):                                              \
10007 +               addr  = __builtin_return_address((nr) + 2);     \
10008 +               break
10009 +
10010 +       xmemset(path, 0, sizeof *path);
10011 +       addr = NULL;
10012 +       /*
10013 +        * we need this silly loop, because __builtin_return_address() only
10014 +        * accepts _constant_ arguments. It is reminiscent of Duff's device
10015 +        * (http://www.faqs.org/docs/jargon/D/Duff's-device.html) which
10016 +        * explains the reference above.
10017 +        */
10018 +       for (i = 0; i < depth; ++ i) {
10019 +               switch(i + shift) {
10020 +                       FRAME(0);
10021 +                       FRAME(1);
10022 +                       FRAME(2);
10023 +                       FRAME(3);
10024 +                       FRAME(4);
10025 +                       FRAME(5);
10026 +                       FRAME(6);
10027 +                       FRAME(7);
10028 +                       FRAME(8);
10029 +                       FRAME(9);
10030 +                       FRAME(10);
10031 +               default:
10032 +                       impossible("nikita-3230", "everything is wrong");
10033 +               }
10034 +               path->trace[i] = addr; /* NOTE(review): nothing here checks depth <= REISER4_BACKTRACE_DEPTH */
10035 +               if (is_last_frame(addr)) /* stop early: remaining slots keep the zero fill */
10036 +                       break;
10037 +       }
10038 +}
10039 +#endif
10040 +
10041 +/*
10042 + * hook meant to be called by assert() macro on each invocation (that call is
10043 + * commented out in debug.h). Convenient place to put some debugging code
10044 + * that has to be executed very frequently. _Very_.
10045 + */
10046 +void call_on_each_assert(void)
10047 +{
10048 +       return; /* disabled: everything below is unreachable */
10049 +       /*
10050 +        * DON'T USE ASSERTIONS HERE :)
10051 +        */
10052 +       if (is_in_reiser4_context()) {
10053 +               reiser4_super_info_data *sinfo;
10054 +               reiser4_context *ctx;
10055 +
10056 +               ctx = (reiser4_context *) current->journal_info;
10057 +               sinfo = ctx->super->s_fs_info;
10058 +               /* put checks here */
10059 +       }
10060 +}
10061 +
10062 +/* REISER4_DEBUG */
10063 +#endif
10064 +
10065 +#if KERNEL_DEBUGGER
10066 +/*
10067 + * calls breakpoint() when CONFIG_KGDB is defined and CONFIG_REISER4_FS_MODULE
10068 + * is not; otherwise does nothing. It is a convenient place to put breakpoint in.
10069 + */
10070 +void debugtrap(void)
10071 +{
10072 +       /* do nothing. Put break point here. */
10073 +#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
10074 +       extern void breakpoint(void);
10075 +       breakpoint();
10076 +#endif
10077 +}
10078 +#endif
10079 +
10080 +
10081 +/* debugging tool: fixed-size ring of operation records
10082 +   use clog_op to make a record
10083 +   use print_clog to see last CLOG_LENGTH records
10084 + */
10085 +#define CLOG_LENGTH 256
10086 +static spinlock_t clog_lock = SPIN_LOCK_UNLOCKED; /* taken by clog_op() around every update */
10087 +
10088 +typedef struct {
10089 +       int id; /* sequence number taken from clog_id */
10090 +       pid_t pid; /* current->pid at the time of clog_op() */
10091 +       int op; /* operation code, turned into a name by op2str() */
10092 +       void *data1; /* caller-supplied, printed with %p */
10093 +       void *data2; /* caller-supplied, printed with %p */
10094 +} clog_t;
10095 +
10096 +clog_t clog[CLOG_LENGTH];
10097 +
10098 +int clog_start = 0; /* index of the oldest record; stays 0 until the ring is full */
10099 +int clog_length = 0; /* number of valid records, grows up to CLOG_LENGTH */
10100 +int clog_id = 0; /* id for the next record, incremented by every clog_op() */
10101 +
10102 +void
10103 +clog_op(int op, void *data1, void *data2) /* add one record to clog[] under clog_lock */
10104 +{
10105 +       spin_lock(&clog_lock);
10106 +
10107 +       if (clog_length == CLOG_LENGTH) { /* ring is full: overwrite the oldest record */
10108 +               clog[clog_start].id = clog_id ++;
10109 +               clog[clog_start].op = op;
10110 +               clog[clog_start].pid = current->pid;
10111 +               clog[clog_start].data1 = data1;
10112 +               clog[clog_start].data2 = data2;
10113 +               clog_start ++;
10114 +               clog_start %= CLOG_LENGTH; /* wrap around */
10115 +       } else { /* still filling: append at clog_length */
10116 +               assert("vs-1672", clog_start == 0);
10117 +               clog[clog_length].id = clog_id ++;
10118 +               clog[clog_length].op = op;
10119 +               clog[clog_length].pid = current->pid;
10120 +               clog[clog_length].data1 = data1;
10121 +               clog[clog_length].data2 = data2;
10122 +               clog_length ++;
10123 +       }
10124 +
10125 +       spin_unlock(&clog_lock);
10126 +}
10127 +
10128 +static const char *
10129 +op2str(int op) /* name of clog operation code @op; "unknown" if @op is outside [0, OP_NUM) */
10130 +{
10131 +       static const char *op_names[OP_NUM] = { /* indexed by the operation codes #defined in debug.h */
10132 +               "get-user-page",
10133 +               "put_user-page",
10134 +               "ex-write-in",
10135 +               "ex-write-out",
10136 +               "readp-in",
10137 +               "readp-out",
10138 +               "ex-write-in-nr-locks",
10139 +               "ex-write-out-nr-locks",
10140 +               "link-object",
10141 +               "unlink-object"
10142 +       };
10143 +       assert("vs-1673", 0 <= op && op < OP_NUM); /* @op indexes op_names[]: both bounds matter */
10144 +       return (0 <= op && op < OP_NUM) ? op_names[op] : "unknown"; /* assert() is a no-op without REISER4_DEBUG */
10145 +}
10146 +
10147 +void
10148 +print_clog(void) /* printk every valid clog[] record, starting from clog_start */
10149 +{
10150 +       int i, j;
10151 +
10152 +       j = clog_start; /* NOTE(review): clog_lock is not taken here, clog_op() may run concurrently */
10153 +       for (i = 0; i < clog_length; i ++) {
10154 +               printk("%d(%d): id %d: pid %d, op %s, data1 %p, data2 %p\n",
10155 +                      i, j, clog[j].id, clog[j].pid, op2str(clog[j].op), clog[j].data1, clog[j].data2);
10156 +               j ++;
10157 +               j %= CLOG_LENGTH; /* wrap around the ring */
10158 +       }
10159 +       printk("clog length %d\n", clog_length);
10160 +}
10161 +
10162 +#if 0
10163 +void
10164 +print_symname(unsigned long address)
10165 +{
10166 +       char         *module;
10167 +       const char   *name;
10168 +       char          namebuf[128];
10169 +       unsigned long offset;
10170 +       unsigned long size;
10171 +
10172 +       name = kallsyms_lookup(address, &size, &offset, &module, namebuf);
10173 +       if (name != NULL)
10174 +               printk("  %s[%lx/%lx]", name, offset, size);
10175 +}
10176 +#endif
10177 +
10178 +/* Make Linus happy.
10179 +   Local variables:
10180 +   c-indentation-style: "K&R"
10181 +   mode-name: "LC"
10182 +   c-basic-offset: 8
10183 +   tab-width: 8
10184 +   fill-column: 120
10185 +   End:
10186 +*/
10187 diff -rupN linux-2.6.8-rc3/fs/reiser4/debug.h linux-2.6.8-rc3-a/fs/reiser4/debug.h
10188 --- linux-2.6.8-rc3/fs/reiser4/debug.h  1970-01-01 03:00:00.000000000 +0300
10189 +++ linux-2.6.8-rc3-a/fs/reiser4/debug.h        2004-08-05 21:20:53.345605178 +0400
10190 @@ -0,0 +1,559 @@
10191 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10192 +
10193 +/* Declarations of debug macros. */
10194 +
10195 +#if !defined( __FS_REISER4_DEBUG_H__ )
10196 +#define __FS_REISER4_DEBUG_H__
10197 +
10198 +#include "forward.h"
10199 +#include "reiser4.h"
10200 +
10201 +
10202 +/* generic function to produce formatted output, decorating it with
10203 +   whatever standard prefixes/postfixes we want. "Fun" is a function
10204 +   that will be actually called, can be printk, panic etc.
10205 +   This is for use by other debugging macros, not by users. */
10206 +#define DCALL(lev, fun, reperr, label, format, ...)            \
10207 +({                                                             \
10208 +       reiser4_print_prefix(lev, reperr, label,                \
10209 +                            __FUNCTION__, __FILE__, __LINE__); \
10210 +       fun(lev format "\n" , ## __VA_ARGS__);                  \
10211 +})
10212 +
10213 +/*
10214 + * cause kernel to crash
10215 + */
10216 +#define reiser4_panic(mid, format, ...)                                \
10217 +       DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
10218 +
10219 +/* print message with indication of current process, file, line and
10220 +   function */
10221 +#define reiser4_log(label, format, ...)                                \
10222 +       DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
10223 +
10224 +/* Assertion checked during compilation.
10225 +    If "cond" is false (0) we get duplicate case label in switch.
10226 +    Use this to check something like famous
10227 +       cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
10228 +    in 3.x journal.c. If cassertion fails you get compiler error,
10229 +    so no "maintainer-id".
10230 +*/
10231 +#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })
10232 +
10233 +#define noop   do {;} while(0)
10234 +
10235 +#if REISER4_DEBUG
10236 +/* version of info that only actually prints anything when _d_ebugging
10237 +    is on */
10238 +#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
10239 +/* macro to catch logical errors. Put it into `default' clause of
10240 +    switch() statement. */
10241 +#define impossible(label, format, ...)                         \
10242 +         reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
10243 +/* assert assures that @cond is true. If it is not, reiser4_panic() is
10244 +   called. Use this for checking logical consistency and _never_ call
10245 +   this to check correctness of external data: disk blocks and user-input . */
10246 +#define assert(label, cond)                                            \
10247 +({                                                                     \
10248 +       /* call_on_each_assert(); */                                    \
10249 +       if (cond) {                                             \
10250 +               /* put negated check to avoid using !(cond) that would lose \
10251 +                * warnings for things like assert(a = b); */           \
10252 +               ;                                                       \
10253 +       } else {                                                        \
10254 +               DEBUGON(1);                                             \
10255 +               reiser4_panic(label, "assertion failed: %s", #cond);    \
10256 +       }                                                               \
10257 +})
10258 +
10259 +/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
10260 +#define check_me( label, expr )        assert( label, ( expr ) )
10261 +
10262 +#define ON_DEBUG( exp ) exp
10263 +
10264 +extern int schedulable(void);
10265 +extern void call_on_each_assert(void);
10266 +
10267 +#else
10268 +
10269 +#define dinfo( format, args... ) noop
10270 +#define impossible( label, format, args... ) noop
10271 +#define assert( label, cond ) noop
10272 +#define check_me( label, expr )        ( ( void ) ( expr ) )
10273 +#define ON_DEBUG( exp )
10274 +#define schedulable() might_sleep()
10275 +
10276 +/* REISER4_DEBUG */
10277 +#endif
10278 +
10279 +#if REISER4_DEBUG_SPIN_LOCKS
10280 +/* per-thread information about lock acquired by this thread. Used by lock
10281 + * ordering checking in spin_macros.h */
10282 +typedef struct lock_counters_info {
10283 +       int rw_locked_tree;
10284 +       int read_locked_tree;
10285 +       int write_locked_tree;
10286 +
10287 +       int rw_locked_dk;
10288 +       int read_locked_dk;
10289 +       int write_locked_dk;
10290 +
10291 +       int rw_locked_cbk_cache;
10292 +       int read_locked_cbk_cache;
10293 +       int write_locked_cbk_cache;
10294 +
10295 +       int rw_locked_zlock;
10296 +       int read_locked_zlock;
10297 +       int write_locked_zlock;
10298 +
10299 +       int spin_locked_jnode;
10300 +       int spin_locked_jload;
10301 +       int spin_locked_txnh;
10302 +       int spin_locked_atom;
10303 +       int spin_locked_stack;
10304 +       int spin_locked_txnmgr;
10305 +       int spin_locked_ktxnmgrd;
10306 +       int spin_locked_fq;
10307 +       int spin_locked_super;
10308 +       int spin_locked_inode_object;
10309 +       int spin_locked_epoch;
10310 +       int spin_locked_super_eflush;
10311 +       int spin_locked;
10312 +       int long_term_locked_znode;
10313 +
10314 +       int inode_sem_r;
10315 +       int inode_sem_w;
10316 +
10317 +       int d_refs;
10318 +       int x_refs;
10319 +       int t_refs;
10320 +} lock_counters_info;
10321 +
10322 +extern lock_counters_info *lock_counters(void);
10323 +#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
10324 +
10325 +/* increment lock-counter @counter, if present */
10326 +#define LOCK_CNT_INC(counter) IN_CONTEXT(++(lock_counters()->counter), 0)
10327 +
10328 +/* decrement lock-counter @counter, if present */
10329 +#define LOCK_CNT_DEC(counter) IN_CONTEXT(--(lock_counters()->counter), 0)
10330 +
10331 +/* check that lock-counter is zero. This is for use in assertions */
10332 +#define LOCK_CNT_NIL(counter) IN_CONTEXT(lock_counters()->counter == 0, 1)
10333 +
10334 +/* check that lock-counter is greater than zero. This is for use in
10335 + * assertions */
10336 +#define LOCK_CNT_GTZ(counter) IN_CONTEXT(lock_counters()->counter > 0, 1)
10337 +
10338 +/* REISER4_DEBUG_SPIN_LOCKS */
10339 +#else
10340 +
10341 +/* no-op versions of the above */
10342 +
10343 +typedef struct lock_counters_info {
10344 +} lock_counters_info;
10345 +#define lock_counters() ((lock_counters_info *)NULL)
10346 +#define LOCK_CNT_INC(counter) noop
10347 +#define LOCK_CNT_DEC(counter) noop
10348 +#define LOCK_CNT_NIL(counter) (1)
10349 +#define LOCK_CNT_GTZ(counter) (1)
10350 +/* REISER4_DEBUG_SPIN_LOCKS */
10351 +#endif
10352 +
10353 +/*
10354 + * back-trace recording. In several places in reiser4 we want to record stack
10355 + * back-trace for debugging purposes. This functionality is only supported
10356 + * when kernel was configured with CONFIG_FRAME_POINTER option.
10357 + */
10358 +
10359 +#ifdef CONFIG_FRAME_POINTER
10360 +
10361 +/*
10362 + * how many stack frames to record in back-trace.
10363 + *
10364 + * update debug.c:fill_backtrace() if you change this
10365 + */
10366 +#define REISER4_BACKTRACE_DEPTH (4)
10367 +
10368 +/*
10369 + * data type to store stack back-trace
10370 + */
10371 +typedef struct {
10372 +       void *trace[REISER4_BACKTRACE_DEPTH];
10373 +} backtrace_path;
10374 +
10375 +extern void fill_backtrace(backtrace_path *path, int depth, int shift);
10376 +#else
10377 +
10378 +/* no-op versions of the above */
10379 +
10380 +typedef struct {} backtrace_path;
10381 +#define fill_backtrace(path, depth, shift) noop
10382 +
10383 +#endif
10384 +
10385 +
10386 +/* flags controlling debugging behavior. Are set through debug_flags=N mount
10387 +   option. */
10388 +typedef enum {
10389 +       /* print a lot of information during panic. When this is on all jnodes
10390 +        * are listed. This can be *very* large output. Usually you don't want
10391 +        * this. Especially over serial line. */
10392 +       REISER4_VERBOSE_PANIC = 0x00000001,
10393 +       /* print a lot of information during umount */
10394 +       REISER4_VERBOSE_UMOUNT = 0x00000002,
10395 +       /* print gathered statistics on umount */
10396 +       REISER4_STATS_ON_UMOUNT = 0x00000004,
10397 +       /* check node consistency */
10398 +       REISER4_CHECK_NODE = 0x00000008
10399 +} reiser4_debug_flags;
10400 +
10401 +extern int reiser4_are_all_debugged(struct super_block *super, __u32 flags);
10402 +extern int reiser4_is_debugged(struct super_block *super, __u32 flag);
10403 +
10404 +extern int is_in_reiser4_context(void);
10405 +
10406 +/*
10407 + * evaluate expression @e only when within reiser4 context
10408 + */
10409 +#define ON_CONTEXT(e)  do {                    \
10410 +       if(is_in_reiser4_context()) {           \
10411 +               e;                              \
10412 +       } } while(0)
10413 +
10414 +/*
10415 + * evaluate expression @e only when within reiser4_context and debugging is
10416 + * on.
10417 + */
10418 +#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) )
10419 +
10420 +#if REISER4_DEBUG_MODIFY
10421 +/*
10422 + * evaluate expression @exp only if REISER4_DEBUG_MODIFY mode is on.
10423 + */
10424 +#define ON_DEBUG_MODIFY( exp ) exp
10425 +#else
10426 +#define ON_DEBUG_MODIFY( exp )
10427 +#endif
10428 +
10429 +/*
10430 + * complain about unexpected function result and crash. Used in "default"
10431 + * branches of switch statements and alike to assert that invalid results are
10432 + * not silently ignored.
10433 + */
10434 +#define wrong_return_value( label, function )                          \
10435 +       impossible( label, "wrong return value from " function )
10436 +
10437 +/* Issue warning message to the console */
10438 +#define warning( label, format, ... )                                  \
10439 +       DCALL( KERN_WARNING,                                            \
10440 +              printk, 1, label, "WARNING: " format , ## __VA_ARGS__ )
10441 +
10442 +/* mark not yet implemented functionality */
10443 +#define not_yet( label, format, ... )                          \
10444 +       reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ )
10445 +
10446 +#if REISER4_TRACE
10447 +/* helper macro for tracing, see trace_stamp() below. */
10448 +#define IF_TRACE(flags, e)                                                     \
10449 +       if(get_current_trace_flags() & (flags)) e
10450 +#else
10451 +#define IF_TRACE(flags, e) noop
10452 +#endif
10453 +
10454 +/* just print where we are: file, function, line */
10455 +#define trace_stamp( f )   IF_TRACE( f, reiser4_log( "trace", "" ) )
10456 +/* print value of "var" */
10457 +#define trace_var( f, format, var )                            \
10458 +        IF_TRACE( f, reiser4_log( "trace", #var ": " format, var ) )
10459 +/* print output only if appropriate trace flag(s) is on */
10460 +#define ON_TRACE( f, ... )   IF_TRACE(f, printk(__VA_ARGS__))
10461 +
10462 +/* tracing flags: one bit each, tested against get_current_trace_flags() by IF_TRACE(). */
10463 +typedef enum {
10464 +       /* trace nothing */
10465 +       NO_TRACE = 0,
10466 +       /* trace vfs interaction functions from vfs_ops.c */
10467 +       TRACE_VFS_OPS = (1 << 0),       /* 0x00000001 */
10468 +       /* trace plugin handling functions */
10469 +       TRACE_PLUGINS = (1 << 1),       /* 0x00000002 */
10470 +       /* trace tree traversals */
10471 +       TRACE_TREE = (1 << 2),  /* 0x00000004 */
10472 +       /* trace znode manipulation functions */
10473 +       TRACE_ZNODES = (1 << 3),        /* 0x00000008 */
10474 +       /* trace node layout functions */
10475 +       TRACE_NODES = (1 << 4), /* 0x00000010 */
10476 +       /* trace directory functions */
10477 +       TRACE_DIR = (1 << 5),   /* 0x00000020 */
10478 +       /* trace flush code verbosely */
10479 +       TRACE_FLUSH_VERB = (1 << 6),    /* 0x00000040 */
10480 +       /* trace flush code */
10481 +       TRACE_FLUSH = (1 << 7), /* 0x00000080 */
10482 +       /* trace carry */
10483 +       TRACE_CARRY = (1 << 8), /* 0x00000100 */
10484 +       /* trace how tree (web) of znodes is maintained through tree
10485 +          balancings. */
10486 +       TRACE_ZWEB = (1 << 9),  /* 0x00000200 */
10487 +       /* trace transactions. */
10488 +       TRACE_TXN = (1 << 10),  /* 0x00000400 */
10489 +       /* trace object id allocation/releasing */
10490 +       TRACE_OIDS = (1 << 11), /* 0x00000800 */
10491 +       /* trace item shifts */
10492 +       TRACE_SHIFT = (1 << 12),        /* 0x00001000 */
10493 +       /* trace page cache */
10494 +       TRACE_PCACHE = (1 << 13),       /* 0x00002000 */
10495 +       /* trace extents */
10496 +       TRACE_EXTENTS = (1 << 14),      /* 0x00004000 */
10497 +       /* trace locks */
10498 +       TRACE_LOCKS = (1 << 15),        /* 0x00008000 */
10499 +       /* trace coords */
10500 +       TRACE_COORDS = (1 << 16),       /* 0x00010000 */
10501 +       /* trace read-IO functions */
10502 +       TRACE_IO_R = (1 << 17), /* 0x00020000 */
10503 +       /* trace write-IO functions */
10504 +       TRACE_IO_W = (1 << 18), /* 0x00040000 */
10505 +
10506 +       /* trace log writing */
10507 +       TRACE_LOG = (1 << 19),  /* 0x00080000 */
10508 +
10509 +       /* trace journal replaying */
10510 +       TRACE_REPLAY = (1 << 20),       /* 0x00100000 */
10511 +
10512 +       /* trace space allocation */
10513 +       TRACE_ALLOC = (1 << 21),        /* 0x00200000 */
10514 +
10515 +       /* trace space reservation */
10516 +       TRACE_RESERVE = (1 << 22),      /* 0x00400000 */
10517 +
10518 +       /* trace emergency flush */
10519 +       TRACE_EFLUSH  = (1 << 23),      /* 0x00800000 */
10520 +
10521 +       /* trace ctails */
10522 +       TRACE_CTAIL = (1 << 24),       /* 0x01000000 */
10523 +
10524 +       TRACE_PARSE = (1 << 25),       /* 0x02000000 */
10525 +
10526 +       TRACE_CAPTURE_COPY = (1 << 26), /* 0x04000000 */
10527 +
10528 +       TRACE_EXTENT_ALLOC = (1 << 27),      /* 0x08000000 */
10529 +
10530 +       TRACE_CAPTURE_ANONYMOUS = (1 << 28), /* 0x10000000 */
10531 +
10532 +       /* vague section: used to trace bugs. Use it to issue optional prints
10533 +          at arbitrary points of code. */
10534 +       TRACE_BUG = (1u << 31), /* 0x80000000; unsigned because 1 << 31 overflows int */
10535 +
10536 +       /* trace everything above */
10537 +       TRACE_ALL = 0xffffffffu
10538 +} reiser4_trace_flags;
10539 +
10540 +#if REISER4_LOG
10541 +/* helper macro for tracing, see trace_stamp() below. */
10542 +#define IF_LOG(flags, e)                                                       \
10543 +       if(get_current_log_flags() & (flags)) e
10544 +#else
10545 +#define IF_LOG(flags, e) noop
10546 +#endif
10547 +
10548 +/* log only if appropriate log flag(s) is on */
10549 +#define ON_LOG( f, ... )   IF_LOG(f, printk(__VA_ARGS__))
10550 +
10551 +typedef enum {
10552 +       WRITE_NODE_LOG = (1 << 0),      /* log [zj]node operations */
10553 +       WRITE_PAGE_LOG = (1 << 1),      /* log make_extent calls */
10554 +       WRITE_IO_LOG = (1 << 2),        /* log i/o requests */
10555 +       WRITE_TREE_LOG = (1 << 3),      /* log internal tree operations */
10556 +       WRITE_SYSCALL_LOG = (1 << 4),   /* log system calls */
10557 +       READAHEAD_LOG = (1 << 5),       /* log read-ahead activity */
10558 +       ALLOC_EXTENT_LOG = (1 << 6),    /* log extent allocation */
10559 +       LOG_FILE_PAGE_EVENT = (1 << 7)  /* log events happened to certain file */
10560 +} reiser4_log_flags;
10561 +
10562 +
10563 +extern void reiser4_do_panic(const char *format, ...)
10564 +__attribute__ ((noreturn, format(printf, 1, 2)));
10565 +
10566 +extern void reiser4_print_prefix(const char *level, int reperr, const char *mid,
10567 +                                const char *function,
10568 +                                const char *file, int lineno);
10569 +
10570 +extern int preempt_point(void);
10571 +extern void reiser4_print_stats(void);
10572 +
10573 +extern void *reiser4_kmalloc(size_t size, int gfp_flag);
10574 +extern void reiser4_kfree(void *area);
10575 +extern void reiser4_kfree_in_sb(void *area, struct super_block *sb);
10576 +extern __u32 get_current_trace_flags(void);
10577 +extern __u32 get_current_log_flags(void);
10578 +extern __u32 get_current_oid_to_log(void);
10579 +
10580 +#if REISER4_DEBUG_OUTPUT && REISER4_DEBUG_SPIN_LOCKS
10581 +extern void print_lock_counters(const char *prefix,
10582 +                               const lock_counters_info * info);
10583 +extern int no_counters_are_held(void);
10584 +extern int commit_check_locks(void);
10585 +#else
10586 +#define print_lock_counters(p, i) noop
10587 +#define no_counters_are_held() (1)
10588 +#define commit_check_locks() (1)
10589 +#endif
10590 +
10591 +#define REISER4_STACK_ABORT          (8192 - sizeof(struct thread_info) - 30)
10592 +#define REISER4_STACK_GAP            (REISER4_STACK_ABORT - 100)
10593 +
10594 +#if REISER4_DEBUG_MEMCPY
10595 +extern void *xmemcpy(void *dest, const void *src, size_t n);
10596 +extern void *xmemmove(void *dest, const void *src, size_t n);
10597 +extern void *xmemset(void *s, int c, size_t n);
10598 +#else
10599 +#define xmemcpy( d, s, n ) memcpy( ( d ), ( s ), ( n ) )
10600 +#define xmemmove( d, s, n ) memmove( ( d ), ( s ), ( n ) )
10601 +#define xmemset( s, c, n ) memset( ( s ), ( c ), ( n ) )
10602 +#endif
10603 +
10604 +/* true if @i is power-of-two (also true for 0). Useful for rate-limited warnings, etc. */
10605 +#define IS_POW(i)                              \
10606 +({                                             \
10607 +       typeof(i) __i;                          \
10608 +                                               \
10609 +       __i = (i);                              \
10610 +       !(__i & (__i - 1));                     \
10611 +})
10612 +
10613 +#define KERNEL_DEBUGGER (1)
10614 +
10615 +#if KERNEL_DEBUGGER
10616 +/*
10617 + * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
10618 + * kgdb is not compiled in, do nothing.
10619 + */
10620 +#define DEBUGON(cond)                          \
10621 +({                                             \
10622 +       extern void debugtrap(void);            \
10623 +                                               \
10624 +       if (unlikely(cond))                     \
10625 +               debugtrap();                    \
10626 +})
10627 +#else
10628 +#define DEBUGON(cond) noop
10629 +#endif
10630 +
10631 +/*
10632 + * Error code tracing facility. (Idea is borrowed from XFS code.)
10633 + *
10634 + * Suppose some strange and/or unexpected code is returned from some function
10635 + * (for example, write(2) returns -EEXIST). It is possible to place a
10636 + * breakpoint in the reiser4_write(), but it is too late here. How to find out
10637 + * in what particular place -EEXIST was generated first?
10638 + *
10639 + * In reiser4 all places where actual error codes are produced (that is,
10640 + * statements of the form
10641 + *
10642 + *     return -EFOO;        // (1), or
10643 + *
10644 + *     result = -EFOO;      // (2)
10645 + *
10646 + * are replaced with
10647 + *
10648 + *     return RETERR(-EFOO);        // (1a), and
10649 + *
10650 + *     result = RETERR(-EFOO);      // (2a) respectively
10651 + *
10652 + * RETERR() macro fills a backtrace in reiser4_context. This back-trace is
10653 + * printed in error and warning messages. Moreover, it's possible to put a
10654 + * conditional breakpoint in return_err (low-level function called by RETERR()
10655 + * to do the actual work) to break into debugger immediately when particular
10656 + * error happens.
10657 + *
10658 + */
10659 +
10660 +#if REISER4_DEBUG
10661 +
10662 +/*
10663 + * data-type to store information about where error happened ("error site").
10664 + */
10665 +typedef struct err_site {
10666 +       backtrace_path path; /* stack back trace of error */
10667 +       int            code; /* error code */
10668 +       const char    *file; /* source file, filled by __FILE__ */
10669 +       int            line; /* source file line, filled by __LINE__ */
10670 +} err_site;
10671 +
10672 +extern void return_err(int code, const char *file, int line);
10673 +extern void report_err(void);
10674 +
10675 +/*
10676 + * fill &get_current_context()->err (the "error site") with error information.
10677 + */
10678 +#define RETERR(code)                           \
10679 +({                                             \
10680 +       typeof(code) __code;                    \
10681 +                                               \
10682 +       __code = (code);                        \
10683 +       return_err(__code, __FILE__, __LINE__); \
10684 +       __code;                                 \
10685 +})
10686 +
10687 +#else
10688 +
10689 +/*
10690 + * no-op versions of the above: RETERR() just yields its (parenthesized) argument
10691 + */
10692 +
10693 +typedef struct err_site {} err_site;
10694 +#define RETERR(code) (code)
10695 +#define report_err() noop
10696 +#endif
10697 +
10698 +#if REISER4_LARGE_KEY
10699 +/*
10700 + * conditionally compile arguments only if REISER4_LARGE_KEY is on.
10701 + */
10702 +#define ON_LARGE_KEY(...) __VA_ARGS__
10703 +#else
10704 +#define ON_LARGE_KEY(...)
10705 +#endif
10706 +
10707 +#if REISER4_ALL_IN_ONE
10708 +/*
10709 + * declarator used by REISER4_ALL_IN_ONE mode. Every reiser4 function that is
10710 + * not used externally (that is, not used by non-reiser4 code) should be
10711 + * tagged with this. Normally it expands to nothing. In REISER4_ALL_IN_ONE
10712 + * expands to statics allowing compiler to perform better optimization.
10713 + */
10714 +#define reiser4_internal static
10715 +#else
10716 +#define reiser4_internal
10717 +#endif
10718 +
10719 +/* operations to clog */
10720 +/* debugging re-entrance */
10721 +
10722 +#define GET_USER_PAGES 0
10723 +#define PUT_USER_PAGES 1
10724 +#define EXTENT_WRITE_IN 2
10725 +#define EXTENT_WRITE_OUT 3
10726 +#define READPAGE_IN 4
10727 +#define READPAGE_OUT 5
10728 +#define EXTENT_WRITE_IN2 6
10729 +#define EXTENT_WRITE_OUT2 7
10730 +#define LINK_OBJECT 8
10731 +#define UNLINK_OBJECT 9
10732 +
10733 +#define OP_NUM 10
10734 +
10735 +void clog_op(int op, void *, void *);
10736 +void print_clog(void);
10737 +
10738 +/* __FS_REISER4_DEBUG_H__ */
10739 +#endif
10740 +
10741 +/* Make Linus happy.
10742 +   Local variables:
10743 +   c-indentation-style: "K&R"
10744 +   mode-name: "LC"
10745 +   c-basic-offset: 8
10746 +   tab-width: 8
10747 +   fill-column: 120
10748 +   End:
10749 +*/
10750 diff -rupN linux-2.6.8-rc3/fs/reiser4/dformat.h linux-2.6.8-rc3-a/fs/reiser4/dformat.h
10751 --- linux-2.6.8-rc3/fs/reiser4/dformat.h        1970-01-01 03:00:00.000000000 +0300
10752 +++ linux-2.6.8-rc3-a/fs/reiser4/dformat.h      2004-08-05 21:20:53.492574179 +0400
10753 @@ -0,0 +1,164 @@
10754 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10755 +
10756 +/* Formats of on-disk data and conversion functions. */
10757 +
10758 +/* put all item formats in the files describing the particular items,
10759 +   our model is, everything you need to do to add an item to reiser4,
10760 +   (excepting the changes to the plugin that uses the item which go
10761 +   into the file defining that plugin), you put into one file. */
10762 +/* Data on disk are stored in little-endian format.
10763 +   To declare fields of on-disk structures, use d8, d16, d32 and d64.
10764 +   d??tocpu() and cputod??() to convert. */
10765 +
10766 +#if !defined( __FS_REISER4_DFORMAT_H__ )
10767 +#define __FS_REISER4_DFORMAT_H__
10768 +
10769 +
10770 +#include <asm/byteorder.h>
10771 +#include <asm/unaligned.h>
10772 +#include <linux/types.h>
10773 +
10774 +/* our default disk byteorder is little endian */
10775 +
10776 +#if defined( __LITTLE_ENDIAN )
10777 +#define CPU_IN_DISK_ORDER  (1)
10778 +#else
10779 +#define CPU_IN_DISK_ORDER  (0)
10780 +#endif
10781 +
10782 +/* on-disk data-types are wrapped into single-field structs so that the compiler
10783 +   type-checks their use (the same trick as in include/asm-i386/page.h) */
10784 +typedef struct d8 {
10785 +       __u8 datum;     /* access through d8tocpu()/cputod8() */
10786 +} d8 __attribute__ ((aligned(1)));
10787 +typedef struct d16 {
10788 +       __u16 datum;    /* little-endian; access through d16tocpu()/cputod16() */
10789 +} d16 __attribute__ ((aligned(2)));
10790 +typedef struct d32 {
10791 +       __u32 datum;    /* little-endian; access through d32tocpu()/cputod32() */
10792 +} d32 __attribute__ ((aligned(4)));
10793 +typedef struct d64 {
10794 +       __u64 datum;    /* little-endian; access through d64tocpu()/cputod64() */
10795 +} d64 __attribute__ ((aligned(8)));
10796 +
10797 +#define PACKED __attribute__((packed))
10798 +
10799 +static inline __u8
10800 +d8tocpu(const d8 * ondisk /* on-disk byte to read; returned as is */ )
10801 +{
10802 +       return ondisk->datum;
10803 +}
10804 +
10805 +static inline __u16
10806 +d16tocpu(const d16 * ondisk /* little-endian on-disk value, read with get_unaligned() */ )
10807 +{
10808 +       return __le16_to_cpu(get_unaligned(&ondisk->datum));
10809 +}
10810 +
10811 +static inline __u32
10812 +d32tocpu(const d32 * ondisk /* little-endian on-disk value, read with get_unaligned() */ )
10813 +{
10814 +       return __le32_to_cpu(get_unaligned(&ondisk->datum));
10815 +}
10816 +
10817 +static inline __u64
10818 +d64tocpu(const d64 * ondisk /* little-endian on-disk value, read with get_unaligned() */ )
10819 +{
10820 +       return __le64_to_cpu(get_unaligned(&ondisk->datum));
10821 +}
10822 +
10823 +static inline d8 *
10824 +cputod8(unsigned int oncpu /* CPU value to convert, asserted to be below 0x100 */ ,
10825 +       d8 * ondisk /* where to store the result; also the return value */ )
10826 +{
10827 +       assert("nikita-1264", oncpu < 0x100);
10828 +       put_unaligned(oncpu, &ondisk->datum);
10829 +       return ondisk;
10830 +}
10831 +
10832 +static inline d16 *
10833 +cputod16(unsigned int oncpu /* CPU value to convert, asserted to be below 0x10000 */ ,
10834 +        d16 * ondisk /* where to store the little-endian result; also the return value */ )
10835 +{
10836 +       assert("nikita-1265", oncpu < 0x10000);
10837 +       put_unaligned(__cpu_to_le16(oncpu), &ondisk->datum);
10838 +       return ondisk;
10839 +}
10840 +
10841 +static inline d32 *
10842 +cputod32(__u32 oncpu /* CPU value to convert to little-endian */ ,
10843 +        d32 * ondisk /* where to store the result; also the return value */ )
10844 +{
10845 +       put_unaligned(__cpu_to_le32(oncpu), &ondisk->datum);
10846 +       return ondisk;
10847 +}
10848 +
10849 +static inline d64 *
10850 +cputod64(__u64 oncpu /* CPU value to convert to little-endian */ ,
10851 +        d64 * ondisk /* where to store the result; also the return value */ )
10852 +{
10853 +       put_unaligned(__cpu_to_le64(oncpu), &ondisk->datum);
10854 +       return ondisk;
10855 +}
10856 +
10857 +/* data-type for block number on disk: these types enable changing the block
10858 +   size to other sizes, but they are only a start.  Suppose we wanted to
10859 +   support 48bit block numbers.  The dblock_nr blk would be changed to "short
10860 +   blk[3]".  The block_nr type should remain an integral type greater or equal
10861 +   to the dblock_nr type in size so that CPU arithmetic operations work. */
10862 +typedef __u64 reiser4_block_nr;
10863 +
10864 +/* data-type for block number on disk, disk format */
10865 +union reiser4_dblock_nr {
10866 +       d64 blk;
10867 +};
10868 +
10869 +static inline reiser4_block_nr
10870 +dblock_to_cpu(const reiser4_dblock_nr * dblock /* on-disk block number to convert */ )
10871 +{
10872 +       return d64tocpu(&dblock->blk);
10873 +}
10874 +
10875 +static inline void
10876 +cpu_to_dblock(reiser4_block_nr block /* CPU value */ , reiser4_dblock_nr * dblock /* on-disk result */ )
10877 +{
10878 +       cputod64(block, &dblock->blk);
10879 +}
10880 +
10881 +/* compare two block numbers; returns 1 if *b1 and *b2 are the same, 0 otherwise */
10882 +static inline int
10883 +disk_addr_eq(const reiser4_block_nr * b1 /* first block number to compare */ ,
10884 +            const reiser4_block_nr * b2 /* second block number to compare */ )
10885 +{
10886 +       int differ;
10887 +
10888 +       assert("nikita-1033", b1 != NULL);
10889 +       assert("nikita-1266", b2 != NULL);
10890 +
10891 +       /* bytewise comparison of the two block numbers */
10892 +       differ = memcmp(b1, b2, sizeof *b1);
10893 +       return differ == 0;
10894 +}
10895 +
10896 +/* on-disk structure of master reiser4 super block */
10897 +typedef struct reiser4_master_sb {
10898 +       char magic[16];         /* "ReIsEr4" */
10899 +       d16 disk_plugin_id;     /* id of disk layout plugin */
10900 +       d16 blocksize;          /* NOTE(review): presumably block size of this fs -- confirm units */
10901 +       char uuid[16];          /* unique id */
10902 +       char label[16];         /* filesystem label */
10903 +       d64 diskmap;            /* location of the diskmap. 0 if not present */
10904 +} reiser4_master_sb;
10905 +
10906 +/* __FS_REISER4_DFORMAT_H__ */
10907 +#endif
10908 +
10909 +/* Make Linus happy.
10910 +   Local variables:
10911 +   c-indentation-style: "K&R"
10912 +   mode-name: "LC"
10913 +   c-basic-offset: 8
10914 +   tab-width: 8
10915 +   fill-column: 120
10916 +   End:
10917 +*/
10918 diff -rupN linux-2.6.8-rc3/fs/reiser4/diskmap.c linux-2.6.8-rc3-a/fs/reiser4/diskmap.c
10919 --- linux-2.6.8-rc3/fs/reiser4/diskmap.c        1970-01-01 03:00:00.000000000 +0300
10920 +++ linux-2.6.8-rc3-a/fs/reiser4/diskmap.c      2004-08-05 21:20:53.374599063 +0400
10921 @@ -0,0 +1,76 @@
10922 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
10923 +/* Functions to deal with diskmap storage - read-only storage (currently can only be
10924 +   set via fs-creation process) for use by various plugins */
10925 +
10926 +
10927 +#include "debug.h"
10928 +#include "super.h"
10929 +#include "diskmap.h"
10930 +
10931 +#include <linux/fs.h>
10932 +
10933 +/* Walks the chain of diskmap blocks looking for the table entry whose label and parameter
10934 +   match those passed in "label" and "parameter"; on a match the entry's value is stored in *value.
10935 +   Returns 0 on success, -1 if nothing was found or an error has occurred. */
10936 +reiser4_internal int
10937 +reiser4_get_diskmap_value( u32 label, u32 parameter, u64 *value)
10938 +{
10939 +       struct super_block *sb = reiser4_get_current_sb();
10940 +       int retval = -1;
10941 +
10942 +       assert("green-2006", label != REISER4_FIXMAP_END_LABEL && label != REISER4_FIXMAP_NEXT_LABEL);
10943 +
10944 +       if ( get_super_private(sb)->diskmap_block ) { /* If there is diskmap table, we need to read and parse it */
10945 +               struct buffer_head *diskmap_bh;
10946 +               struct reiser4_diskmap *diskmap;
10947 +               int nr_entries = (sb->s_blocksize - sizeof(diskmap->magic)) / sizeof(diskmap->table[0]); /* entries per block */
10948 +               int i = 0;
10949 +
10950 +               diskmap_bh = sb_bread(sb, get_super_private(sb)->diskmap_block);
10951 +search_table:
10952 +               if ( !diskmap_bh ) {
10953 +                       warning("green-2005", "Cannot read diskmap while doing bitmap checks");
10954 +                       return -1;
10955 +               }
10956 +
10957 +               diskmap = (struct reiser4_diskmap *) diskmap_bh->b_data;
10958 +               if ( strncmp(diskmap->magic, REISER4_FIXMAP_MAGIC, sizeof(REISER4_FIXMAP_MAGIC)-1 ) ) { /* Wrong magic */
10959 +                       brelse(diskmap_bh);
10960 +                       warning("green-2004", "diskmap is specified, but its magic is wrong");
10961 +                       return -1;
10962 +               }
10963 +
10964 +               /* Entries are sorted by (label, parameter), so skip every entry that sorts before the
10965 +                  requested pair. The index is tested first: entries past the block are never read */
10966 +               while ((i < nr_entries) &&
10967 +                      ((d32tocpu(&diskmap->table[i].label) < label) ||
10968 +                       ((d32tocpu(&diskmap->table[i].label) == label) &&
10969 +                        (d32tocpu(&diskmap->table[i].parameter) < parameter))))
10970 +                       i++;
10971 +
10972 +               if ( i >= nr_entries ) {
10973 +                       warning("green-2004", "diskmap block %Ld is not properly terminated", (long long)diskmap_bh->b_blocknr);
10974 +                       brelse(diskmap_bh);
10975 +                       return -1;
10976 +               }
10977 +
10978 +               /* Is this last entry in current table that holds disk block with more data ? */
10979 +               if ( d32tocpu(&diskmap->table[i].label) == REISER4_FIXMAP_NEXT_LABEL ) { /* Need to load next diskmap block */
10980 +                       sector_t next_diskmap_block = d64tocpu(&diskmap->table[i].value);
10981 +                       brelse(diskmap_bh);
10982 +                       diskmap_bh = sb_bread(sb, next_diskmap_block);
10983 +                       i = 0;
10984 +                       goto search_table;
10985 +               }
10986 +
10987 +               /* See if we have found table entry we are looking for */
10988 +               if ( (d32tocpu(&diskmap->table[i].label) == label) &&
10989 +                    (d32tocpu(&diskmap->table[i].parameter) == parameter) ) {
10990 +                       *value = d64tocpu(&diskmap->table[i].value);
10991 +                       retval = 0;
10992 +               }
10993 +               brelse(diskmap_bh);
10994 +       }
10995 +
10996 +       return retval;
10997 +}
10998 diff -rupN linux-2.6.8-rc3/fs/reiser4/diskmap.h linux-2.6.8-rc3-a/fs/reiser4/diskmap.h
10999 --- linux-2.6.8-rc3/fs/reiser4/diskmap.h        1970-01-01 03:00:00.000000000 +0300
11000 +++ linux-2.6.8-rc3-a/fs/reiser4/diskmap.h      2004-08-05 21:20:52.936691428 +0400
11001 @@ -0,0 +1,52 @@
11002 +#if !defined (__REISER4_DISKMAP_H__)
11003 +#define __REISER4_DISKMAP_H__
11004 +
11005 +/*
11006 + * Disk map.
11007 + *
11008 + * Disk map is a special data structure used by reiser4 as an optional
11009 + * "anchor" of other meta-data. That is, disk map (if present) may contain
11010 + * disk addresses of the rest of meta-data for this file system: master
11011 + * super-block, bitmaps, journal header and footer, etc. Disk map is used to
11012 + * avoid dependency on fixed disk addresses, with the following goals:
11013 + *
11014 + *     1. allow users to experiment with tuning their file system layout, and,
11015 + *     more importantly,
11016 + *
11017 + *     2. allow reiser4 to survive bad blocks in critical disk locations.
11018 + *
11019 + * That is, disk map allows to "relocate" meta-data structures if their
11020 + * default disk addresses are not accessible.
11021 + *
11022 + * More generally, disk map can be used as a generic table used to store
11023 + * persistent parameters.
11024 + *
11025 + * Currently disk map is read-only for the kernel. It can only be
11026 + * constructed/modified by user-level utilities.
11027 + *
11028 + */
11029 +
11030 +#include "dformat.h"
11031 +
11032 +#define REISER4_FIXMAP_MAGIC "R4FiXMaPv1.0"
11033 +
11034 +#define REISER4_FIXMAP_END_LABEL (-2)  /* label that marks the end of the table */
11035 +#define REISER4_FIXMAP_NEXT_LABEL (-1) /* label of the row that points to the next diskmap block */
11036 +
11037 +/* This is diskmap table, its entries must be sorted ascending first in label
11038 +   order, then in parameter order.  End of table is marked with label
11039 +   REISER4_FIXMAP_END_LABEL; label REISER4_FIXMAP_NEXT_LABEL means that value
11040 +   in this row contains disk block of next diskmap in diskmaps chain */
11041 +struct reiser4_diskmap {
11042 +       char magic[16];         /* compared against REISER4_FIXMAP_MAGIC by the reader */
11043 +       struct {
11044 +               d32 label;      /* primary sort key, or END/NEXT marker */
11045 +               d32 parameter;  /* secondary sort key */
11046 +               d64 value;      /* value stored for (label, parameter) */
11047 +       } table[0];             /* entries follow the magic within the diskmap block */
11048 +};
11049 +
11050 +int reiser4_get_diskmap_value(u32 label, u32 parameter, u64 *value); /* 0 if found, -1 otherwise */
11051 +
11052 +
11053 +#endif
11054 diff -rupN linux-2.6.8-rc3/fs/reiser4/doc/bk.HOWTO linux-2.6.8-rc3-a/fs/reiser4/doc/bk.HOWTO
11055 --- linux-2.6.8-rc3/fs/reiser4/doc/bk.HOWTO     1970-01-01 03:00:00.000000000 +0300
11056 +++ linux-2.6.8-rc3-a/fs/reiser4/doc/bk.HOWTO   2004-08-05 21:20:53.373599273 +0400
11057 @@ -0,0 +1,192 @@
11058 +
11059 +          MAINTENANCE OF REISER4 BITKEEPER REPOSITORY FOR THE CORE KERNEL.
11060 +
11061 +                                                                  OVERVIEW
11062 +
11063 +Reiser4 receives linux kernel code from http://linux.bkbits.net/linux-2.5.
11064 +
11065 +This repository is pulled into laputa:~god/src/bk-linux-2.5 (BK-LINUX-2.5) by
11066 +nightly cron job (~god/bin/update-src.sh). BK-LINUX-2.5 is only used as a
11067 +local copy of Linus' repository, no changes are made in it.
11068 +
11069 +BK-LINUX-2.5 has child repository laputa:~god/projects/limbo (LIMBO), where it
11070 +is merged with the set of patches to core kernel that are necessary for
11071 +reiser4 to work.
11072 +
11073 +These patches are maintained through Andrew Morton's patch-scripts
11074 +(http://www.zipworld.com.au/~akpm/linux/patches/). Local and slightly modified
11075 +version of patch-scripts is installed at laputa:~god/run/patch-scripts/. See
11076 +laputa:~god/run/patch-scripts/docco.txt for more detailed usage instructions.
11077 +
11078 +patch-scripts are needed, because reiser4 modifications to the core kernel
11079 +should be available as a set of meaningful separately documented patches
11080 +rather than as one single huge patch (that would result from just accumulating
11081 +all changes in the bitkeeper).
11082 +
11083 +Patches themselves are stored in bitkeeper repository
11084 +laputa:~god/projects/core-patches (CORE-PATCHES).
11085 +
11086 +New versions of the core kernel are pulled into LIMBO, merged with
11087 +CORE-PATCHES, conflicts are resolved. This repository is cloned into temporary
11088 +repositories to test resulting kernel. After testing LIMBO is cloned/pulled
11089 +into thebsh:/usr/home/bk/reiser4-linux-2.6 (REISER4-LINUX-2.6). From there
11090 +individual developers clone/pull it to their heart's content.
11091 +
11092 +                                                        UPGRADE INSTRUCTIONS
11093 +
11094 +1. backup LIMBO:
11095 +
11096 +$ cd ~god/projects
11097 +$ mv limbo limbo.orig
11098 +
11099 +2. clone BK-LINUX-2.5 into LIMBO
11100 +
11101 +$ bk clone ~god/src/bk-linux-2.5 limbo
11102 +$ cd limbo
11103 +$ bk -r edit -q
11104 +
11105 +3. roll LIMBO back to the desired kernel version: after a clone LIMBO contains
11106 +some version of Linus' repository, but usually we want a repository
11107 +corresponding to the exact kernel version. Use bk changes -L to find changeset
11108 +number corresponding to the desired kernel version, then do bk undo -a<rev.no>
11109 +to move the repository to that version.
11110 +
11111 +4. graft CORE-PATCHES into LIMBO
11112 +
11113 +$ bk clone ~god/projects/core-patches patches
11114 +$ cd patches
11115 +$ bk -r edit -q
11116 +$ cd ../
11117 +$ . patches/setpc # set patch-script variables
11118 +
11119 +5. check status of core patches:
11120 +
11121 +$ pstatus # patch-scripts utility
11122 +1:a:2.6.6-mm2 Needs changelog
11123 +2:a:all-sources.diff Needs changelog
11124 +    ...
11125 +35:a:disable-vermagic Needs changelog
11126 +36:a:make-4kstack-option Needs changelog
11127 +37:a:radix_tree_lookup_slot Needs changelog
11128 +?:-:2.6.6-rc3-mm2 Needs changelog
11129 +?:-:do_mmap2-fix.diff Needs changelog
11130 +    ...
11131 +
11132 +Patches marked with ":a:" are applied (list of all currently applied patches
11133 +is in patches/applied-patches). Patches marked with ":-:" are not
11134 +applied. Patches with "?" (not numbered) are not included into "series"
11135 +(patches/series file)---these are usually some old or testing patches no
11136 +longer used.
11137 +
11138 +So, above pstatus output shows that there are 37 patches in the current
11139 +series, all applied. This is normal situation. LIMBO and CORE-PATCHES
11140 +repositories should always be left in such state after upgrading.
11141 +
11142 +6. Refresh core-patches.
11143 +
11144 +$ echo > patches/applied-patches # pretend patches are not applied
11145 +
11146 +Now for each patch do
11147 +
11148 +$ pushpatch # this applies next patch in series
11149 +
11150 +If patch could not be applied successfully, "force it":
11151 +
11152 +$ pushpatch -f # this forces patch and generates .rej and .orig files
11153 +
11154 +Go through all generated .rej and .orig, resolve all conflicts, update
11155 +sources. Delete .rej and .orig files.
11156 +
11157 +Independently of whether patch was applied successfully or not, finish its
11158 +processing by refreshing it:
11159 +
11160 +$ refpatch # refresh current patch
11161 +
11162 +Repeat above pushpatch/refpatch sequence until patch-scripts report that
11163 +"Series are fully applied".
11164 +
11165 +7. Commit changes. Simplest way to do this is to do bk citool in LIMBO. But
11166 +this is time-consuming. Alternatively do:
11167 +
11168 +$ bk -r delta -a -q -y"local reiser4 patches applied"
11169 +$ bk commit -q -y"local reiser4 patches applied"
11170 +$ cd patches
11171 +$ bk citool # revise and commit modifications to the local patches
11172 +$ bk push # push modifications to the CORE-PATCHES
11173 +$ cd ..
11174 +
11175 +8. Test resulting sources
11176 +
11177 +$ cd ~god/projects/tmp
11178 +$ bk clone ../limbo
11179 +
11180 +and then follow standard build procedure: clone reiser4 repository into
11181 +tmp/limbo/fs, configure, build, etc.
11182 +
11183 +9. Pull changes to thebsh
11184 +
11185 +$ ssh thebsh
11186 +$ cd /usr/home/bk
11187 +$ mv reiser4-linux-2.6 reiser4-linux-2.6.orig
11188 +$ bk clone god@laputa:/home/god/projects/limbo reiser4-linux-2.6
11189 +
11190 +If everything is ok, remove backup repositories limbo.orig and
11191 +reiser4-linux-2.6.orig.
11192 +
11193 +                                                          ADDING NEW PATCH
11194 +
11195 +There are two versions of adding-new-patch procedure: first for the "in-order"
11196 +patches, second for the "external" patches, like -mm patch.
11197 +
11198 +                                                                In-order patch
11199 +
11200 +1. Prepare repositories:
11201 +
11202 +$ cd ~god/projects/limbo
11203 +$ bk -r edit -q
11204 +$ cd patches
11205 +$ bk -r edit -q
11206 +$ cd ..
11207 +
11208 +2. As was mentioned above, all patches in the series should be already
11209 +applied. Put new patch into "patches":
11210 +
11211 +$ mv /tmp/new-patch.diff patches/patches/new-patch.patch
11212 +
11213 +.patch suffix is mandatory!
11214 +
11215 +$ pcpatch new-patch # this generates patches/pc/new-patch.pc
11216 +$ vi patches/txt/new-patch.txt # add patch description
11217 +$ echo new-patch.patch >> patches/series # add patch to the series file
11218 +$ pushpatch # apply it.
11219 +
11220 +If patch couldn't be applied, "force" it and resolve conflicts, see above.
11221 +
11222 +$ refpatch # and refresh it.
11223 +
11224 +This again leaves repositories in the consistent state (all patches in the
11225 +series are applied).
11226 +
11227 +3. Commit changes, don't forget to add new files to the CORE-PATCHES. See
11228 +above.
11229 +
11230 +                                                               External patch
11231 +
11232 +External patch (such as combined patch for -mm) sometimes has to be added at
11233 +the beginning of series (rather than at the end as small patches are). Such
11234 +patches are best added during upgrading. Specifically, step 6 becomes:
11235 +
11236 +6. Refresh core-patches.
11237 +
11238 +$ echo > patches/applied-patches # pretend patches are not applied
11239 +$ cp /tmp/external.patch patches/patches
11240 +$ pcpatch external
11241 +$ vi patches/txt/external.txt
11242 +$ vi patches/series # add "external.patch" at the appropriate place
11243 +
11244 +Proceed with pushpatch/refpatch as usual.
11245 +
11246 +To remove patch from series (for example, when upgrading to the new -mm
11247 +kernel), just kill appropriate line in the patches/series.
11248 +
11249 +Nikita. 2004.05.25
11250 diff -rupN linux-2.6.8-rc3/fs/reiser4/doc/directory-service linux-2.6.8-rc3-a/fs/reiser4/doc/directory-service
11251 --- linux-2.6.8-rc3/fs/reiser4/doc/directory-service    1970-01-01 03:00:00.000000000 +0300
11252 +++ linux-2.6.8-rc3-a/fs/reiser4/doc/directory-service  2004-08-05 21:20:52.955687421 +0400
11253 @@ -0,0 +1,203 @@
11254 +
11255 +                                         DIRECTORY SERVICE IN REISER4
11256 +
11257 +Directory is mapping from file name to file itself. This mapping is
11258 +implemented through reiser4 internal balanced tree. Single global tree
11259 +is used as global index of all directories as opposed to having tree per
11260 +directory. Unfortunately file names cannot be used as keys until keys of
11261 +variable length are implemented, or unreasonable limitations on maximal
11262 +file name length are imposed. To work around this file name is hashed
11263 +and hash is used as key in a tree. No hash function is perfect and there
11264 +will always be hash collisions, that is, file names having the same value of
11265 +a hash. Previous versions of reiserfs (3.5 and 3.6) used "generation
11266 +counter" to overcome this problem: keys for file names having the same
11267 +hash value were distinguished by having different generation
11268 +counters. This allowed to amortize hash collisions at the cost of
11269 +reducing number of bits used for hashing. This "generation counter"
11270 +technique is actually some ad hoc form of support for non-unique
11271 +keys. Keeping in mind that some form of this have to be implemented
11272 +anyway, it seems justifiable to implement more regular support for
11273 +non-unique keys in reiser4.
11274 +
11275 +NON-UNIQUE KEYS
11276 +
11277 +1.
11278 +
11279 +Non-unique keys require changes in both tree lookup and tree update
11280 +code. In addition some new API to iterate through items with identical
11281 +keys is required.
11282 +
11283 +Before going into detail let's note that non-unique keys weaken the
11284 +traditional search tree invariant. In a search tree with unique keys, keys
11285 +of all items in the left sub-tree of a given delimiting key are less than,
11286 +and in the right sub-tree greater than or equal to, the said key. In a search
11287 +tree with non-unique keys neither inequality is strict.
11288 +
11289 +2.
11290 +
11291 +Tree lookups: we require that node layout ->lookup() methods always
11292 +return leftmost item with the key looked for. The same for item
11293 +->lookup() method for items supporting units with non-unique
11294 +keys. Standard node40 layout plugin handles this, see
11295 +fs/reiser4/plugin/node/node40.c:node40_lookup().
11296 +
11297 +3.
11298 +
11299 +Tree balancing: it seems that only change here is the handling of
11300 +weakened search tree invariant. This can be gathered from the
11301 +observation that balancing never even compares keys, only tests them for
11302 +equality. More thought/research is required though. Looking at the
11303 +existing implementations (like Berkeley db) would be useful also.
11304 +
11305 +4.
11306 +
11307 +Iteration through items/unit with identical keys. There are two
11308 +interfaces to iterating abstraction known as "external" (also known as
11309 +"enumeration") and "internal" iterators.
11310 +
11311 +External iterator:
11312 +
11313 +external_iterator {
11314 +  start();
11315 +  next();
11316 +  has_more_p();
11317 +};
11318 +
11319 +external_iterator eit;
11320 +
11321 +for( eit.start() ; eit.has_more_p() ; ) {
11322 +    object = eit.next();
11323 +    ... do stuff with object ...
11324 +}
11325 +
11326 +Internal iterator:
11327 +
11328 +internal_iterator {
11329 +    iterate( int ( *function )( object *obj ) );
11330 +};
11331 +
11332 +internal_iterator iit;
11333 +
11334 +int do_stuff( object *obj )
11335 +{
11336 +   ... do stuff with obj ...
11337 +}
11338 +
11339 +iit.iterate( &do_stuff );
11340 +
11341 +External iterators seem easier to use, but they are known to be hard to
11342 +implement, especially for complex data-structures like trees (this is
11343 +because of the amount of state that should be maintained in "eit"
11344 +between its invocations).
11345 +
11346 +Internal iterators are harder to use in C, because new function has to
11347 +be declared to perform actions on objects in sequence, but are obviously
11348 +easier to implement.
11349 +
11350 +Given that in 4.0 version there will be only one client of this
11351 +iteration API (viz. directory lookup routine), it seems that internal
11352 +style is preferable for now. Later, external iterator interface can be
11353 +added if necessary.
11354 +
11355 +IMPLEMENTATION OF DIRECTORIES:
11356 +
11357 +1.
11358 +
11359 +There will be many various directory services implemented through
11360 +different plugins. Default directory plugin uses hashing techniques
11361 +described above. Let's code-name it hdir.
11362 +
11363 +2.
11364 +
11365 +Directory consists of directory entries, stored in a tree in a form of
11366 +directory items. Question about whether each directory entry should be
11367 +separate item or they can be compressed into items is left open by now.
11368 +First this decision is purely per-plugin decidable, second, compression
11369 +is good for performance, but harder to implement.
11370 +
11371 +Single directory entry is binding between file-system object and
11372 +directory. In hdir plugin it consists of full name of a file bound and
11373 +key (or part thereof) of file's stat-data:
11374 +
11375 +typedef struct hdir_entry {
11376 +    /**
11377 +     * key of object stat-data. It's not necessary to store
11378 +     * whole key here, because it's always key of stat-data, so minor packing
11379 +     * locality and offset can be omitted here. But this relies on
11380 +     * particular key allocation scheme for stat-data, so, for extensibility
11381 +     * sake, whole key can be stored here.
11382 +     *
11383 +     * We store key as array of bytes, because we don't want 8-byte alignment
11384 +     * of dir entries.
11385 +     */
11386 +    d8 sdkey[ sizeof( reiser4_key ) ];
11387 +    /**
11388 +     * file name. Null terminated string.
11389 +     */
11390 +    d8 name[ 0 ];
11391 +} hdir_entry;
11392 +
11393 +3.
11394 +
11395 +On creation/linking/lookup of object "bar" in directory "foo" (foo/bar),
11396 +we compose key of directory entry for this object. Key has the form
11397 +
11398 +/*
11399 + * XXX this should be discussed
11400 + */
11401 +dirent_k = (locality=foo_object_id, objectid=???, offset=hash("bar"));
11402 +
11403 +Major packing locality of dirent_k is set to foo_object_id so that all
11404 +objects (files) in this directory and their bodies are close to
11405 +respective directory entries.
11406 +
11407 +It seems that no single key allocation policy for directory entries fits
11408 +everyone's needs, so, this can be implemented as method of directory
11409 +plugin. Nonetheless, the choice of default key allocation policy is still
11410 +an important decision, although not as important as in a plugin-less
11411 +file-system.
11412 +
11413 +4.
11414 +
11415 +Function
11416 +
11417 +int hdir_find_entry( inode *dir, const hdir_entry *entry,
11418 +                     tween_coord *coord, lock_handle *lh );
11419 +
11420 +iterates through all directory entries in @dir that have the same key as
11421 +@entry (scans hash-bucket), looking for exact match for entry->name.
11422 +
11423 +5.
11424 +
11425 +During ->create()/->link() hdir_find_entry() is used to find place to insert new
11426 +item (and to check for -EEXIST).
11427 +
11428 +During ->lookup() hdir_find_entry() is used to find entry for the file
11429 +being looked for and to load stat-data afterwards.
11430 +
11431 +During ->unlink() hdir_find_entry() is used to find unit/item to be
11432 +removed.
11433 +
11434 +NOTE ON ->lookup():
11435 +
11436 +VFS implements following protocol when creating new
11437 +file (fs/namei.c:open_namei()):
11438 +
11439 +dentry hash is searched. If search is unsuccessful, file system
11440 +->lookup() is called.
11441 +If lookup didn't find name, call ->create()
11442 +
11443 +While this protocol spares file system from dealing with dcache locking,
11444 +for reiserfs it means that tree traversal is performed twice during file
11445 +creation/deletion. Possible solution is to cache results of ->lookup()
11446 +(e.g., pointer to znode) in dentry and reuse them in ->create(). On the
11447 +other hand, point cache has more or less the same effect and is more
11448 +general.
11449 +
11450 +
11451 +^ Local variables:
11452 +^ mode-name: "Design Document"
11453 +^ indent-tabs-mode: nil
11454 +^ tab-width: 4
11455 +^ eval: (progn (flyspell-mode) (flyspell-buffer))
11456 +^ End:
11457 diff -rupN linux-2.6.8-rc3/fs/reiser4/doc/lock-ordering linux-2.6.8-rc3-a/fs/reiser4/doc/lock-ordering
11458 --- linux-2.6.8-rc3/fs/reiser4/doc/lock-ordering        1970-01-01 03:00:00.000000000 +0300
11459 +++ linux-2.6.8-rc3-a/fs/reiser4/doc/lock-ordering      2004-08-05 21:20:53.485575655 +0400
11460 @@ -0,0 +1,601 @@
11461 +---------------------------------INTRODUCTION-----------------------------------
11462 +
11463 +This document tries to provide concise description of various "locking" issues
11464 +in reiser4 code. There are two major areas here:
11465 +
11466 +1. locking as a device for the concurrency control: various synchronization
11467 +objects are used to maintain integrity of shared data structures.
11468 +
11469 +2. (induced by the former) deadlocks, livelocks, missed wake ups, and the like.
11470 +
11471 +"Locks" above means both standard synchronization primitives like mutexes,
11472 +semaphores, condition variables and so on, and any other kind of object on
11473 +which thread execution may "block". Waiting on io completion is not considered
11474 +here, because, barring hardware errors, it will ultimately finish regardless of
11475 +any other threads and locks in the system (This only holds if io completion
11476 +handlers don't acquire locks themselves.).
11477 +
11478 +-------------------------------LOCKS IN REISER4---------------------------------
11479 +
11480 +Reiser4 introduces following locks:
11481 +
11482 +1.  Per-super-block tree spin lock                              (tree_lock*)
11483 +
11484 +2.  Per-super-block delimiting key spin lock                    (dk_lock*)
11485 +
11486 +3.  Per-jnode spin lock                                         (jnode_lock*)
11487 +
11488 +4.  Per-znode lock with deadlock detection                      (longterm_lock)
11489 +
11490 +5.  Per-reiser4-inode spin lock                                 (inode_guard*)
11491 +
11492 +6.  Per-atom spin lock                                          (atom_lock*)
11493 +
11494 +7.  Per-transaction-handle spin lock                            (txnh_lock*)
11495 +
11496 +8.  Per-transaction-manager spin lock                           (txnmgr_lock*)
11497 +
11498 +9.  Per-lock-stack spin-lock                                    (stack_lock*)
11499 +
11500 +10. Per-inode read-write lock                                   (inode_rw_lock)
11501 +
11502 +11. Per-super-block spin lock                                   (super_guard*+)
11503 +
11504 +12. Per-flushing-thread spin lock                               (ktxnmgrd_lock)
11505 +
11506 +13. Global lnode hash table lock                                (lnode_guard+)
11507 +
11508 +14. Per-super-block cbk cache spin lock                         (cbk_guard)
11509 +
11510 +15. Per-jnode spin lock used by debugging code to access and
11511 +    modify check sum                                            (cksum_guard+)
11512 +
11513 +16. Per-super-block oid map spin lock                           (oid_guard+)
11514 +
11515 +17. Per-super-block spin lock used by "test" disk format plugin to serialize
11516 +    block allocation                                            (test_lock+)
11517 +
11518 +18. Per-condition-variable spin lock                            (kcond_lock+)
11519 +
11520 +19. Single spin lock used to serialize fake block allocation    (fake_lock+)
11521 +
11522 +20. Single spin lock used to serialize calls to reiser4_panic   (panic_guard+)
11523 +
11524 +21. Single spin lock used by debugging code to keep track of all active
11525 +    reiser4_context instances                                   (contexts_lock+)
11526 +
11527 +22. Per-lnode condition variable used to wait for completion of "incompatible
11528 +    access mode"                                                (lnode_kcond)
11529 +
11530 +23. Per-flushing-thread condition variable for startup waiting  (ktxnmgrd_start)
11531 +
11532 +24. Per-flushing-thread condition variable                      (ktxnmgrd_wait)
11533 +
11534 +25. Per-lock-stack wakeup semaphore                             (stack_sema)
11535 +
11536 +26. Per-super-block flush serializing semaphore                 (flush_sema)
11537 +
11538 +27. Per-transaction-manager commit semaphore                    (commit_sema)
11539 +
11540 +28. Per-super-block semaphore used to arbitrate use of 5%       (delete_sema)
11541 +    reserved disk space
11542 +
11543 +30. Global spin lock used to serialize calls to panic           (panic_guard+)
11544 +
11545 +31. Global spin lock used to protect plugin set hash table      (pset_guard+)
11546 +
11547 +32. Global spin lock used to protect phash hash table           (phash_guard+)
11548 +
11549 +33. Per-bitmap-block semaphore used to serialize bitmap loading (bnode_sema+)
11550 +
11551 +34. Per-super-block epoch lock, protecting updates to           (epoch_lock*)
11552 +    znode_epoch field, used to implement seals (seal.[ch])
11553 +    efficiently.
11554 +
11555 +35. Per-atom "event". This is not really lock. Rather, this is an event
11556 +    signaled each time atom changes its state.                  (atom_event)
11557 +
11558 +36. Per-znode spin lock used to protect long term locking
11559 +    structures                                                  (zlock*)
11560 +
11561 +37. Per flush queue lock                                        (fq_lock*)
11562 +
11563 +38. Per-super-block zgen lock, protecting znode generation      (zgen*)
11564 +    counter
11565 +
11566 +39. Per-jnode spin lock used to synchronize jload() with        (jload_lock*)
11567 +    ->releasepage().
11568 +
11569 +40. Per-atom imaginary read-write semaphore handle_sema         (handle_sema)
11570 +
11571 +    let's pretend for the sake of simplicity that there is special per-atom
11572 +    read-write semaphore that threads can claim. Call it
11573 +    handle_sema. This semaphore is acquired on read when thread captures first
11574 +    block and is released when thread's reiser4_context is closed. Formally
11575 +    thread holds this semaphore on read exactly when
11576 +    get_current_context()->trans->atom != NULL, i.e., when thread is
11577 +    associated with atom. Logic behind introducing this imaginary semaphore is
11578 +    that while some thread is associated with an atom (that is, keeps
11579 +    transaction handle opened), this atom cannot commit. In particular, other
11580 +    threads waiting on fusion with atom that is in CAPTURE_WAIT stage wait
11581 +    until this atom commits, that is wait (at least) until there are no opened
11582 +    transaction handles for this atom. Effectively such threads wait until
11583 +    handle_semaphore is free, that is, they in some sense are trying to
11584 +    acquire handle_semaphore in write mode.  So, this circumferential
11585 +    description allows one to reduce (at least partially) problem of waiting
11586 +    on atom fusion to the lock ordering.
11587 +
11588 +41. Per-super-block spin lock protecting consistency of emergency flush hash
11589 +    table, ->eflushed, and ->eflushed_anon counters in inode, and ->flushed
11590 +    counter in atom.                                            (eflush_guard)
11591 +
11592 +42. Per-super-block spin lock protecting detached directory cursors for
11593 +    stateless readdir                                           (d_lock)
11594 +
11595 +99. Various locks used by the user level simulator
11596 +
11597 +Locks marked by (*) after label, are accessed through spin lock macros,
11598 +defined in reiser4.h. For them, locking ordering is checked at the runtime (at
11599 +least in the principle) when REISER4_DEBUG is on(e).
11600 +
11601 +Locks marked by (+) after label exist only for serializing concurrent access
11602 +to the shared data and are not supposed to be used in conjunction with any
11603 +other locks. They are omitted from locking ordering below to simplify the
11604 +picture. One can imagine them to be rightmost in the ordering.
11605 +
11606 +All locks, spin locks, and semaphores, except for stack_sema are subject to
11607 +normal protocol: thread that grabbed the lock will release it. stack_sema is
11608 +described in more detail below.
11609 +
11610 +Also, following kernel locks are used by our code:
11611 +
11612 +1. Per-page lock                                                (page_lock)
11613 +
11614 +2. Per-page writeback bit                                       (page_write)
11615 +
11616 +3. Per-inode semaphore                                          (i_sem)
11617 +
11618 +4. Per-inode I_LOCK bit-lock                                    (I_LOCK)
11619 +
11620 +Thread also can block on the following "objects" that are not really locks:
11621 +
11622 +1. Page fault                                                   (pfault)
11623 +
11624 +2. Memory allocation                                            (kalloc)
11625 +
11626 +3. Dirtying a page (through balance_dirty_pages())              (page_dirty)
11627 +
11628 +----------------------------------LOCK SCOPE------------------------------------
11629 +
11630 +Section describing what data are protected by what locks. TBD.
11631 +
11632 +----------------------------------INVARIANTS------------------------------------
11633 +
11634 +Invariants are some (formal or informal) properties of data structures. For
11635 +example, for well-formed doubly linked list, following holds:
11636 +
11637 +item->next->prev == item && item->prev->next == item
11638 +
11639 +In most cases, invariants only hold under proper locks.
11640 +
11641 +LABEL AND DESCRIPTION                                 LOCKS
11642 +
11643 +[inode->eflushed]                                     inode_guard
11644 +
11645 +    inode->eflushed > 0, iff there are emergency flushed jnodes belonging to
11646 +    this inode. Also, each emergency flushed jnode is counted as increase in
11647 +    inode->i_count.
11648 +
11649 +[cbk-cache-invariant]                                 cbk_guard
11650 +
11651 +    If cbk cache is traversed in LRU order, first go all used slots (with
11652 +    slot->node != NULL), then, all unused. All used slots have unique
11653 +    slot->node. (Checked by cbk_cache_invariant().)
11654 +
11655 +[znode-fake]                                          jnode_lock, tree_lock
11656 +
11657 +    /* fake znode doesn't have a parent, and */
11658 +    znode_get_level(node) == 0 => znode_parent(node) == NULL, and
11659 +    /* there is another way to express this very check, and */
11660 +    znode_above_root(node)     => znode_parent(node) == NULL, and
11661 +    /* it has special block number, and */
11662 +    znode_get_level(node) == 0 => *znode_get_block(node) == FAKE_TREE_ADDR, and
11663 +    /* it is the only znode with such block number, and */
11664 +    !znode_above_root(node) && znode_is_loaded(node) =>
11665 +                                  *znode_get_block(node) != FAKE_TREE_ADDR
11666 +    /* it is parent of the tree root node */
11667 +    znode_is_true_root(node)   => znode_above_root(znode_parent(node))
11668 +
11669 +    (Checked by znode_invariant_f().)
11670 +
11671 +[znode-level]                                         jnode_lock, tree_lock
11672 +
11673 +    /* level of parent znode is one larger than that of child, except for the
11674 +       fake znode */
11675 +    znode_parent(node) != NULL && !znode_above_root(znode_parent(node)) =>
11676 +                znode_get_level(znode_parent(node)) == znode_get_level(node) + 1
11677 +    /* left neighbor is at the same level, and */
11678 +    znode_is_left_connected(node) && node->left != NULL =>
11679 +                znode_get_level(node) == znode_get_level(node->left)
11680 +    /* right neighbor is at the same level */
11681 +    znode_is_right_connected(node) && node->right != NULL =>
11682 +                znode_get_level(node) == znode_get_level(node->right)
11683 +
11684 +    (Checked by znode_invariant_f().)
11685 +
11686 +[znode-connected]
11687 +
11688 +     /* ->left, ->right pointers form a valid list and are consistent with
11689 +     JNODE_{LEFT,RIGHT}_CONNECTED bits */
11690 +
11691 +     node->left != NULL => znode_is_left_connected(node)
11692 +     node->right != NULL => znode_is_right_connected(node)
11693 +     node->left != NULL =>
11694 +                     znode_is_right_connected(node->left) &&
11695 +                     node->left->right == node
11696 +     node->right != NULL =>
11697 +                     znode_is_left_connected(node->right) &&
11698 +                     node->right->left == node
11699 +
11700 +[znode-c_count]                                       jnode_lock, tree_lock
11701 +
11702 +    /* for any znode, c_count of its parent is greater than 0, and */
11703 +    znode_parent(node) != NULL && !znode_above_root(znode_parent(node)) =>
11704 +                atomic_read(&znode_parent(node)->c_count) > 0, and
11705 +    /* leaves don't have children */
11706 +    znode_get_level(node) == LEAF_LEVEL => atomic_read(&node->c_count) == 0
11707 +
11708 +    (Checked by znode_invariant_f().)
11709 +
11710 +[znode-modify]                                        zlock_lock(read),
11711 +                                                      jnode_lock, tree_lock
11712 +
11713 +    /* if znode is not write-locked, its checksum remains
11714 +     * invariant */
11715 +       !znode_is_wlocked(node) => znode_at_read(node)
11716 +
11717 +    (Checked by znode_invariant_f().)
11718 +
11719 +[znode-refs]                                          jnode_lock, tree_lock
11720 +
11721 +    /* only referenced znode can be long-term locked */
11722 +    znode_is_locked(node) => atomic_read(&ZJNODE(node)->x_count) != 0
11723 +
11724 +    (Checked by znode_invariant_f().)
11725 +
11726 +[jnode-oid]                                           jnode_lock, tree_lock
11727 +
11728 +    /* for unformatted node ->objectid and ->mapping fields are
11729 +     * consistent */
11730 +    jnode_is_unformatted(node) && node->key.j.mapping != NULL =>
11731 +        node->key.j.objectid == get_inode_oid(node->key.j.mapping->host)
11732 +
11733 +    (Checked by znode_invariant_f().)
11734 +
11735 +[jnode-refs]                                          jnode_lock, tree_lock
11736 +
11737 +    /* only referenced jnode can be loaded */
11738 +    atomic_read(&node->x_count) >= node->d_count
11739 +
11740 +    (Checked by jnode_invariant_f().)
11741 +
11742 +[jnode-dirty]                                         jnode_lock, tree_lock
11743 +
11744 +    /* dirty jnode is part of atom */
11745 +    jnode_is_dirty(node) => node->atom != NULL
11746 +
11747 +    (Checked by jnode_invariant_f().)
11748 +
11749 +[jnode-queued]                                         jnode_lock, tree_lock
11750 +
11751 +    /* only relocated node can be queued, except that when znode
11752 +     * is being deleted, its JNODE_RELOC bit is cleared */
11753 +    JF_ISSET(node, JNODE_FLUSH_QUEUED) =>
11754 +                     JF_ISSET(node, JNODE_RELOC) || JF_ISSET(node, JNODE_HEARD_BANSHEE)
11755 +
11756 +    (Checked by jnode_invariant_f().)
11757 +
11758 +[jnode-atom-valid]                                     jnode_lock, tree_lock
11759 +
11760 +    /* node atom has valid state */
11761 +    node->atom != NULL => node->atom->stage != ASTAGE_INVALID
11762 +
11763 +    (Checked by jnode_invariant_f().)
11764 +
11765 +[jnode-page-binding]                                    jnode_lock, tree_lock
11766 +
11767 +    /* if node points to page, it points back to node */
11768 +    node->pg != NULL => node->pg->private == node
11769 +
11770 +    (Checked by jnode_invariant_f().)
11771 +
11772 +[sb-block-counts]                                     super_guard
11773 +
11774 +       reiser4_block_count(super) = reiser4_grabbed_blocks(super) +
11775 +                                 reiser4_free_blocks(super) +
11776 +                                 reiser4_data_blocks(super) +
11777 +                                 reiser4_fake_allocated(super) +
11778 +                                 reiser4_fake_allocated_unformatted(super) +
11779 +                                 reiser4_flush_reserved(super)
11780 +
11781 +    (Checked by check_block_counters().)
11782 +
11783 +[sb-grabbed]                                          super_guard
11784 +
11785 +    reiser4_grabbed_blocks(super) equals the sum of ctx->grabbed_blocks for
11786 +    all grabbed contexts
11787 +
11788 +[sb-fake-allocated]                                   txnmgr_lock, atom_lock
11789 +
11790 +    When all atoms and transaction manager are locked,
11791 +    reiser4_flush_reserved(super) equals to sum of atom->flush_reserved for
11792 +    all atoms.
11793 +
11794 +[tap-sane]
11795 +
11796 +    tap->mode is one of {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
11797 +       tap->coord != NULL, and
11798 +       tap->lh != NULL, and
11799 +       tap->loaded > 0 => znode_is_loaded(tap->coord->node), and
11800 +       tap->coord->node == tap->lh->node
11801 +
11802 +    (Checked by tap_invariant().)
11803 +
11804 +--------------------------------LOCK ORDERING-----------------------------------
11805 +
11806 +Lock ordering for kernel locks is taken from mm/filemap.c. Locks can be taken
11807 +from the left to the right. Locks on the same indentation level are unordered
11808 +with respect to each other. Any spin lock is to the right of any long term
11809 +lock, obviously.
11810 +
11811 +i_sem
11812 +..inode_rw_lock <-------DEAD1-----+
11813 +....handle_sema                   |
11814 +......I_LOCK                      |
11815 +......delete_sema                 |
11816 +......flush_sema                  |
11817 +........atom_event                |
11818 +........longterm_lock <---DEAD2-+ |
11819 +......commit_sema               | |
11820 +..........page_lock             | |
11821 +............pfault              | |
11822 +..............mm->mmap_sem------+-+                   [do_page_fault]
11823 +..................ktxnmgrd_lock
11824 +................mapping->i_shared_sem
11825 +................kalloc
11826 +....................inode_guard
11827 +......................d_lock
11828 +....................txnmgr_lock
11829 +......................atom_lock
11830 +..........................super_guard
11831 +........................jnode_lock            [->vm_writeback()->jget()]
11832 +................................eflush_guard
11833 +..........................txnh_lock
11834 +............................zlock
11835 +........................fq_lock
11836 +..............................stack_lock
11837 +..................dk_lock
11838 +..............................tree_lock
11839 +................................cbk_guard
11840 +................................epoch_lock
11841 +................................zgen_lock
11842 +..........................jload_lock
11843 +....................mm->page_table_lock
11844 +......................mapping->private_lock
11845 +........................swaplock
11846 +..........................swap_device_lock
11847 +..........................&inode_lock
11848 +............................&sb_lock
11849 +............................mapping->page_lock
11850 +..............................zone->lru_lock
11851 +                  ^
11852 +                  +-- spin locks are starting here. Don't schedule rightward.
11853 +
11854 +NOT FINISHED.
11855 +
11856 +..............&cache_chain_sem
11857 +......................cachep->spinlock
11858 +......................zone->lock
11859 +
11860 +page_dirty
11861 +....&inode_lock
11862 +....&sb_lock
11863 +....mapping->page_lock [mpage_writepages]
11864 +..page_lock
11865 +..longterm_lock        [__set_page_dirty_buffers->__mark_inode_dirty]
11866 +
11867 +Nice and clear picture with all reiser4 locks totally ordered, right?
11868 +
11869 +Unfortunately, it is not always possible to adhere to this ordering. When it
11870 +is necessary to take locks in "decreasing" order, the standard
11871 +trylock-and-repeat loop is employed. See:
11872 +
11873 +   atom_get_locked_with_txnh_locked(),
11874 +   atom_get_locked_by_jnode(),
11875 +   atom_free(), and
11876 +   jnode_lock_page()
11877 +
11878 +functions for examples of this.
11879 +
11880 +The only exception to the above locking order is when a thread wants to lock
11881 +an object it has just created and hasn't yet announced to other threads (by
11882 +means of placing it into some shared data structure like a hash table or
11883 +list). There is a special spin lock macro spin_lock_foo_no_ord() defined in
11884 +reiser4.h for this purpose.
11885 +
11886 +pfault and kalloc are something special: when a page fault occurs on a page
11887 +mmapped from a reiser4 file, reiser4_readpage() is invoked, which starts
11888 +taking locks from the very beginning.
11889 +
11890 +DEAD1
11891 +
11892 +   Scenario:
11893 +
11894 +      process has mmapped reiser4 regular file and then does write(2) into
11895 +      this file from buffer that is in mmaped area. copy_from_user() causes
11896 +      page fault:
11897 +
11898 +         sys_write()
11899 +           reiser4_write()
11900 +             unix_file_write() [inode_rw_lock]
11901 +                         .
11902 +                         .
11903 +                         .
11904 +                 __copy_from_user()
11905 +                         .
11906 +                         .
11907 +                         .
11908 +                   handle_page_fault()
11909 +                     handle_mm_fault()
11910 +                       handle_pte_fault()
11911 +                         do_no_page()
11912 +                           unix_file_filemap_nopage() [inode_rw_lock]
11913 +
11914 +   This is safe, because inode_rw_lock is read-taken by both read/write and
11915 +   unix_file_filemap_nopage(). It is only write-taken during tail<->extent
11916 +   conversion, and if the file is mmapped it was already converted to extents.
11917 +
11918 +DEAD2
11919 +
11920 +   is safe, because copy_from_user is used only for tails and extents:
11921 +
11922 +    . extent: extent_write_flow() releases longterm_lock before calling
11923 +      copy_from_user.
11924 +
11925 +    . tail: during copying into tail, only node containing this tail is long
11926 +      term locked. It is easy to see, that ->readpage serving page fault (that
11927 +      is, readpage for unformatted data) will never attempt to lock said node.
11928 +
11929 +When memory allocation tries to free some memory it
11930 +
11931 +1. asynchronously launches kswapd that will ultimately call
11932 +   reiser4_writepage().
11933 +
11934 +2. calls reiser4_writepage() synchronously.
11935 +
11936 +----------------------------------LOCK PATTERNS---------------------------------
11937 +
11938 +This section describes which lock sequences are held where in the code. This
11939 +places restrictions on modifications to the lock ordering above and enumerates
11940 +pieces of the code that should be revised if modification of the lock ordering
11941 +is necessary.
11942 +
11943 +flush_sema
11944 +
11945 +    jnode_flush()
11946 +
11947 +        to serialize flushing. This behavior can be disabled with mtflush
11948 +        mount option.
11949 +
11950 +atom_lock->jnode_lock
11951 +
11952 +    uncapture_block()
11953 +
11954 +atom_lock->tree_lock && jnode_lock && page_lock
11955 +
11956 +    uncapture_block() calls jput()
11957 +
11958 +delete_sema
11959 +
11960 +    common_unlink(), shorten_file()->unlink_check_and_grab()
11961 +
11962 +        to serialize access to reserved 5% of disk only used by unlinks. (This
11963 +        is necessary so that it is always possible to unlink something and
11964 +        free more space on file-system.)
11965 +
11966 +delete_sema->flush_sema || commit_sema
11967 +
11968 +    reiser4_release_reserved() calls txnmgr_force_commit_current_atom() under
11969 +    delete_sema
11970 +
11971 +inode_rw_lock->delete_sema
11972 +
11973 +    unix_file_truncate()->shorten_file() takes delete_sema from under write
11974 +    mode of inode_rw_lock
11975 +
11976 +kalloc->jnode_lock
11977 +
11978 +    emergency_flush() takes jnode spin lock
11979 +
11980 +jnode_lock->(mapping->page_lock)
11981 +
11982 +    jnode_set_dirty()->__set_page_dirty_nobuffers()
11983 +
11984 +jnode_lock->(zone->lru_lock)
11985 +
11986 +    jnode_set_dirty()->mark_page_accessed()
11987 +
11988 +
11989 +I_LOCK->longterm_lock
11990 +
11991 +    reiser4_iget()
11992 +
11993 +tree_lock->epoch_lock
11994 +
11995 +    zget() calls znode_build_version()
11996 +
11997 +jnode_lock->stack_lock
11998 +
11999 +    longterm_lock_znode(), longterm_unlock_znode(), wake_up_all_lopri_owners()
12000 +
12001 +tree_lock->cbk_guard
12002 +
12003 +    znode_remove() calls cbk_cache_invalidate()
12004 +
12005 +zlock->stack_lock
12006 +
12007 +    wake_up_all_lopri_owners()
12008 +
12009 +atom->stack_lock
12010 +
12011 +    check_not_fused_lock_owners()
12012 +
12013 +txnh->stack_lock
12014 +
12015 +    check_not_fused_lock_owners()
12016 +
12017 +jnode_lock->jload_lock
12018 +
12019 +    reiser4_releasepage(), emergency_flush(). But this can actually be made
12020 +    other way around.
12021 +
12022 +jnode_lock->eflush_guard
12023 +
12024 +    eflush_add(), eflush_del()
12025 +
12026 +atom_lock->super_guard
12027 +
12028 +    grabbed2flush_reserved_nolock()
12029 +
12030 +inode_guard->d_lock
12031 +
12032 +    detach_fsdata()
12033 +
12034 +----------------------------------DEADLOCKS-------------------------------------
12035 +
12036 +Big section describing found/possible/already-worked-around deadlocks.
12037 +
12038 +1. Locking during tree traversal.
12039 +
12040 +2. Locking during balancing.
12041 +
12042 +3. Locking during squalloc.
12043 +
12044 +4. Page locking.
12045 +
12046 +5. Atom fusion.
12047 +
12048 +Please, fill gaps up.
12049 +
12050 +TBD.
12051 +
12052 +2002.09.19. Nikita.
12053 +
12054 +--------------------------------------------------------------------------------
12055 +
12056 +^ Local variables:
12057 +^ mode-name: "Memo"
12058 +^ indent-tabs-mode: nil
12059 +^ tab-width: 4
12060 +^ eval: (progn (flyspell-mode) (flyspell-buffer))
12061 +^ End:
12062 diff -rupN linux-2.6.8-rc3/fs/reiser4/doc/lock-ordering.dot linux-2.6.8-rc3-a/fs/reiser4/doc/lock-ordering.dot
12063 --- linux-2.6.8-rc3/fs/reiser4/doc/lock-ordering.dot    1970-01-01 03:00:00.000000000 +0300
12064 +++ linux-2.6.8-rc3-a/fs/reiser4/doc/lock-ordering.dot  2004-08-05 21:20:53.277619518 +0400
12065 @@ -0,0 +1,276 @@
12066 +/* This is a dot(1) input file for the lock-ordering diagram. */
12067 +/* It should be passed through the C preprocessor first, for example: */
12068 +/* cpp -P -DFITPAGE lock-ordering.dot | tred | dot -Tps | gv -media a4 - */
12069 +
12070 +#define CATTR fontsize=14, fontname=Helvetica /* common font attributes */
12071 +#define NATTR CATTR /* attributes applied to every node declared by the macros below */
12072 +#define EATTR CATTR /* attributes applied to every edge drawn by ARC() */
12073 +
12074 +#define SYSATTR color=yellow, style=filled /* fill for system locks (SYSLONG, SYSSPIN) */
12075 +#define PSEUDOATTR color=pink, style=filled, peripheries=2 /* fill for pseudo locks (SYSPSEUDO) */
12076 +
12077 +#define LONGATTR shape=ellipse /* node shape for long term locks */
12078 +#define SPINATTR shape=box /* node shape for spin locks */
12079 +
12080 +#define CONDATTR color=blue, peripheries=2, LONGATTR /* used by RCOND(): LONGATTR shape plus blue double outline */
12081 +
12082 +#define MARKLONG(name) name -> schedulable [style=invis, weight=0] /* invisible edge from name to the "schedulable" node */
12083 +
12084 +#define SYSLONG(name, l) name [label=l, NATTR, LONGATTR, SYSATTR]; MARKLONG(name) /* system long term lock labelled l */
12085 +#define SYSPSEUDO(name) name [NATTR, LONGATTR, PSEUDOATTR]; MARKLONG(name) /* pseudo lock node */
12086 +#define RLONG(name) name [NATTR, LONGATTR]; MARKLONG(name) /* reiser4 long term lock */
12087 +
12088 +#define RCOND(name, l) name [label=l, NATTR, CONDATTR]; MARKLONG(name) /* node labelled l styled with CONDATTR */
12089 +
12090 +#define MARKSPIN(name) schedulable -> name [style=invis, weight=0] /* invisible edge from the "schedulable" node to name */
12091 +
12092 +#define SYSSPIN(name, l) name [label=l, NATTR, SYSATTR, SPINATTR]; MARKSPIN(name) /* system spin lock labelled l */
12093 +#define RSPIN(name) name [NATTR, SPINATTR]; MARKSPIN(name) /* reiser4 spin lock */
12094 +
12095 +#define ARC(from, to, func, ...) from -> to [EATTR, label=func, ## __VA_ARGS__] /* edge from -> to labelled func; extra edge attributes may follow */
12096 +
12097 +digraph locks {
12098 +
12099 +//clusterrank=none
12100 +#if defined(FITPAGE)
12101 +size="7.5, 10.5";
12102 +ratio=compress;
12103 +center=true;
12104 +#endif
12105 +
12106 +subgraph long {
12107 +       /* reiser4 long term locks */
12108 +       RLONG(longterm_lock);
12109 +       RLONG(inode_rw_lock);
12110 +       RLONG(stack_sema);
12111 +       RLONG(flush_sema);
12112 +       RLONG(commit_sema);
12113 +       RLONG(delete_sema);
12114 +    /* txncommit is a synonym for flush_sema and commit_sema */
12115 +       txncommit [LONGATTR, PSEUDOATTR]; MARKLONG(txncommit);
12116 +       txncommit -> flush_sema [style=dotted, dir=both];
12117 +       txncommit -> commit_sema [style=dotted, dir=both];
12118 +
12119 +    /* atom_event is not really a lock: you can wait on it, but cannot "own"
12120 +       it. */
12121 +       RCOND(atom_event,atom_event);
12122 +
12123 +       //RLONG(lnode_kcond);
12124 +       //RLONG(ktxnmgrd_start);
12125 +       //RLONG(ktxnmgrd_wait);
12126 +       //RLONG(bnode_sema);
12127 +
12128 +       /* pseudo locks */
12129 +       SYSPSEUDO(pfault);
12130 +       SYSPSEUDO(kalloc);
12131 +       SYSPSEUDO(schedulable);
12132 +
12133 +       /* system long term locks */
12134 +       SYSLONG(page_write, page_write);
12135 +       SYSLONG(mm_mmap_sem, "mm->mmap_sem");
12136 +       SYSLONG(mapping_i_shared_sem, "mapping->i_shared_sem");
12137 +
12138 +       SYSLONG(i_sem, i_sem);
12139 +       SYSLONG(page_lock, page_lock);
12140 +       SYSLONG(cache_chain_sem, "&cache_chain_sem");
12141 +       SYSLONG(I_LOCK, "I_LOCK");
12142 +
12143 +       SYSLONG(namespace_sem, "namespace->sem");
12144 +       // SYSLONG(bdev_bd_sem, "bdev->bd_sem");
12145 +       SYSLONG(sb_s_lock, "sb->s_lock");
12146 +       SYSLONG(sb_s_umount, "sb->s_umount");
12147 +}
12148 +
12149 +subgraph spin {
12150 +
12151 +       /* reiser4 spin locks */
12152 +
12153 +       RSPIN(tree_lock);
12154 +       RSPIN(dk_lock);
12155 +       RSPIN(jnode_lock);
12156 +       RSPIN(inode_guard);
12157 +       RSPIN(atom_lock);
12158 +       RSPIN(txnh_lock);
12159 +       RSPIN(txnmgr_lock);
12160 +       RSPIN(ktxnmgrd_lock);
12161 +       RSPIN(cbk_guard);
12162 +       RSPIN(epoch_lock);
12163 +       RSPIN(zgen_lock);
12164 +       RSPIN(stack_lock);
12165 +       RSPIN(zlock);
12166 +       RSPIN(fq_lock);
12167 +       RSPIN(jload_lock);
12168 +       RSPIN(super_guard);
12169 +    RSPIN(eflush_guard);
12170 +    RSPIN(d_lock);
12171 +
12172 +       //RSPIN(stack_lock);
12173 +       //RSPIN(lnode_guard);
12174 +       //RSPIN(cksum_guard);
12175 +       //RSPIN(oid_guard);
12176 +       //RSPIN(test_lock);
12177 +       //RSPIN(kcond_lock);
12178 +       //RSPIN(fake_lock);
12179 +       //RSPIN(panic_guard);
12180 +       //RSPIN(contexts_lock);
12181 +       //RSPIN(pset_guard);
12182 +       //RSPIN(phash_guard);
12183 +
12184 +       /* system spin locks */
12185 +       SYSSPIN(bkl, "BKL");
12186 +       SYSSPIN(cachep_spinlock, "cachep->spinlock");
12187 +       SYSSPIN(zone_lock, "zone->lock");
12188 +       SYSSPIN(swaplock, "&swaplock");
12189 +       SYSSPIN(zone_lru_lock, "zone->lru_lock");
12190 +       SYSSPIN(mapping_private_lock, "mapping->private_lock");
12191 +       SYSSPIN(mapping_page_lock, "mapping->page_lock");
12192 +       SYSSPIN(inode_lock, "&inode_lock");
12193 +       SYSSPIN(swap_device_lock, "swap->device_lock");
12194 +       SYSSPIN(mm_page_table_lock, "mm->page_table_lock");
12195 +       SYSSPIN(sb_lock, "&sb_lock");
12196 +       SYSSPIN(page_chain_lock, "page->chain_lock");
12197 +    //removed at 2003.04.04 by akpm@digeo.com
12198 +       //SYSSPIN(dparent_lock, "dparent_lock");
12199 +       SYSSPIN(dcache_lock, "dcache_lock");
12200 +       SYSSPIN(fs_struct_lock, "fs_struct->lock");
12201 +       SYSSPIN(tasklist_lock, "&tasklist_lock");
12202 +       SYSSPIN(sig_siglock, "sig->siglock");
12203 +       SYSSPIN(fown_lock, "fown->lock");
12204 +       SYSSPIN(task_switch_lock, "task->switch_lock");
12205 +       SYSSPIN(task_proc_lock, "task->proc_lock");
12206 +       SYSSPIN(task_alloc_lock, "task->alloc_lock");
12207 +       /* rq->lock is special: it can be unlocked by thread different from locker */
12208 +       SYSSPIN(rq_lock, "rq->lock");
12209 +       SYSSPIN(task_capability_lock, "&task_capability_lock");
12210 +    SYSSPIN(mmlist_lock, "&mmlist_lock");
12211 +       SYSSPIN(files_file_lock, "files->file_lock");
12212 +       SYSSPIN(dn_lock, "&dn_lock");
12213 +       //SYSSPIN(bdev_lock, "&bdev_lock");
12214 +       SYSSPIN(suspend_pagedir_lock, "&suspend_pagedir_lock")
12215 +}
12216 +
12217 +/* dependencies */
12218 +
12219 +ARC(inode_guard, tree_lock, "update_sd_at()");
12220 +ARC(inode_guard, jnode_lock, "update_sd_at()");
12221 +ARC(inode_guard, atom_lock, "update_sd_at()");
12222 +ARC(atom_lock, jnode_lock, "uncapture_block()"); //capture_fuse_jnode_lists()
12223 +ARC(jnode_lock, txnh_lock, "try_capture_block()");
12224 +//already covered
12225 +ARC(atom_lock, txnh_lock, "capture_fuse_txnh_lists()");
12226 +ARC(jnode_lock, tree_lock, "jdrop_in_tree()");
12227 +ARC(tree_lock, cbk_guard, "cbk_cache_invalidate()");
12228 +ARC(dk_lock, tree_lock, "sync_dkeys()");
12229 +ARC(txnmgr_lock, atom_lock, "atom_dec_and_unlock()"); //txnmgr_force_commit_all(),\ncommit_some_atoms(),\nflush_one_atom()");
12230 +ARC(txnmgr_lock, jnode_lock, "atom_begin_andlock()");
12231 +ARC(txnmgr_lock, txnh_lock, "atom_begin_andlock()");
12232 +ARC(i_sem, inode_rw_lock, "unix_file_setattr()");//,\nunix_file_write()");
12233 +ARC(page_lock, i_sem, "reiserfs_unpack()");
12234 +ARC(inode_rw_lock, delete_sema, "shorten()");
12235 +//ARC(delete_sema, txncommit, "reiser4_release_reserved()");
12236 +ARC(flush_sema, longterm_lock, "flush_scan_left()");//,\nflush_allocate_znode_update(),\nflush_scan_formatted(),\nflush_pos_to_child_and_alloc()");
12237 +ARC(longterm_lock, page_lock, "cbk_level_lookup()");
12238 +ARC(commit_sema, page_lock, "submit_write()");
12239 +ARC(pfault, mm_mmap_sem, "handle_page_fault()");
12240 +ARC(page_lock, pfault, "extent_write_flow()");
12241 +ARC(mm_mmap_sem, kalloc, "unix_file_readpage()");
12242 +
12243 +//ARC(inode_rw_lock, mm_mmap_sem, "unix_file_filemap_nopage()", style=dotted, dir=back);
12244 +//ARC(mm_mmap_sem, kalloc, "DEAD2", style="dotted");
12245 +ARC(kalloc, jnode_lock, "emergency_flush()");
12246 +ARC(longterm_lock, jnode_lock, "longterm_unlock_znode()");//,\nflush_allocate_znode()");
12247 +
12248 +ARC(kalloc, inode_guard, "eflush_add()");
12249 +ARC(ktxnmgrd_lock, txnmgr_lock, "commit_some_atoms()");
12250 +
12251 +//already covered
12252 +ARC(mapping_i_shared_sem, mapping_private_lock, "__set_page_dirty_buffers()");
12253 +//already covered
12254 +ARC(mapping_i_shared_sem, mapping_page_lock, "");
12255 +ARC(mapping_i_shared_sem, mm_page_table_lock, "vma_link()");
12256 +
12257 +ARC(inode_lock, mapping_page_lock, "__sync_single_inode()");
12258 +ARC(inode_lock, sb_lock, "writeback_inodes()");
12259 +
12260 +ARC(mm_page_table_lock, swap_device_lock, "try_to_unmap_one()");
12261 +ARC(mm_page_table_lock, mapping_private_lock, "try_to_unmap_one()");
12262 +//already covered
12263 +ARC(mm_page_table_lock, mapping_page_lock, "try_to_unmap_one()");
12264 +
12265 +ARC(mm_mmap_sem, mapping_i_shared_sem, "do_mmap_pgoff()");
12266 +
12267 +ARC(swaplock, swap_device_lock, "swap_info_get()");
12268 +ARC(swap_device_lock, mapping_page_lock, "exclusive_swap_page()");
12269 +
12270 +ARC(page_lock, page_chain_lock, "shrink_list()");
12271 +ARC(mm_page_table_lock, page_chain_lock, "page_add_rmap()");//,\npage_remove_rmap()");
12272 +ARC(mapping_page_lock, zone_lru_lock, "add_to_page_cache()");//,\nfilemap_fdatawait()");
12273 +ARC(mm_page_table_lock, zone_lru_lock, "page_add_rmap()");//,\npage_remove_rmap()");
12274 +ARC(zone_lru_lock, page_chain_lock, "rmap.c");
12275 +
12276 +ARC(cache_chain_sem, kalloc, "cpuup_callback()");
12277 +//ARC(cache_chain_sem, pfault, "kmem_cache_create()");
12278 +
12279 +//obsolete ARC(dcache_lock, dparent_lock, "d_move()");
12280 +ARC(fs_struct_lock, dcache_lock, "set_fs_pwd()");//,\nset_fs_root()");
12281 +
12282 +ARC(namespace_sem, i_sem, "sys_pivot_root()");
12283 +
12284 +ARC(sb_s_lock, txncommit, "reiser4_write_super()");
12285 +ARC(sb_s_umount, txncommit, "reiser4_kill_super()");
12286 +
12287 +ARC(task_switch_lock, rq_lock, "finish_arch_switch()");
12288 +ARC(task_proc_lock, tasklist_lock, "unhash_process()"); // de_thread()
12289 +ARC(task_proc_lock, dcache_lock, "proc_pid_unhash()");
12290 +
12291 +ARC(tasklist_lock, sig_siglock, "de_thread()");//,\ndo_notify_parent(),\nsys_tkill(),\ncopy_process()"); //collect_sigign_sigcatch(),\n__exit_sighand(),\nfreeze_processes()
12292 +ARC(dn_lock, fown_lock, "__inode_dir_notify()");
12293 +ARC(fown_lock, tasklist_lock, "send_sigio()");//,\nsend_sigurg()");
12294 +ARC(tasklist_lock, task_alloc_lock, "chroot_fs_refs()");
12295 +ARC(tasklist_lock, rq_lock, "setscheduler()");
12296 +ARC(task_capability_lock, tasklist_lock, "sys_capget()");//,\nsys_capset()");
12297 +ARC(task_alloc_lock, files_file_lock, "match_comm()");//,\nmatch_pid()");
12298 +
12299 +ARC(mmlist_lock, mm_page_table_lock, "unuse_process()");
12300 +
12301 +ARC(tree_lock, zone_lock, "page_clear_jnode()");//,\njrelse_nolock()");
12302 +ARC(tree_lock, zone_lru_lock, "page_clear_jnode()");//,\njrelse_nolock()");
12303 +ARC(tree_lock, mapping_page_lock, "jdrop_in_tree()");
12304 +ARC(tree_lock, epoch_lock, "zget()");
12305 +ARC(tree_lock, zgen_lock, "zget()");
12306 +
12307 +ARC(bkl, inode_lock, "iget()");
12308 +
12309 +ARC(jnode_lock, mapping_page_lock, "jnode_set_dirty()");
12310 +ARC(jnode_lock, zone_lru_lock, "jnode_set_dirty()");
12311 +
12312 +ARC(I_LOCK, longterm_lock, "reiser4_iget()");
12313 +
12314 +//one cannot wait for atom event keeping longterm lock
12315 +ARC(atom_event, longterm_lock, "flush");
12316 +//one cannot wait for atom event keeping page lock
12317 +ARC(atom_event, page_lock, "jnode_extent_write()");
12318 +ARC(zlock, stack_lock, "longterm_lock_znode()");//,\nlongterm_unlock_znode(), wake_up_all_lopri_owners()");
12319 +
12320 +ARC(atom_lock, stack_lock, "check_not_fused_lock_owners()");//atom_send_event()
12321 +ARC(txnh_lock, stack_lock, "check_not_fused_lock_owners()");
12322 +ARC(fq_lock, stack_lock, "wakeup_atom_waitfor_list()");
12323 +ARC(atom_lock, fq_lock, "detach_fq()");
12324 +ARC(jnode_lock, zlock, "check_not_fused_lock_owners()");
12325 +ARC(txnh_lock, zlock, "check_not_fused_lock_owners()");
12326 +
12327 +ARC(suspend_pagedir_lock, zone_lock, "do_magic_suspend_2()");
12328 +ARC(cachep_spinlock, zone_lock, "cache_flusharray()");
12329 +
12330 +ARC(mapping_page_lock, zone_lock, "add_to_page_cache()"); // find_lock_page
12331 +ARC(mapping_page_lock, zone_lru_lock, "add_to_page_cache()"); // find_lock_page
12332 +ARC(mm_page_table_lock, zone_lock, "try_to_unmap_one()"); // get_user_pages, do_wp_page, do_anonymous_page, do_no_page
12333 +ARC(mm_page_table_lock, zone_lru_lock, "try_to_unmap_one()"); // get_user_pages, do_wp_page, do_anonymous_page, do_no_page
12334 +ARC(jnode_lock, zone_lock, "page_clear_jnode()"); // uncapture_page, extent_write_flow
12335 +ARC(jnode_lock, zone_lru_lock, "page_clear_jnode()"); // uncapture_page, extent_write_flow
12336 +ARC(jnode_lock, jload_lock, "reiser4_releasepage()");
12337 +ARC(atom_lock, super_guard, "grabbed2flush_reserved_nolock()");
12338 +
12339 +ARC(jnode_lock, eflush_guard, "eflush_add()");
12340 +ARC(inode_guard, d_lock, "detach_fsdata()");
12341 +}
12342 diff -rupN linux-2.6.8-rc3/fs/reiser4/doc/metadata-in-pagecache linux-2.6.8-rc3-a/fs/reiser4/doc/metadata-in-pagecache
12343 --- linux-2.6.8-rc3/fs/reiser4/doc/metadata-in-pagecache        1970-01-01 03:00:00.000000000 +0300
12344 +++ linux-2.6.8-rc3-a/fs/reiser4/doc/metadata-in-pagecache      2004-08-05 21:20:53.104656000 +0400
12345 @@ -0,0 +1,57 @@
12346 +Hello,
12347 +
12348 +In upcoming reiser4 we are planning to use page cache to store all file system
12349 +meta data. In some cases it is straightforward; for example, bitmaps blocks,
12350 +placed on the disk through (almost) equal intervals ask to be bound to special
12351 +fake inode and indexed by their disk offsets.
12352 +
12353 +There is one important (most important actually) case where using fake inode
12354 +is inconvenient: blocks of internal balanced tree used by reiser4, known as
12355 +"formatted nodes". Natural solution of using block number as offset within
12356 +some fake inode doesn't pass, because when block size is smaller than page
12357 +some blocks mapped to the same page may be either occupied by something other
12358 +than formatted nodes, or just be free.
12359 +
12360 +This leads to the following complications:
12361 +
12362 + 1. we cannot simply use block_{read|write}_full_page(), because this will
12363 + waste IO bandwidth: block that doesn't contain formatted node will be read
12364 + into memory. Moreover, this block can be later read again, for example,
12365 + because this is data block of some file and hashed into different place in
12366 + the page cache, creating alias. This will definitely confuse buffer cache;
12367 +
12368 + 2. even if we keep track of what blocks have to be actually read, there still
12369 + will be "internal memory fragmentation", because some parts of page cache
12370 + pages will be unused.
12371 +
12372 +In brief, formatted nodes form a tree and because of this don't fit into
12373 +<inode, offset> hashing scheme---there is no linear ordering among them.
12374 +
12375 +Moreover, formatted node is never looked up in the page cache by its block
12376 +number, because for each formatted node in memory there is special data
12377 +structure (znode) and znodes are hashed in the hash table anyway.
12378 +
12379 +So, all functionality that we need from the page cache is memory allocator
12380 +with attached memory pressure hooks (I guess, this is close to what Hans
12381 +called "sub-cache" in lkml discussions on this topic).
12382 +
12383 +It seems that we have two solutions:
12384 +
12385 + 1. change page cache to use different indexing for formatted nodes;
12386 +
12387 + 2. implement our own memory allocator sitting directly on the top of
12388 + alloc_pages() and installing proper ->mapping for pages that it grabs.
12389 +
12390 +(2) will only work if generic VM code (e.g., shrink_cache() or
12391 +page_launder_zone() in rmap VM) doesn't depend on particulars of page cache
12392 +hashing, which, fortunately, seems to be the case. This approach has following
12393 +advantages:
12394 +
12395 + . we can try to collocate related blocks on the same page, for example
12396 + blocks from the same transaction, or blocks with similar cache "hotness";
12397 +
12398 + . we can use blocks larger than page size.
12399 +
12400 +Nikita.
12401 +
12402 +
12403 diff -rupN linux-2.6.8-rc3/fs/reiser4/doc/oid-locid linux-2.6.8-rc3-a/fs/reiser4/doc/oid-locid
12404 --- linux-2.6.8-rc3/fs/reiser4/doc/oid-locid    1970-01-01 03:00:00.000000000 +0300
12405 +++ linux-2.6.8-rc3-a/fs/reiser4/doc/oid-locid  2004-08-05 21:20:53.308612981 +0400
12406 @@ -0,0 +1,108 @@
12407 +MIME-Version: 1.0
12408 +Content-Type: text/plain; charset=us-ascii
12409 +Content-Transfer-Encoding: 7bit
12410 +Message-ID: <15392.39020.573047.826769@laputa.namesys.com>
12411 +Date: Wed, 19 Dec 2001 16:38:52 +0300
12412 +To: Reiserfs developers mail-list <Reiserfs-Dev@Namesys.COM>
12413 +Subject: [RFC]: objectids and localities management
12414 +X-Mailer: VM 6.96 under 21.4 (patch 3) "Academic Rigor" XEmacs Lucid
12415 +FCC: ~/documents/mail/outgoing
12416 +--text follows this line--
12417 +Hello,
12418 +
12419 +there is one thing that seems awkward in current reiser{fs|4} design: in
12420 +a key we have both locality id (locid) and object id (oid). This is
12421 +slightly illogical because oid alone is unique, but we cannot find an
12422 +object given oid. This was, by the way, main reason behind our NFS
12423 +troubles. So, why is this strictly necessary? I'll try to reason from
12424 +the "first principles". Following account doesn't pretend to be of any
12425 +historical accuracy of course.
12426 +
12427 +1. In a data structure we use to store objects (tree) items
12428 +   with close keys are packed into the same disk block. This means that
12429 +   we cannot completely separate key allocation from block
12430 +   allocation. That is,
12431 +
12432 +      - tree forces us to encode disk location preferences in a key. (A1)
12433 +
12434 +2. If we cannot completely separate key and block allocation let's try
12435 +   instead to blend them together. That is, we rely on block allocator
12436 +   to follow tree ordering and topology: blocks containing items with
12437 +   close keys are allocated close on disk and blocks contiguous in tree
12438 +   order are more or less contiguous on disk. How far bitmap.c fulfill
12439 +   or can fulfill these goals is out of the scope of this discussion,
12440 +
12441 +      - let's suppose that we have ideal block allocator. (A2)
12442 +
12443 +3. Given this, why cannot we encode disk location preferences in oid
12444 +   alone? Because oid has to be unique and we cannot predict how many
12445 +   objects we are going to group together in a future (how many objects
12446 +   there will be in a directory that is). That is, suppose we create two
12447 +   directories "a" and "b" in succession. If oid were the only thing to
12448 +   store location preference, then we should leave after the oid of "a"
12449 +   enough unused oids for all objects within "a", but we don't know how
12450 +   many of them will be there.
12451 +
12452 +4. To solve this (locid, oid) scheme was born. It has following
12453 +   advantages:
12454 +
12455 +      - it is simple to implement
12456 +      - it allows one to encode enough location preference into the key (A3)
12457 +
12458 +But the more people used reiserfs and the more files they started to
12459 +store in a single directory, the less valid (A3) became. oid became
12460 +inadequate location preference, because while it allows to separate
12461 +files from different directories it doesn't allow to order files within
12462 +single directory. For example readdir of big directory is slow, because
12463 +files are not sorted within directory. Various ad-hoc solutions have
12464 +been proposed (oid==hash, add "band" to oid, etc), but there is obvious
12465 +conflict between requirement that oid is unique and desire to encode
12466 +additional information in it. In effect all such solutions amount to
12467 +further splitting of (locid,oid) pair into (locid, someid, oid) for the
12468 +reasons similar to those on the steps 3,4 above.
12469 +
12470 +The scheme proposed below tries to meet following goals:
12471 +
12472 + G1. only keep unique oid in a key, thus making it possible to find file
12473 +     given its inode number and additionally shrink key, increasing
12474 +     fanout.
12475 +
12476 + G2. allow configurable amount of fine-grained locality preference
12477 +     information to be associated with each oid, thus allowing files
12478 +     to be ordered in a tree according to some hierarchical "packing
12479 +     localities", for example: first order files by oid of parent
12480 +     directory, then by hash of name within this directory.
12481 +
12482 +
12483 +Proposal:
12484 +
12485 +Maintain separate map (oidlocmap, implementation discussed below) from
12486 +oid to "locpref", where locpref is additional fine-grained location
12487 +preference data, associated with oid. For example locpref may be just
12488 +(locid) to emulate existing behavior, or (locid, hash) or (locid,
12489 +user-supplied-grouping-info), etc.
12490 +
12491 +Key only contains oid, that is, ceteris paribus, key has form
12492 +(item-type, oid, offset). If oid is 60 bits, this is 16 bytes.
12493 +
12494 +Ordering of items within tree (and, given (A2), their ordering on disk)
12495 +is completely determined by keycmp() function that compares two
12496 +keys. Before comparing two keys, keycmp(k1, k2) consults oidlocmap and
12497 +obtains locprefs, associated with oids of k1 and k2. locprefs then are
12498 +"pasted" into k1 and k2, producing "expanded" keys, containing full
12499 +location preferences information. Expanded keys are compared as usual.
12500 +
12501 +In simplest case oidlocmap can be implemented as normal balanced tree,
12502 +where keys are oids (60 bits) and values locprefs. If we limit ourselves
12503 +to fixed format of locpref (at least per file system), then we get
12504 +standard text-book balanced tree storing values of fixed size which is
12505 +simple to implement.
12506 +
12507 +There is of course overhead of maintaining oidlocmap and, especially, of
12508 +consulting it on each keycmp(), but it looks to me that it will be not
12509 +that significant, because oidlocmap is compact and will be outweighed
12510 +by increased fanout in the main tree.
12511 +
12512 +Comments?
12513 +
12514 +Nikita.
12515 diff -rupN linux-2.6.8-rc3/fs/reiser4/doc/page-cache-for-formatted-nodes linux-2.6.8-rc3-a/fs/reiser4/doc/page-cache-for-formatted-nodes
12516 --- linux-2.6.8-rc3/fs/reiser4/doc/page-cache-for-formatted-nodes       1970-01-01 03:00:00.000000000 +0300
12517 +++ linux-2.6.8-rc3-a/fs/reiser4/doc/page-cache-for-formatted-nodes     2004-08-05 21:20:52.953687843 +0400
12518 @@ -0,0 +1,60 @@
12519 +PROPOSAL:
12520 +
12521 +Keep formatted nodes in a page cache, binding them to the special fake inode
12522 +and using block number divided by number of blocks in a page as page index.
12523 +
12524 +ADVANTAGES:
12525 +
12526 +Page cache is preferred over buffer cache. Much more optimization and
12527 +scalability efforts are going into it. The fewer separate caches are in the
12528 +system, the simpler and better VM can handle load.
12529 +
12530 +DISADVANTAGES:
12531 +
12532 +As formatted nodes are indexed by block number, each page will contain
12533 +blocks with consecutive block numbers. This poses several problems:
12534 +
12535 +  1. When we need to read particular block from the disk (e.g., to load child
12536 +  node during tree lookup), it is not clear that blocks with neighboring block
12537 +  numbers are worth reading into memory at all.
12538 +
12539 +  2. Some of the blocks that have to go in the same page as block we need can
12540 +  be unformatted ones.
12541 +
12542 +SOLUTIONS:
12543 +
12544 +There are several possible workarounds:
12545 +
12546 +  1. rely on the fact that in vast majority of cases block size is equal to
12547 +  the page size. So, we can index formatted nodes by block number storing
12548 +  exactly one block in the page. This will eliminate both problems at the
12549 +  expense of the memory wasting in the setups where block size is smaller than
12550 +  page size.
12551 +
12552 +  2. only load required block in the page marking other blocks mapped to this
12553 +  page as up-to-date. It is not obvious that this will work at all, and in any
12554 +  case, this will force us to use special API to access such pages, bypassing
12555 +  VM interface.
12556 +
12557 +  3. rely on good repacker and load all blocks in the page hoping that they
12558 +  are close to each other in tree order and will be accessed shortly.
12559 +
12560 +  4. allocate unformatted nodes such that they will never go into the same
12561 +  frame as formatted. For example:
12562 +
12563 +    - always align extent to the page boundary on the disk (page is CPU
12564 +    specific though);
12565 +
12566 +    - use some variation of border algorithm to separate formatted and
12567 +    unformatted nodes;
12568 +
12569 +    - use "enriched" bitmap where formatted and unformatted nodes are
12570 +    distinguishable.
12571 +
12572 +
12573 +# Local variables:
12574 +# mode-name: "proposal"
12575 +# indent-tabs-mode: nil
12576 +# tab-width: 4
12577 +# eval: (if (fboundp 'flyspell-mode) (flyspell-mode))
12578 +# End:
12579 diff -rupN linux-2.6.8-rc3/fs/reiser4/doc/plugin.inheritance linux-2.6.8-rc3-a/fs/reiser4/doc/plugin.inheritance
12580 --- linux-2.6.8-rc3/fs/reiser4/doc/plugin.inheritance   1970-01-01 03:00:00.000000000 +0300
12581 +++ linux-2.6.8-rc3-a/fs/reiser4/doc/plugin.inheritance 2004-08-05 21:20:53.332607919 +0400
12582 @@ -0,0 +1,119 @@
12583 +
12584 +                                Report about "plugin inheritance discussion"
12585 +
12586 +    1. Basic plugin support, psets, default plugins.
12587 +
12588 +    2. Plugin inheritance.
12589 +
12590 +    3. Meta-data inheritance, light-weight files.
12591 +
12592 +1. Basic plugin support, psets, default plugins.
12593 +
12594 +    Let's call Reiser4 file system object "active" when it is used by the
12595 +    kernel, that is, when initialized inode exists for it. Associated with
12596 +    each active object is its "plugin set" ("pset" for short) that is an array
12597 +    of all plugins necessary for proper interaction with this object. Pointer
12598 +    to pset is stored in inode. Pset is constructed when:
12599 +
12600 +        1. new object is created, or
12601 +
12602 +        2. existing object is looked up.
12603 +
12604 +    New object is always created as a child of some already existing
12605 +    object. During object creation its pset is constructed on the basis of
12606 +    parent's one---this is plugin inheritance. Details of plugin inheritance
12607 +    are delegated to the object plugin of new object for flexibility.
12608 +
12609 +    File system has "default pset". In current implementation it is just pset
12610 +    of the root directory, created by mkfs.reiser4.
12611 +
12612 +    When stat-data is saved to disk, pset is saved as part of stat-data. At
12613 +    least this is what default static stat-data plugin does. More advanced
12614 +    stat-data plugins are free to save psets separately, implement sharing,
12615 +    etc.
12616 +
12617 +    As an optimization, only plugins different from default ones are stored in
12618 +    stat-data. Correspondingly, when object is looked up, plugins found in
12619 +    stat-data are installed into pset, and missing plugins are taken from the
12620 +    default pset.
12621 +
12622 +    Plugins in pset can be divided into two types:
12623 +
12624 +        1. "essential"---ones that cannot be changed without some explicit
12625 +        effort. For example, hash and fibration plugins are essential, because
12626 +        changing them would render directory content invalid.
12627 +
12628 +        2. "non-essential"---plugins that can be changed implicitly. For
12629 +        example, security plugin and formatting-policy plugin are
12630 +        non-essential.
12631 +
12632 +    From previous description it is clear that essential plugins in default
12633 +    pset cannot be modified once file system was created, because this would
12634 +    implicitly change plugins of all objects in whose stat-data appropriate
12635 +    plugin is missing, which is contrary to the definition of essential
12636 +    plugin.
12637 +
12638 +    This poses a problem: what to do when new member is added to pset
12639 +    (consider recent addition of fibration plugin)? And, conversely, what to
12640 +    do when mounting a file system with unknown member in default pset?
12641 +
12642 +    The former is only an issue for essential plugins. When new essential
12643 +    plugin is added to pset, backward-compatible implementation of this plugin
12644 +    should be provided as default. That is, for example, when kernel with
12645 +    support for fibration mounts file system without fibration plugin in the
12646 +    root-directory stat-data, "lexicographic" fibration plugin should be
12647 +    used. This guarantees that old file-systems can be used without corrupting
12648 +    them. Of course, new versions of mkfs.reiser4 can set up whatever
12649 +    fibration plugin is deemed best to be default.
12650 +
12651 +    "Forward-compatibility" that is, mounting a file system with
12652 +    unknown plugin in default pset, can be simply refused.
12653 +
12654 +2. Plugin inheritance.
12655 +
12656 +    In addition to pset each active object also has a "hset"---"heir
12657 +    set". When new child is created, it first tries to inherit plugins from
12658 +    parent's hset, and only if plugin is missing there---from parent's
12659 +    pset. hset is treated exactly like pset in all other respects. NOTE:
12660 +    storing hset on disk is not yet implemented.
12661 +
12662 +    One question still remains to be answered: how object plugin of a child
12663 +    being created is selected? One possible solution is to add two new members
12664 +    PSET_CREAT, and PSET_MKDIR to the pset. They specify object plugins used
12665 +    when child is being created through sys_creat() and sys_mkdir() system
12666 +    calls. (Other system calls, such as sys_symlink() and sys_mknod() are too
12667 +    specialized for such flexibility.) NOTE: this is also not yet implemented.
12668 +
12669 +3. Meta-data inheritance, light-weight files.
12670 +
12671 +    Through meta-data inheritance file system object can somehow indicate that
12672 +    some portion of its meta-data should be taken from some place other than
12673 +    object's stat-data. Three obvious scenarios for meta-data inheritance are:
12674 +
12675 +        1. meta-data are taken from file-system level default place,
12676 +
12677 +        2. meta-data are taken from some specially indicated place (i.e.,
12678 +        stat-data contains a key of item(s) where meta-data have to be taken
12679 +        from), and
12680 +
12681 +        3. meta-data are taken from the parent.
12682 +
12683 +    Note, that the last option is ambiguous, because the notion of _the_
12684 +    parent is not well-defined in general. This can be worked around in two
12685 +    ways:
12686 +
12687 +        1. only use it when there is _the_ parent, for example, disable
12688 +        light-weight files with multiple names, or
12689 +
12690 +        2. don't care, for example, allow uid of light-weight file to depend
12691 +        on path-name through which this file was reached.
12692 +
12693 +    In any case, meta-data inheritance can be implemented by re-using existing
12694 +    static stat-data item plugin with simple additional plumbing in the kernel
12695 +    code (pointer to parent inode should be passed to the stat-data
12696 +    methods). It is not clear what to do when light-weight file is accessed
12697 +    through NFS, and there is no parent. Simplest solution is to just disable
12698 +    NFS access to them. This is trivial, because our ->{en,de}code_fh()
12699 +    methods are delegated to object plugin.
12700 +
12701 +
12702 diff -rupN linux-2.6.8-rc3/fs/reiser4/doc/readdir-problems-and-implementations linux-2.6.8-rc3-a/fs/reiser4/doc/readdir-problems-and-implementations
12703 --- linux-2.6.8-rc3/fs/reiser4/doc/readdir-problems-and-implementations 1970-01-01 03:00:00.000000000 +0300
12704 +++ linux-2.6.8-rc3-a/fs/reiser4/doc/readdir-problems-and-implementations       2004-08-05 21:20:52.873704713 +0400
12705 @@ -0,0 +1,12 @@
12706 +1.
12707 +
12708 +User level API.
12709 +
12710 +Standard
12711 +
12712 +^ Local variables:
12713 +^ mode-name: "Design Document"
12714 +^ indent-tabs-mode: nil
12715 +^ tab-width: 4
12716 +^ eval: (if (fboundp 'flyspell-mode) (flyspell-mode))
12717 +^ End:
12718 diff -rupN linux-2.6.8-rc3/fs/reiser4/doc/reiser4.writeback.overview linux-2.6.8-rc3-a/fs/reiser4/doc/reiser4.writeback.overview
12719 --- linux-2.6.8-rc3/fs/reiser4/doc/reiser4.writeback.overview   1970-01-01 03:00:00.000000000 +0300
12720 +++ linux-2.6.8-rc3-a/fs/reiser4/doc/reiser4.writeback.overview 2004-08-05 21:20:53.102656422 +0400
12721 @@ -0,0 +1,68 @@
12722 +Hello,
12723 +
12724 +reiser4 has some features that make it somewhat difficult to integrate with
12725 +existing VM mechanisms.
12726 +
12727 +Reiser4 maintains all meta data in the single balanced tree. This tree is
12728 +maintained in the memory in the form different from what will be ultimately
12729 +written to the disk. Roughly speaking, before writing tree node to the disk,
12730 +some complex process ("flush") is to be performed. This process has following
12731 +characteristics:
12732 +
12733 + 1 it is not local, that is it operates on big number of nodes, possibly far
12734 +   away from the starting node, both in tree and disk order.
12735 +
12736 + 2 it can involve reading of the large number of nodes from the disk (for
12737 +   example, bitmap nodes are read during extent allocation that is deferred
12738 +   until flush).
12739 +
12740 + 3 it can allocate unbounded amount of memory (during insertion of allocated
12741 +   extents).
12742 +
12743 + 4 it participates in the locking protocol which reiser4 uses to implement
12744 +   concurrent tree modifications.
12745 +
12746 + 5 it is CPU consuming and long
12747 +
12748 +As a result, flush reorganizes some part of reiser4 tree and produces large
12749 +queue of nodes ready to be submitted for io (as a matter of fact, flush write
12750 +clustering is so good that it used to hit BIO_MAX_PAGES all the time, until
12751 +checks were added for this).
12752 +
12753 +Items (3) and (4) alone make flush unsuitable for being called directly from
12754 +reiser4 ->vm_writeback() callback, because of OOM and deadlocks against
12755 +threads waiting for memory.
12756 +
12757 +So, it was decided that flush has to be performed from the separate
12758 +thread. Reiser4 has thread used to periodically commit old transactions and
12759 +this thread can be used for the flushing. That is, flushing thread does flush
12760 +and accumulates nodes prepared for the IO on the special
12761 +queue. reiser4_vm_writeback() submits nodes from this queue, if queue is
12762 +empty, it only wakes up flushing thread and immediately returns.
12763 +
12764 +Still there are some problems with integrating this stuff into VM scanning:
12765 +
12766 + 1 As ->vm_writeback() returns immediately without actually submitting pages
12767 +   for IO, throttling on PG_writeback in shrink_list() will not work. This
12768 +   opens a possibility (on a fast CPU), of try_to_free_pages() completing
12769 +   scanning and calling out_of_memory() before flushing thread managed to add
12770 +   anything to the queue.
12771 +
12772 + 2 It is possible, however unlikely, that flushing thread will be unable to flush
12773 +   anything, because there is not enough memory. In this case reiser4 resorts
12774 +   to the "emergency flush": some dumb algorithm that writes tree nodes to the
12775 +   disk without taking locks and without optimizing tree layout.
12776 +
12777 + 3 Nodes prepared for IO can be from the active list, this means that they
12778 +   will not be met/freed by shrink_list() after IO completion. New
12779 +   blk_congestion_wait() should help here though.
12780 +
12781 +It looks like we need following changes to make this stuff working:
12782 +
12783 + 1 Adding ->priority field into struct writeback_control, so that file system
12784 +   can vary its behavior depending on how desperate memory pressure is.
12785 +
12786 + 2 Different mechanism for scan throttling.
12787 +
12788 +Actually latter can be implemented completely within reiser4 but with some
12789 +awkwardness.
12790 diff -rupN linux-2.6.8-rc3/fs/reiser4/doc/set-theoretic-stuff.tex linux-2.6.8-rc3-a/fs/reiser4/doc/set-theoretic-stuff.tex
12791 --- linux-2.6.8-rc3/fs/reiser4/doc/set-theoretic-stuff.tex      1970-01-01 03:00:00.000000000 +0300
12792 +++ linux-2.6.8-rc3-a/fs/reiser4/doc/set-theoretic-stuff.tex    2004-08-05 21:20:53.502572070 +0400
12793 @@ -0,0 +1,82 @@
12794 +\documentclass[a4paper, oneside, fleqn]{article}
12795 +
12796 +\usepackage{latexsym}
12797 +\usepackage{url}
12798 +\usepackage[T2A]{fontenc}
12799 +
12800 +\pagestyle{empty}
12801 +\listfiles
12802 +\setcounter{errorcontextlines}{100}
12803 +\makeindex
12804 +\pagestyle{headings}
12805 +\frenchspacing
12806 +\tolerance=1000
12807 +\parindent=0pt
12808 +\raggedbottom
12809 +\setlength\parskip{6pt}
12810 +
12811 +\DeclareMathAlphabet{\mathbsf}{T2A}{cmss}{b}{n}
12812 +\SetMathAlphabet{\mathbsf}{normal}{T2A}{cmss}{b}{n}
12813 +
12814 +\def\qopname@#1{\mathop{\fam 0#1}\nolimits}
12815 +\newcommand{\mathsign}[1]
12816 +       {\index{#1@$\mathbsf{#1}$}\qopname@{\mathbsf{#1}}}
12817 +
12818 +\def\As{\mathsign{Assoc}}
12819 +\newcommand{\svi}[2]
12820 +    {\texttt{[} #1 \ #2 \texttt{]}}
12821 +
12822 +\begin{document}
12823 +
12824 +\thispagestyle{empty}
12825 +
12826 +%\section{Definitions}
12827 +
12828 +We have a set $X$ of objects, and ``associated-with'' relation. We shall write
12829 +
12830 +$$a\As b, \quad a\in X, \ b\in X$$
12831 +
12832 +to denote that $a$ is associated with $b$.
12833 +
12834 +One can imagine $\As$ relation as graph where elements of $X$ are nodes and
12835 +where there is arc (arrow) from $a$ to $b$ iff $a$ is associated with
12836 +$b$. Note that no further restrictions are placed on $\As$. In particular, it
12837 +is not supposed that $\As$ is reflexive (object is not necessarily associated
12838 +with itself), symmetric, or transitive.
12839 +
12840 +$\beta(X)$ is set of all subsets of $X$, that is $$\beta(X) = \{ U \subseteq X
12841 +\}$$
12842 +
12843 +Let's define function $A:X\to^{}\beta(X)$ as follows:
12844 +
12845 +$$A(x)=\{y\in X\ |\ y\As x\}, \quad x\in X.$$
12846 +
12847 +that is $A(x)$ is a set of all objects in $X$ associated with $x$.
12848 +Then, define \mbox{$A^*:\beta(X)\to^{}\beta(X)$} as follows:
12849 +
12850 +$$A^*(U)=\bigcup\limits_{x\in U} A(x), \quad U\subseteq X.$$
12851 +
12852 +that is, $A^*(U)$ is set of all objects associated with any element of $U$. Now
12853 +we can define $\svi{U}{V}$, where $U, V\subseteq X$---``set vicinity
12854 +intersection'' operation as:
12855 +
12856 +%\begin{displaymath}
12857 +%A^+(U) = \left\{
12858 +%    \begin{array}{rl}
12859 +%    U = \{x\}      & \Rightarrow A(x),\\
12860 +%    \textrm{else}  & \Rightarrow A^*(U)
12861 +%    \end{array} \right.
12862 +%\end{displaymath}
12863 +
12864 +$$\svi{U}{V} = A^*(U) \cap A^*(V).$$
12865 +
12866 +In other words, $\svi{U}{V}$ is the set of all objects associated with some
12867 +element of $U$ \emph{and} some element of $V$.
12868 +
12869 +\end{document}
12870 +
12871 +% Local variables:
12872 +% indent-tabs-mode: nil
12873 +% tab-width: 4
12874 +% eval: (progn (if (fboundp 'flyspell-mode) (flyspell-mode)) (set (make-local-variable 'compile-command) "latex set-theoretic-stuff.tex ; dvips -o set-theoretic-stuff.ps set-theoretic-stuff.dvi"))
12875 +% End:
12876 diff -rupN linux-2.6.8-rc3/fs/reiser4/doc/sys-reiser4-implemenation-overview linux-2.6.8-rc3-a/fs/reiser4/doc/sys-reiser4-implemenation-overview
12877 --- linux-2.6.8-rc3/fs/reiser4/doc/sys-reiser4-implemenation-overview   1970-01-01 03:00:00.000000000 +0300
12878 +++ linux-2.6.8-rc3-a/fs/reiser4/doc/sys-reiser4-implemenation-overview 2004-08-05 21:20:52.790722217 +0400
12879 @@ -0,0 +1,222 @@
12880 +SYS_REISER4 IMPLEMENTATION OVERVIEW
12881 +
12882 +
12883 +A. Basics
12884 +*****************************************************************
12885 +
12886 +The sys_reiser4() system call executes a sequence of actions upon the
12887 +file-system(s). Actions are specified by the user in the form of a command
12888 +string. For the purposes of present discussion, said command string can be
12889 +thought of as a program in a special purpose programming language, which will
12890 +be further referred to as reiser4_lang.
12891 +
12892 +Canonical example of reiser4_lang program is
12893 +
12894 +/dir1/dir2/dir3/file1 <- /dir4/dir5/dir6/file2
12895 +
12896 +Its semantics are as follows:
12897 +
12898 +1. resolve "/dir1/dir2/dir3/file1" into file-system object (lookup operation)
12899 +2. resolve "/dir4/dir5/dir6/file2" into file-system object (lookup operation)
12900 +3. assign latter to the former.
12901 +
12902 +This is "assignment" operator. Assignment involves two "file-system objects"
12903 +and semantics of both lookup stage and assignment proper depends upon the type
12904 +of the file-system object.
12905 +
12906 +Following types of file-system objects are recognized:
12907 +
12908 +1. foreign objects: objects of different file-systems. Foreign object cannot
12909 +be target or source of an assignment. Rather, foreign objects can only appear
12910 +during path name lookup, while traversing non-reiser4 part of the file-system
12911 +name-space. Probably one should distinguish between objects belonging to
12912 +different file-system types (ext2, NFS) and objects belonging to different
12913 +reiser4 mounts. After sys_reiser4() is stable, foreign objects will be more
12914 +fully supported.
12915 +
12916 +2. reiser4 objects.
12917 +
12918 +3. pseudo-objects: these are entities injected into reiser4 name-space to
12919 +provide uniform access to various file-system meta-data. Pseudo-objects are
12920 +(usually) attached to some particular "host" object. [In the initial version,]
12921 +host objects are reiser4 objects. [Later it is possible to implement some
12922 +pseudo-objects for foreign objects.] Convention (but not enforced rule) is
12923 +that pseudo-objects are accessible through names starting with some well-known
12924 +prefix (".." is current favorite). Examples: ..owner, ..acl, etc. See comment
12925 +at the top of fs/reiser4/plugin/pseudo/pseudo.c for more details.
12926 +
12927 +B. lnodes
12928 +*****************************************************************
12929 +
12930 +lnodes are handles for file-system objects described above. They serve dual
12931 +purpose:
12932 +
12933 +1. uniform interface to the various types of objects. This allows the
12934 +reiser4_lang implementation to treat various types of objects in the same
12935 +manner. When new type of object has to be added, all changes will be grouped
12936 +in one place, rather than scattered across various files. This uniformity also
12937 +allows code sharing between reiser4_lang and VFS access paths. For example,
12938 +the same ->write method can be used by both. That is, ->read(), and ->write()
12939 +plugin methods used in VFS access paths will take lnode(s) as arguments and
12940 +can share code with sys_reiser4() implementation. For example, assignment is
12941 +particular case of write (or vice versa, depending on point of view).
12942 +
12943 +
12944 +2. synchronization. reiser4_lang doesn't use inodes and this poses a problem of
12945 +synchronization with VFS. Each lnode serves as a lock. See lnode.c for more
12946 +details.
12947 +
12948 +C. lookup
12949 +*****************************************************************
12950 +
12951 +reiser4_lang still supports only two traditional UNIX kinds of ordered names
12952 +(pathnames): absolute and relative to the current working directory. In both
12953 +cases, lookup starts from some file-system object represented by lnode. Then
12954 +lookup proceeds component-by-component as follows:
12955 +
12956 +   lnode *parent;
12957 +   lnode  child;
12958 +
12959 +   ret_code = lnode_get_dir_plugin( parent ) -> lnode_by_name( parent,
12960 +                                                               path_component,
12961 +                                                               &child );
12962 +
12963 +1. Abovementioned locking issues require that parent lnode has to be kept
12964 +until operation on child finishes. In effect we get lock-coupling much like in
12965 +internal tree traversal. Also, possibility to use lock on node with directory
12966 +entry instead of object lock was discussed. We have to think more on this.
12967 +
12968 +
12969 +2. Mount points crossing. It is possible, because dentries and therefore
12970 +inodes of all mount points are pinned in memory and lookup code can check at
12971 +each step whether mount point is crossed. Details are not very nice, because
12972 +for each inode in a path we have to scan list of all its dentries and check
12973 +whether correct one (corresponding to our path) is mount point.
12974 +
12975 +3. It is also possible to pass ->lnode_by_name the whole of the remaining
12976 +name, and let it decide how much of it it should handle. This will complicate
12977 +locking somewhat. But this is doable, though requires changes to the parser.
12978 +
12979 +
12980 +D. assignment
12981 +*****************************************************************
12982 +
12983 +Assignment A<-B basically means duplicating content of B into A. No
12984 +copy-on-write optimizations will be in version 4.0.
12985 +
12986 +Assignment implementation is based on the notion of flow (flow_t). Flow is a
12987 +source from which data can be obtained. Flow can be "backed up" by one of the
12988 +following:
12989 +
12990 +1. memory area in user space. (char *area, size_t length)
12991 +2. memory area in kernel space. (caddr_t *area, size_t length)
12992 +3. file-system object (lnode *obj, loff_t offset, size_t length)
12993 +
12994 +Main function to manipulate flows is:
12995 +
12996 +int flow_place( flow_t *flow, char *area, size_t length );
12997 +
12998 +It copies @length bytes of @flow into @area and updates @flow correspondingly.
12999 +Behavior of flow_place() depends on the type of entity backing up @flow. If
13000 +@flow is based on the kernel-space area, memmove() is used to copy data. If
13001 +@flow is based on the user-space area, copy_from_user() is used. If @flow is
13002 +based on file-system object, flow_place() loads object's data into page cache
13003 +and copies them into @area.
13004 +
13005 +Thus, assignment code looks like following:
13006 +
13007 +typedef int ( *connect_t )( sink_t *target, flow_t *source );
13008 +
13009 +int reiser4_assign( lnode *dst, lnode *src )
13010 +{
13011 +    flow_t        source;
13012 +    sink_t        target;
13013 +    int           ret_code;
13014 +    file_plugin  *src_fplug;
13015 +    file_plugin  *dst_fplug;
13016 +    connect_t     connection;
13017 +
13018 +    /* get file plugins of the source and destination objects */
13019 +
13020 +    src_fplug = lnode_get_file_plugin( src );
13021 +    dst_fplug = lnode_get_file_plugin( dst );
13022 +
13023 +    /* build source flow, starting at offset 0 of @src */
13024 +    ret_code = src_fplug -> build_flow( src, &source, 0 /* offset */ );
13025 +
13026 +    /* build target sink. NOTE(review): neither ret_code is checked in this sketch */
13027 +    ret_code = dst_fplug -> build_sink( dst, &target, 0 /* offset */ );
13028 +
13029 +    /*
13030 +     * select how to transfer data from @src to @dst.
13031 +     *
13032 +     * The default implementation of this is common_transfer() (see below).
13033 +     *
13034 +     * A smart file plugin can choose the connection based on the type of @dst.
13035 +     *
13036 +     */
13037 +    connection = src_fplug -> select_connection( src, dst );
13038 +
13039 +    /* do the transfer and return its result */
13040 +    return connection( &target, &source );
13041 +}
13042 +
13043 +
13044 +/* look at the conversion chain (lnode * dst) -> (sink_t target) -> (lnode * dst):
13045 + I think the functions build_sink(...) and sink_object(...) are superfluous */
13046 +
13047 +int common_transfer( sink_t *target, flow_t *source )
13048 +{
13049 +    lnode  *dst;
13050 +
13051 +    dst = sink_object( target );
13052 +    while( flow_not_empty( source ) ) {
13053 +        char   *area;
13054 +        size_t  length;
13055 +
13056 +        /*
13057 +         * append some space to @target. A reasonable implementation will
13058 +         * allocate several pages' worth here
13059 +         */
13060 +        ret_code = lnode_get_body_plugin( dst ) -> prepare_append( dst,
13061 +                                                                   &area,
13062 +                                                                   &length );
13063 +                                            /* why does @length not depend on @source? */
13064 +        /*
13065 +         * put data from the flow into the newly allotted space. This also
13066 +         * updates @source.
13067 +         */
13068 +        flow_place( source, area, length );
13069 +        /*
13070 +         * perform post-write activity required by @dst plugin (encryption, compression,
13071 +         * etc.) and release pages. NOTE(review): ret_code is undeclared; nothing is returned.
13072 +         */
13073 +        ret_code = lnode_get_body_plugin( dst ) -> commit_append( dst,
13074 +                                                                  area, length );
13075 +    }
13076 +}
13077 +
13078 +
13079 +E. parsing
13080 +*****************************************************************
13081 +
13082 +It is not clear what parts of reiser4_lang processing should go into the
13083 +kernel. In any case, providing a direct system call as the main (or, worse, the
13084 +only) way to access reiser4_lang functionality binds us to maintain binary
13085 +compatibility in the future. To avoid this, reiser4 should be shipped with
13086 +a user-level library, containing
13087 +
13088 +int reiser4( const char *cmd, size_t length );
13089 +
13090 +function. For now, this function will directly dispatch @cmd to
13091 +sys_reiser4(); in the future, it may do parsing itself and pass the parse tree
13092 +to the kernel interpreter.
13093 +
13094 +*****************************************************************
13095 +
13096 +# Local variables:
13097 +# mode-name: "proposal"
13098 +# indent-tabs-mode: nil
13099 +# tab-width: 4
13100 +# eval: (if (fboundp 'flyspell-mode) (flyspell-mode))
13101 +# End:
13102 diff -rupN linux-2.6.8-rc3/fs/reiser4/dscale.c linux-2.6.8-rc3-a/fs/reiser4/dscale.c
13103 --- linux-2.6.8-rc3/fs/reiser4/dscale.c 1970-01-01 03:00:00.000000000 +0300
13104 +++ linux-2.6.8-rc3-a/fs/reiser4/dscale.c       2004-08-05 21:20:53.131650306 +0400
13105 @@ -0,0 +1,173 @@
13106 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
13107 + * reiser4/README */
13108 +
13109 +/* Scalable on-disk integers */
13110 +
13111 +/*
13112 + * Various on-disk structures contain integer-like structures. Stat-data
13113 + * contain [yes, "data" is plural, check the dictionary] file size, link
13114 + * count; extent unit contains extent width etc. To accommodate for general
13115 + * case enough space is reserved to keep largest possible value. 64 bits in
13116 + * all cases above. But in overwhelming majority of cases numbers actually
13117 + * stored in these fields will be comparatively small and reserving 8 bytes is
13118 + * a waste of precious disk bandwidth.
13119 + *
13120 + * Scalable integers are one way to solve this problem. dscale_write()
13121 + * function stores __u64 value in the given area consuming from 1 to 9 bytes,
13122 + * depending on the magnitude of the value supplied. dscale_read() reads value
13123 + * previously stored by dscale_write().
13124 + *
13125 + * dscale_write() produces a format not completely unlike UTF: the two highest
13126 + * bits of the first byte are used to store "tag". One of 4 possible tag
13127 + * values is chosen depending on the number being encoded:
13128 + *
13129 + *           0 ... 0x3f               => 0           [table 1]
13130 + *        0x40 ... 0x3fff             => 1
13131 + *      0x4000 ... 0x3fffffff         => 2
13132 + *  0x40000000 ... 0xffffffffffffffff => 3
13133 + *
13134 + * (see dscale_range() function)
13135 + *
13136 + * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
13137 + * to be stored, so in this case there is no place in the first byte to store
13138 + * tag. For such values tag is stored in an extra 9th byte.
13139 + *
13140 + * As _highest_ bits are used for the test (which is natural) scaled integers
13141 + * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
13142 + * uses LITTLE-ENDIAN.
13143 + *
13144 + */
13145 +
13146 +#include "debug.h"
13147 +#include "dscale.h"
13148 +
13149 +/* return tag (0..3) of the scaled integer stored at @address */
13150 +static int gettag(const unsigned char *address)
13151 +{
13152 +       /* tag is stored in the two highest bits of the first byte */
13153 +       return (*address) >> 6;
13154 +}
13155 +
13156 +/* clear the @tag bits embedded into @value, leaving only the stored number. */
13157 +static void cleartag(__u64 *value, int tag)
13158 +{
13159 +       /*
13160 +        * W-w-what ?!
13161 +        *
13162 +        * Actually, this is rather simple: @value passed here was read by
13163 +        * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
13164 +        * zeroes. Tag is still stored in the highest (arithmetically)
13165 +        * non-zero bits of @value, but relative position of tag within __u64
13166 +        * depends on @tag.
13167 +        *
13168 +        * For example if @tag is 0, it's stored in the 2 highest bits of lowest
13169 +        * byte, and its offset (counting from lowest bit) is 8 - 2 == 6 bits.
13170 +        *
13171 +        * If tag is 1, it's stored in two highest bits of 2nd lowest byte,
13172 +        * and its offset is (2 * 8) - 2 == 14 bits.
13173 +        *
13174 +        * See table 1 above for details.
13175 +        *
13176 +        * All these cases are captured by the formula:
13177 +        */
13178 +       *value &= ~(3 << (((1 << tag) << 3) - 2));
13179 +       /*
13180 +        * That is, clear two (3 == 0b11) bits at the offset 8 * (2 ^ tag) - 2,
13181 +        * that is, the two highest bits of the (2 ^ tag)-th byte of @value.
13182 +        *
13183 +        * NOTE(review): for @tag == 2 the int constant 3 is shifted left by 30
13184 +        * bits, overflowing a 32-bit signed int; an unsigned constant seems safer.
13185 +        */
13186 +}
13187 +
13188 +/* return tag (0..3) for @value, chosen by its magnitude. See table 1 above for details. */
13189 +static int dscale_range(__u64 value)
13190 +{
13191 +       if (value > 0x3fffffff)
13192 +               return 3;
13193 +       if (value > 0x3fff)
13194 +               return 2;
13195 +       if (value > 0x3f)
13196 +               return 1;
13197 +       return 0;
13198 +}
13199 +
13200 +/* restore value stored at @address by dscale_write() into @value and return
13201 + * the number of bytes consumed (1, 2, 4 or 9) */
13202 +reiser4_internal int dscale_read(unsigned char *address, __u64 *value)
13203 +{
13204 +       int tag;
13205 +
13206 +       /* read tag from the first byte */
13207 +       tag = gettag(address);
13208 +       switch (tag) {
13209 +       case 3:
13210 +               /* In this case tag is stored in an extra byte, skip this byte
13211 +                * and decode value stored in the next 8 bytes. */
13212 +               *value = __be64_to_cpu(get_unaligned((__u64 *)(address + 1)));
13213 +               /* worst case: 8 bytes for value itself plus one byte for
13214 +                * tag. No tag to clear here, so return directly. */
13215 +               return 9;
13216 +       case 0:
13217 +               *value = get_unaligned(address);
13218 +               break;
13219 +       case 1:
13220 +               *value = __be16_to_cpu(get_unaligned((__u16 *)address));
13221 +               break;
13222 +       case 2:
13223 +               *value = __be32_to_cpu(get_unaligned((__u32 *)address));
13224 +               break;
13225 +       default:
13226 +               return RETERR(-EIO);
13227 +       }
13228 +       /* clear tag embedded into @value */
13229 +       cleartag(value, tag);
13230 +       /* number of bytes consumed is (2 ^ tag)---see table 1. */
13231 +       return 1 << tag;
13232 +}
13233 +
13234 +/* store @value at @address in scaled format; return number of bytes consumed (1, 2, 4 or 9) */
13235 +reiser4_internal int dscale_write(unsigned char *address, __u64 value)
13236 +{
13237 +       int tag;
13238 +       int shift;
13239 +       unsigned char *valarr;
13240 +
13241 +       tag = dscale_range(value);
13242 +       value = __cpu_to_be64(value);
13243 +       valarr = (unsigned char *)&value;
13244 +       shift = (tag == 3) ? 1 : 0;
13245 +       memcpy(address + shift, valarr + sizeof value - (1 << tag), 1 << tag);
13246 +       *address |= (tag << 6);
13247 +       return shift + (1 << tag);
13248 +}
13249 +
13250 +/* number of bytes (1, 2, 4 or 9) required to store @value in scaled format */
13251 +reiser4_internal int dscale_bytes(__u64 value)
13252 +{
13253 +       int bytes;
13254 +
13255 +       bytes = 1 << dscale_range(value);
13256 +       if (bytes == 8)
13257 +               ++ bytes;
13258 +       return bytes;
13259 +}
13260 +
13261 +/* returns true if @value and @other require the same number of bytes to be
13262 + * stored. Used to detect when a data structure (like stat-data) has to be
13263 + * expanded or contracted. */
13264 +reiser4_internal int dscale_fit(__u64 value, __u64 other)
13265 +{
13266 +       return dscale_range(value) == dscale_range(other);
13267 +}
13268 +
13269 +/* Make Linus happy.
13270 +   Local variables:
13271 +   c-indentation-style: "K&R"
13272 +   mode-name: "LC"
13273 +   c-basic-offset: 8
13274 +   tab-width: 8
13275 +   fill-column: 120
13276 +   scroll-step: 1
13277 +   End:
13278 +*/
13279 diff -rupN linux-2.6.8-rc3/fs/reiser4/dscale.h linux-2.6.8-rc3-a/fs/reiser4/dscale.h
13280 --- linux-2.6.8-rc3/fs/reiser4/dscale.h 1970-01-01 03:00:00.000000000 +0300
13281 +++ linux-2.6.8-rc3-a/fs/reiser4/dscale.h       2004-08-05 21:20:52.944689741 +0400
13282 @@ -0,0 +1,27 @@
13283 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
13284 + * reiser4/README */
13285 +
13286 +/* Scalable on-disk integers. See dscale.c for details. */
13287 +
13288 +#if !defined( __FS_REISER4_DSCALE_H__ )
13289 +#define __FS_REISER4_DSCALE_H__
13290 +
13291 +#include "dformat.h"
13292 +
13293 +extern int dscale_read (unsigned char *address, __u64 *value);
13294 +extern int dscale_write(unsigned char *address, __u64 value);
13295 +extern int dscale_bytes(__u64 value);
13296 +extern int dscale_fit  (__u64 value, __u64 other);
13297 +
13298 +/* __FS_REISER4_DSCALE_H__ */
13299 +#endif
13300 +
13301 +/* Make Linus happy.
13302 +   Local variables:
13303 +   c-indentation-style: "K&R"
13304 +   mode-name: "LC"
13305 +   c-basic-offset: 8
13306 +   tab-width: 8
13307 +   fill-column: 120
13308 +   End:
13309 +*/
13310 diff -rupN linux-2.6.8-rc3/fs/reiser4/emergency_flush.c linux-2.6.8-rc3-a/fs/reiser4/emergency_flush.c
13311 --- linux-2.6.8-rc3/fs/reiser4/emergency_flush.c        1970-01-01 03:00:00.000000000 +0300
13312 +++ linux-2.6.8-rc3-a/fs/reiser4/emergency_flush.c      2004-08-05 21:20:52.810717999 +0400
13313 @@ -0,0 +1,925 @@
13314 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
13315 +
13316 +/* This file exists only until VM gets fixed to reserve pages properly, which
13317 + * might or might not be very political. */
13318 +
13319 +/* Implementation of emergency flush. */
13320 +
13321 +/* OVERVIEW:
13322 +
13323 +     Before writing a node to the disk, some complex process (flush.[ch]) is
13324 +     to be performed. Flush is the main necessary preliminary step before
13325 +     writing pages back to the disk, but it has some characteristics that make
13326 +     it completely different from traditional ->writepage():
13327 +
13328 +        1 It operates on a large number of nodes, possibly far away from the
13329 +        starting node, both in tree and disk order.
13330 +
13331 +        2 it can involve reading of nodes from the disk (during extent
13332 +        allocation, for example).
13333 +
13334 +        3 it can allocate memory (during insertion of allocated extents).
13335 +
13336 +        4 it participates in the locking protocol which reiser4 uses to
13337 +        implement concurrent tree modifications.
13338 +
13339 +        5 it is CPU consuming and long
13340 +
13341 +     As a result, flush reorganizes some part of reiser4 tree and produces
13342 +     large queue of nodes ready to be submitted for io.
13343 +
13344 +     Items (3) and (4) alone make flush unsuitable for being called directly
13345 +     from reiser4 ->writepage() callback, because of OOM and deadlocks
13346 +     against threads waiting for memory.
13347 +
13348 +     So, flush is performed from within balance_dirty_pages() path when dirty
13349 +     pages are generated. If balance_dirty_pages() fails to throttle writers
13350 +     and page replacement finds dirty page on the inactive list, we resort to
13351 +     "emergency flush" in our ->vm_writeback().
13352 +
13353 +     Emergency flush is relatively dumb algorithm, implemented in this file,
13354 +     that tries to write tree nodes to the disk without taking locks and without
13355 +     thoroughly optimizing tree layout. We only want to call emergency flush in
13356 +     desperate situations, because it is going to produce sub-optimal disk
13357 +     layouts.
13358 +
13359 +  DETAILED DESCRIPTION
13360 +
13361 +     Emergency flush (eflush) is designed to work as low level mechanism with
13362 +     no or little impact on the rest of (already too complex) code.
13363 +
13364 +     eflush is initiated from ->writepage() method called by VM on memory
13365 +     pressure. It is supposed that ->writepage() is rare call path, because
13366 +     balance_dirty_pages() throttles writes and tries to keep memory in
13367 +     balance.
13368 +
13369 +     eflush main entry point (emergency_flush()) checks whether jnode is
13370 +     eligible for emergency flushing. Check is performed by flushable()
13371 +     function which see for details. After successful check, new block number
13372 +     ("emergency block") is allocated and io is initiated to write jnode
13373 +     content to that block.
13374 +
13375 +     After io is finished, jnode will be cleaned and VM will be able to free
13376 +     page through call to ->releasepage().
13377 +
13378 +     emergency_flush() also contains special case invoked when it is possible
13379 +     to avoid allocation of new node.
13380 +
13381 +     Node selected for eflush is marked (by JNODE_EFLUSH bit in ->flags field)
13382 +     and added to the special hash table of all eflushed nodes. This table
13383 +     doesn't have linkage within each jnode, as this would waste memory in
13384 +     assumption that eflush is rare. Instead, a new small memory object
13385 +     (eflush_node_t) is allocated that contains pointer to jnode, emergency
13386 +     block number, and is inserted into hash table. Per super block counter of
13387 +     eflushed nodes is incremented. See section [INODE HANDLING] below for
13388 +     more on this.
13389 +
13390 +     It should be noted that emergency flush may allocate memory and wait for
13391 +     io completion (bitmap read).
13392 +
13393 +     Basically eflushed node has following distinctive characteristics:
13394 +
13395 +          (1) JNODE_EFLUSH bit is set
13396 +
13397 +          (2) no page
13398 +
13399 +          (3) there is an element in hash table, for this node
13400 +
13401 +          (4) node content is stored on disk in block whose number is stored
13402 +          in the hash table element
13403 +
13404 +  UNFLUSH
13405 +
13406 +      Unflush is the reverse of eflush, that is, the process of bringing the
13407 +      page of an eflushed node back into memory.
13408 +
13409 +      In accordance with the policy that eflush is low level and low impact
13410 +      mechanism, transparent to the rest of the code, unflushing is performed
13411 +      deeply within jload_gfp() which is main function used to load and pin
13412 +      jnode page into memory.
13413 +
13414 +      Specifically, if jload_gfp() determines that it is called on eflushed
13415 +      node it gets emergency block number to start io against from the hash
13416 +      table rather than from jnode itself. This is done in
13417 +      jnode_get_io_block() function. After io completes, hash table element
13418 +      for this node is removed and JNODE_EFLUSH bit is cleared.
13419 +
13420 +  LOCKING
13421 +
13422 +      The page lock is used to avoid eflush/e-unflush/jnode_get_io_block races.
13423 +      emergency_flush() and jnode_get_io_block are called under the page lock.
13424 +      The eflush_del() function (emergency unflush) may be called for a node w/o
13425 +      page attached.  In that case eflush_del() allocates a page and locks it.
13426 +
13427 +  PROBLEMS
13428 +
13429 +  1. INODE HANDLING
13430 +
13431 +      Usually (i.e., without eflush), jnode has a page attached to it. This
13432 +      page pins corresponding struct address_space, and, hence, inode in
13433 +      memory. Once jnode has been eflushed, its page is gone and inode can be
13434 +      wiped out of memory by the memory pressure (prune_icache()). This leads
13435 +      to a number of complications:
13436 +
13437 +           (1) jload_gfp() has to attach jnode to the address space's radix
13438 +           tree. This requires existence of inode.
13439 +
13440 +           (2) normal flush needs jnode's inode to start slum collection from
13441 +           unformatted jnode.
13442 +
13443 +      (1) is really a problem, because it is too late to load inode (which
13444 +      would lead to loading of stat data, etc.) within jload_gfp().
13445 +
13446 +      We, therefore, need some way to protect inode from being recycled while
13447 +      having accessible eflushed nodes.
13448 +
13449 +      I'll describe old solution here so it can be compared with new one.
13450 +
13451 +      Original solution pinned inode by __iget() when first its node was
13452 +      eflushed and released (through iput()) when last was unflushed. This
13453 +      required maintenance of inode->eflushed counter in inode.
13454 +
13455 +      A problem arises if the last name of inode is unlinked when it has eflushed
13456 +      nodes. In this case, last iput() that leads to the removal of file is
13457 +      iput() made by unflushing from within jload_gfp(). Obviously, calling
13458 +      truncate, and tree traversals from jload_gfp() is not a good idea.
13459 +
13460 +      New solution is to pin inode in memory by adding I_EFLUSH bit to its
13461 +      ->i_state field. This protects inode from being evicted by
13462 +      prune_icache().
13463 +
13464 +  DISK SPACE ALLOCATION
13465 +
13466 +      This section will describe how emergency block is allocated and how
13467 +      block counters (allocated, grabbed, etc.) are manipulated. To be done.
13468 +
13469 +   *****HISTORICAL SECTION****************************************************
13470 +
13471 +   DELAYED PARENT UPDATE
13472 +
13473 +     Important point of emergency flush is that update of parent is sometimes
13474 +     delayed: we don't update parent immediately if:
13475 +
13476 +      1 Child was just allocated, but parent is locked. Waiting for parent
13477 +      lock in emergency flush is impossible (deadlockable).
13478 +
13479 +      2 Part of extent was allocated, but parent has not enough space to
13480 +      insert allocated extent unit. Balancing in emergency flush is
13481 +      impossible, because it will possibly wait on locks.
13482 +
13483 +     When we delay update of parent node, we mark it as such (and possibly
13484 +     also mark children to simplify delayed update later). Question: when
13485 +     parent should be really updated?
13486 +
13487 +   WHERE TO WRITE PAGE INTO?
13488 +
13489 +
13490 +     So, it was decided that flush has to be performed from a separate
13491 +     thread. Reiser4 has a thread used to periodically commit old transactions,
13492 +     and this thread can be used for the flushing. That is, flushing thread
13493 +     does flush and accumulates nodes prepared for the IO on the special
13494 +     queue. reiser4_vm_writeback() submits nodes from this queue, if queue is
13495 +     empty, it only wakes up flushing thread and immediately returns.
13496 +
13497 +     Still there are some problems with integrating this stuff into VM
13498 +     scanning:
13499 +
13500 +        1 As ->vm_writeback() returns immediately without actually submitting
13501 +        pages for IO, throttling on PG_writeback in shrink_list() will not
13502 +        work. This opens a possibility (on a fast CPU), of try_to_free_pages()
13503 +        completing scanning and calling out_of_memory() before flushing thread
13504 +        managed to add anything to the queue.
13505 +
13506 +        2 It is possible, however unlikely, that flushing thread will be
13507 +        unable to flush anything, because there is not enough memory. In this
13508 +        case reiser4 resorts to the "emergency flush": some dumb algorithm,
13509 +        implemented in this file, that tries to write tree nodes to the disk
13510 +        without taking locks and without thoroughly optimizing tree layout. We
13511 +        only want to call emergency flush in desperate situations, because it
13512 +        is going to produce sub-optimal disk layouts.
13513 +
13514 +        3 Nodes prepared for IO can be from the active list, this means that
13515 +        they will not be met/freed by shrink_list() after IO completion. New
13516 +        blk_congestion_wait() should help with throttling but not
13517 +        freeing. This is not fatal though, because inactive list refilling
13518 +        will ultimately get to these pages and reclaim them.
13519 +
13520 +   REQUIREMENTS
13521 +
13522 +     To make this work we need at least some hook inside VM scanning which
13523 +     gets triggered after scanning (or scanning with particular priority)
13524 +     failed to free pages. This is already present in the
13525 +     mm/vmscan.c:set_shrinker() interface.
13526 +
13527 +     Another useful thing that we would like to have is passing scanning
13528 +     priority down to the ->vm_writeback() that will allow file system to
13529 +     switch to the emergency flush more gracefully.
13530 +
13531 +   POSSIBLE ALGORITHMS
13532 +
13533 +     1 Start emergency flush from ->vm_writeback after reaching some priority.
13534 +     This allows to implement simple page based algorithm: look at the page VM
13535 +     supplied us with and decide what to do.
13536 +
13537 +     2 Start emergency flush from shrinker after reaching some priority.
13538 +     This delays emergency flush as far as possible.
13539 +
13540 +   *****END OF HISTORICAL SECTION**********************************************
13541 +
13542 +*/
13543 +
13544 +#include "forward.h"
13545 +#include "debug.h"
13546 +#include "page_cache.h"
13547 +#include "tree.h"
13548 +#include "jnode.h"
13549 +#include "znode.h"
13550 +#include "inode.h"
13551 +#include "super.h"
13552 +#include "block_alloc.h"
13553 +#include "emergency_flush.h"
13554 +
13555 +#include <linux/mm.h>
13556 +#include <linux/writeback.h>
13557 +#include <linux/slab.h>
13558 +#include <linux/vmalloc.h>
13559 +#include <linux/swap.h>
13560 +
13561 +#if REISER4_USE_EFLUSH
13562 +
13563 +static int flushable(const jnode * node, struct page *page, int);
13564 +static int needs_allocation(const jnode * node);
13565 +static eflush_node_t *ef_alloc(int flags);
13566 +static reiser4_ba_flags_t ef_block_flags(const jnode *node);
13567 +static int ef_free_block(jnode *node, const reiser4_block_nr *blk, block_stage_t stage, eflush_node_t *ef);
13568 +static int ef_prepare(jnode *node, reiser4_block_nr *blk, eflush_node_t **enode, reiser4_blocknr_hint *hint);
13569 +static int eflush_add(jnode *node, reiser4_block_nr *blocknr, eflush_node_t *ef);
13570 +
13571 +/* slab cache for eflush_node_t's: created in eflush_init(), destroyed in eflush_done() */
13572 +static kmem_cache_t *eflush_slab;
13573 +/* starting block number that ef_prepare() puts into the allocation hint */
13574 +#define EFLUSH_START_BLOCK ((reiser4_block_nr)0)
13575 +
13576 +#define INC_STAT(node, counter) /* bump @counter at @node's level; caller supplies ';' */ \
13577 +       reiser4_stat_inc_at_level(jnode_get_level(node), counter)
13578 +
13579 +/* this function exists only until VM gets fixed to reserve pages properly,
13580 + * which might or might not be very political. */
13581 +/* try to flush locked @page (through the jnode attached to it) to the disk
13582 + *
13583 + * Return 0 if page was successfully paged out. 1 if it is busy, error
13584 + * otherwise. The jnode reference taken here is dropped on every return path.
13585 + */
13586 +reiser4_internal int
13587 +emergency_flush(struct page *page)
13588 +{
13589 +       struct super_block *sb;
13590 +       jnode *node;
13591 +       int result;
13592 +       assert("nikita-2721", page != NULL);
13593 +       assert("nikita-2724", PageLocked(page));
13594 +
13595 +       // warning("nikita-3112", "Emergency flush. Notify Reiser@Namesys.COM");
13596 +
13597 +       /*
13598 +        * Page is locked, hence page<->jnode mapping cannot change.
13599 +        */
13600 +
13601 +       sb = page->mapping->host->i_sb;
13602 +       node = jprivate(page);
13603 +
13604 +       assert("vs-1452", node != NULL);
13605 +
13606 +       jref(node); /* pin the jnode; dropped by jput() on every exit below */
13607 +       INC_STAT(node, vm.eflush.called);
13608 +
13609 +       result = 0;
13610 +       LOCK_JNODE(node);
13611 +       /*
13612 +        * page was dirty and under eflush. This is (only?) possible if page
13613 +        * was re-dirtied through mmap(2) after eflush IO was submitted, but
13614 +        * before ->releasepage() freed page.
13615 +        */
13616 +       eflush_del(node, 1);
13617 +
13618 +       LOCK_JLOAD(node);
13619 +       if (flushable(node, page, 1)) {
13620 +               if (needs_allocation(node)) {
13621 +                       reiser4_block_nr blk;
13622 +                       eflush_node_t *efnode;
13623 +                       reiser4_blocknr_hint hint;
13624 +
13625 +                       blk = 0ull;
13626 +                       efnode = NULL;
13627 +
13628 +                       /* Set JNODE_EFLUSH bit _before_ allocating a block,
13629 +                        * that prevents flush reserved block from using here
13630 +                        * and by a reiser4 flush process  */
13631 +                       JF_SET(node, JNODE_EFLUSH);
13632 +
13633 +                       blocknr_hint_init(&hint);
13634 +
13635 +                       INC_STAT(node, vm.eflush.needs_block);
13636 +                       result = ef_prepare(node, &blk, &efnode, &hint);
13637 +                       if (flushable(node, page, 0) && result == 0) {
13638 +                               assert("nikita-2759", efnode != NULL);
13639 +                               eflush_add(node, &blk, efnode); /* drops jload and jnode locks */
13640 +
13641 +                               result = page_io(page, node, WRITE,
13642 +                                                GFP_NOFS | __GFP_HIGH);
13643 +                               INC_STAT(node, vm.eflush.ok);
13644 +                       } else {
13645 +                               JF_CLR(node, JNODE_EFLUSH);
13646 +                               UNLOCK_JLOAD(node);
13647 +                               UNLOCK_JNODE(node);
13648 +                               if (blk != 0ull) { /* a block was allocated: give it and the record back */
13649 +                                       ef_free_block(node, &blk,
13650 +                                                     hint.block_stage, efnode);
13651 +                                       kmem_cache_free(eflush_slab, efnode);
13652 +                               }
13653 +                               ON_TRACE(TRACE_EFLUSH, "failure-2\n");
13654 +                               result = 1;
13655 +                               INC_STAT(node, vm.eflush.nolonger);
13656 +                       }
13657 +
13658 +                       blocknr_hint_done(&hint);
13659 +               } else {
13660 +                       txn_atom *atom;
13661 +                       flush_queue_t *fq;
13662 +
13663 +                       /* eflush without allocation temporary location for a node */
13664 +                       ON_TRACE(TRACE_EFLUSH, "flushing to relocate place: %llu..", *jnode_get_block(node));
13665 +
13666 +                       /* get flush queue for this node */
13667 +                       result = fq_by_jnode_gfp(node, &fq, GFP_ATOMIC);
13668 +                       if (result) {
13669 +                               jput(node); /* do not leak the reference taken by jref() above */
13670 +                               return result; /* NOTE(review): lock state after a failed fq_by_jnode_gfp() is not visible here */
13671 +                       }
13672 +                       atom = node->atom;
13673 +
13674 +                       if (!flushable(node, page, 1) || needs_allocation(node) || !jnode_is_dirty(node)) {
13675 +                               ON_TRACE(TRACE_EFLUSH, "failure-3\n");
13676 +                               UNLOCK_JLOAD(node);
13677 +                               UNLOCK_JNODE(node);
13678 +                               UNLOCK_ATOM(atom);
13679 +                               fq_put(fq);
13680 +                               jput(node); /* do not leak the reference taken by jref() above */
13681 +                               return 1;
13682 +                       }
13683 +                       /* ok, now we can flush it */
13684 +                       unlock_page(page);
13685 +
13686 +                       queue_jnode(fq, node);
13687 +
13688 +                       UNLOCK_JLOAD(node);
13689 +                       UNLOCK_JNODE(node);
13690 +                       UNLOCK_ATOM(atom);
13691 +
13692 +                       result = write_fq(fq, NULL, 0);
13693 +                       if (result != 0)
13694 +                               lock_page(page);
13695 +
13696 +                       ON_TRACE(TRACE_EFLUSH, "flushed %d blocks\n", result);
13697 +                       /* Even if we wrote nothing, We unlocked the page, so let know to the caller that page should
13698 +                          not be unlocked again */
13699 +                       fq_put(fq);
13700 +               }
13701 +
13702 +       } else {
13703 +               UNLOCK_JLOAD(node);
13704 +               UNLOCK_JNODE(node);
13705 +               ON_TRACE(TRACE_EFLUSH, "failure-1\n");
13706 +               result = 1;
13707 +       }
13708 +
13709 +       jput(node);
13710 +       return result;
13711 +}
13712 +/* may @node (attached to @page) be emergency flushed now? 1 - yes, 0 - no. @check_eflush: also refuse nodes with JNODE_EFLUSH set */
13713 +static int
13714 +flushable(const jnode * node, struct page *page, int check_eflush)
13715 +{
13716 +       assert("nikita-2725", node != NULL);
13717 +       assert("nikita-2726", spin_jnode_is_locked(node));
13718 +       assert("nikita-3388", spin_jload_is_locked(node));
13719 +       /* checks run in order; the first refusal bumps its own vm.eflush.* counter and returns 0 */
13720 +       if (jnode_is_loaded(node)) {             /* loaded */
13721 +               INC_STAT(node, vm.eflush.loaded);
13722 +               return 0;
13723 +       }
13724 +       if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) { /* already pending io */
13725 +               INC_STAT(node, vm.eflush.queued);
13726 +               return 0;
13727 +       }
13728 +       if (JF_ISSET(node, JNODE_EPROTECTED)) {  /* protected from e-flush */
13729 +               INC_STAT(node, vm.eflush.protected);
13730 +               return 0;
13731 +       }
13732 +       if (JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13733 +               INC_STAT(node, vm.eflush.heard_banshee);
13734 +               return 0;
13735 +       }
13736 +       if (page == NULL) {                     /* nothing to flush */
13737 +               INC_STAT(node, vm.eflush.nopage);
13738 +               return 0;
13739 +       }
13740 +       if (PageWriteback(page)) {               /* already under io */
13741 +               INC_STAT(node, vm.eflush.writeback);
13742 +               return 0;
13743 +       }
13744 +       /* don't flush bitmaps or journal records */
13745 +       if (!jnode_is_znode(node) && !jnode_is_unformatted(node)) {
13746 +               INC_STAT(node, vm.eflush.bitmap);
13747 +               return 0;
13748 +       }
13749 +       /* don't flush cluster pages */
13750 +       if (jnode_is_cluster_page(node)) {
13751 +               INC_STAT(node, vm.eflush.clustered);
13752 +               return 0;
13753 +       }
13754 +       if (check_eflush && JF_ISSET(node, JNODE_EFLUSH)) {      /* already flushed */
13755 +               INC_STAT(node, vm.eflush.eflushed);
13756 +               return 0;
13757 +       }
13758 +       return 1;
13759 +}
13760 +
13761 +#undef INC_STAT
13762 +
13763 +/* a block has to be allocated for eflush unless @node is JNODE_RELOC and already has a non-fake block number */
13764 +static int
13765 +needs_allocation(const jnode * node)
13766 +{
13767 +       return !JF_ISSET(node, JNODE_RELOC) || blocknr_is_fake(jnode_get_block(node));
13768 +}
13769 +
13770 +/* key comparison passed to TYPE_SAFE_HASH_DEFINE: keys are equal iff they point to the same jnode */
13771 +static inline int
13772 +jnode_eq(jnode * const * j1, jnode * const * j2)
13773 +{
13774 +       assert("nikita-2733", j1 != NULL);
13775 +       assert("nikita-2734", j2 != NULL);
13776 +
13777 +       return *j1 == *j2;
13778 +}
13779 +
13780 +static ef_hash_table *
13781 +get_jnode_enhash(const jnode *node)
13782 +{
13783 +       /* the eflush hash table is kept in the private data of the super
13784 +        * block that owns the tree @node belongs to; return a pointer to
13785 +        * that table */
13786 +       assert("nikita-2739", node != NULL);
13787 +
13788 +       return &get_super_private(jnode_get_tree(node)->super)->efhash_table;
13789 +}
13790 +/* hash function passed to TYPE_SAFE_HASH_DEFINE: maps jnode pointer *@j to a bucket index of @table */
13791 +static inline __u32
13792 +jnode_hfn(ef_hash_table *table, jnode * const * j)
13793 +{
13794 +       __u32 val;
13795 +
13796 +       assert("nikita-2735", j != NULL);
13797 +       assert("nikita-3346", IS_POW(table->_buckets));
13798 +
13799 +       val = (unsigned long)*j; /* pointer value, narrowed to __u32 by the assignment */
13800 +       val /= sizeof(**j); /* divide by the size of a jnode */
13801 +       return val & (table->_buckets - 1); /* mask works because the bucket count is asserted to be a power of two */
13802 +}
13803 +
13804 +
13805 +/* The hash table definition: "ef" table of eflush_node_t keyed by the jnode pointer in ->node; KMALLOC/KFREE are mapped to vmalloc()/vfree() around the macro (presumably used by its expansion) */
13806 +#define KMALLOC(size) vmalloc(size)
13807 +#define KFREE(ptr, size) vfree(ptr)
13808 +TYPE_SAFE_HASH_DEFINE(ef, eflush_node_t, jnode *, node, linkage, jnode_hfn, jnode_eq);
13809 +#undef KFREE
13810 +#undef KMALLOC
13811 +
13812 +reiser4_internal int
13813 +eflush_init(void)
13814 +{
13815 +       /* create the slab cache that eflush_node_t's are allocated from */
13816 +       eflush_slab = kmem_cache_create("eflush", sizeof (eflush_node_t), 0,
13817 +                                       SLAB_HWCACHE_ALIGN, NULL, NULL);
13818 +       if (eflush_slab != NULL)
13819 +               return 0;
13820 +       return RETERR(-ENOMEM);
13821 +}
13822 +
13823 +reiser4_internal int
13824 +eflush_done(void) /* counterpart of eflush_init() */
13825 +{
13826 +       return kmem_cache_destroy(eflush_slab); /* result of destroying the eflush slab is passed to the caller */
13827 +}
13828 +
13829 +reiser4_internal int
13830 +eflush_init_at(struct super_block *super)
13831 +{
13832 +       int buckets;
13833 +       int result;
13834 +       /* set up the per-super-block eflush hash table; returns what ef_hash_init() returned last */
13835 +       buckets = 1 << fls(nr_free_pagecache_pages() >> 2); /* power of two sized from free page cache pages */
13836 +       do {
13837 +               result = ef_hash_init(&get_super_private(super)->efhash_table,
13838 +                                     buckets,
13839 +                                     reiser4_stat(super, hashes.eflush));
13840 +               buckets >>= 1; /* on -ENOMEM retry with half as many buckets */
13841 +       } while (result == -ENOMEM && buckets > 0); /* give up at 0 buckets instead of retrying forever */
13842 +       return result;
13843 +}
13844 +
13845 +reiser4_internal void
13846 +eflush_done_at(struct super_block *super) /* counterpart of eflush_init_at() */
13847 +{
13848 +       ef_hash_done(&get_super_private(super)->efhash_table); /* release the eflush hash table of @super */
13849 +}
13850 +
13851 +static eflush_node_t *
13852 +ef_alloc(int flags) /* @flags: allocation flags handed to kmem_cache_alloc() */
13853 +{
13854 +       return kmem_cache_alloc(eflush_slab, flags); /* not initialized here; NULL check is left to the caller */
13855 +}
13856 +
13857 +#define EFLUSH_MAGIC 4335203
13858 +/* record that @node was emergency flushed to *@blocknr: fill @ef, hash it, tag unformatted nodes. Entered with jnode and jload locks held, releases both. Always returns 0. */
13859 +static int
13860 +eflush_add(jnode *node, reiser4_block_nr *blocknr, eflush_node_t *ef)
13861 +{
13862 +       reiser4_tree  *tree;
13863 +
13864 +       assert("nikita-2737", node != NULL);
13865 +       assert("nikita-2738", JF_ISSET(node, JNODE_EFLUSH));
13866 +       assert("nikita-3382", !JF_ISSET(node, JNODE_EPROTECTED));
13867 +       assert("nikita-2765", spin_jnode_is_locked(node));
13868 +       assert("nikita-3381", spin_jload_is_locked(node));
13869 +
13870 +       tree = jnode_get_tree(node);
13871 +
13872 +       ef->node = node;
13873 +       ef->blocknr = *blocknr;
13874 +       ef->hadatom = (node->atom != NULL); /* remembered so eflush_free() clears the same radix tree tag */
13875 +       ef->incatom = 0;
13876 +       jref(node); /* reference held by the hash record; eflush_free() does the jput() */
13877 +       spin_lock_eflush(tree->super);
13878 +       ef_hash_insert(get_jnode_enhash(node), ef);
13879 +       ON_DEBUG(++ get_super_private(tree->super)->eflushed);
13880 +       spin_unlock_eflush(tree->super);
13881 +
13882 +       if (jnode_is_unformatted(node)) {
13883 +               struct inode  *inode;
13884 +               reiser4_inode *info;
13885 +
13886 +               WLOCK_TREE(tree);
13887 +
13888 +               inode = mapping_jnode(node)->host;
13889 +               info = reiser4_inode_data(inode);
13890 +               /* tag the jnode's slot in the inode's radix tree: anonymous if it had no atom, captured otherwise */
13891 +               if (!ef->hadatom) {
13892 +                       radix_tree_tag_set(jnode_tree_by_reiser4_inode(info),
13893 +                                          index_jnode(node), EFLUSH_TAG_ANONYMOUS);
13894 +                       ON_DEBUG(info->anonymous_eflushed ++);
13895 +               } else {
13896 +                       radix_tree_tag_set(jnode_tree_by_reiser4_inode(info),
13897 +                                          index_jnode(node), EFLUSH_TAG_CAPTURED);
13898 +                       ON_DEBUG(info->captured_eflushed ++);
13899 +               }
13900 +               WUNLOCK_TREE(tree);
13901 +               /*XXXX*/
13902 +               inc_unfm_ef();
13903 +       }
13904 +
13905 +       /* FIXME: do we need it here, if eflush add/del are protected by page lock? */
13906 +       UNLOCK_JLOAD(node);
13907 +
13908 +       /*
13909 +        * jnode_get_atom() can possible release jnode spin lock. This
13910 +        * means it can only be called _after_ JNODE_EFLUSH is set, because
13911 +        * otherwise we would have to re-check flushable() once more. No
13912 +        * thanks.
13913 +        */
13914 +
13915 +       if (ef->hadatom) {
13916 +               txn_atom *atom;
13917 +
13918 +               atom = jnode_get_atom(node);
13919 +               if (atom != NULL) {
13920 +                       ++ atom->flushed;
13921 +                       ef->incatom = 1; /* tells eflush_free() to decrement atom->flushed again */
13922 +                       UNLOCK_ATOM(atom);
13923 +               }
13924 +       }
13925 +
13926 +       UNLOCK_JNODE(node);
13927 +       return 0;
13928 +}
13929 +
13930 +/* Arrghh... cast to keep hash table code happy. */
13931 +#define C(node) ((jnode *const *)&(node))
13932 +/* return pointer to the block number stored in the eflush record of @node; @node must have JNODE_EFLUSH set and its spin lock held */
13933 +reiser4_internal reiser4_block_nr *
13934 +eflush_get(const jnode *node)
13935 +{
13936 +       eflush_node_t *ef;
13937 +       reiser4_tree  *tree;
13938 +
13939 +       assert("nikita-2740", node != NULL);
13940 +       assert("nikita-2741", JF_ISSET(node, JNODE_EFLUSH));
13941 +       assert("nikita-2767", spin_jnode_is_locked(node));
13942 +
13943 +       /* the hash table is searched under the per-super-block eflush spin lock */
13944 +       tree = jnode_get_tree(node);
13945 +       spin_lock_eflush(tree->super);
13946 +       ef = ef_hash_find(get_jnode_enhash(node), C(node));
13947 +       spin_unlock_eflush(tree->super);
13948 +       /* NOTE(review): the result points into @ef after the eflush lock is dropped; presumably the jnode lock keeps the record alive - confirm */
13949 +       assert("nikita-2742", ef != NULL);
13950 +       return &ef->blocknr;
13951 +}
13952 +
13953 +/* free resources taken for emergency flushing of the node: hash record, radix tree tag, disk block. Entered and left with jnode spin lock held; the lock is dropped in between. */
13954 +static void eflush_free (jnode * node)
13955 +{
13956 +       eflush_node_t *ef;
13957 +       ef_hash_table *table;
13958 +       reiser4_tree  *tree;
13959 +       txn_atom      *atom;
13960 +       struct inode  *inode = NULL;
13961 +       reiser4_block_nr blk;
13962 +
13963 +       assert ("zam-1026", spin_jnode_is_locked(node));
13964 +
13965 +       table = get_jnode_enhash(node);
13966 +       tree = jnode_get_tree(node);
13967 +       /* unhash the record under the eflush lock, remembering its block number */
13968 +       spin_lock_eflush(tree->super);
13969 +       ef = ef_hash_find(table, C(node));
13970 +       BUG_ON(ef == NULL);
13971 +       assert("nikita-2745", ef != NULL);
13972 +       blk = ef->blocknr;
13973 +       ef_hash_remove(table, ef);
13974 +       ON_DEBUG(-- get_super_private(tree->super)->eflushed);
13975 +       spin_unlock_eflush(tree->super);
13976 +
13977 +       if (ef->incatom) { /* undo the ++atom->flushed done by eflush_add() */
13978 +               atom = jnode_get_atom(node);
13979 +               assert("nikita-3311", atom != NULL);
13980 +               -- atom->flushed;
13981 +               UNLOCK_ATOM(atom);
13982 +       }
13983 +
13984 +       assert("vs-1215", JF_ISSET(node, JNODE_EFLUSH));
13985 +
13986 +       if (jnode_is_unformatted(node)) {
13987 +               reiser4_inode *info;
13988 +
13989 +               WLOCK_TREE(tree);
13990 +
13991 +               inode = mapping_jnode(node)->host;
13992 +               info = reiser4_inode_data(inode);
13993 +
13994 +               /* clear e-flush specific tags from node's radix tree slot */
13995 +               radix_tree_tag_clear(
13996 +                       jnode_tree_by_reiser4_inode(info), index_jnode(node),
13997 +                       ef->hadatom ? EFLUSH_TAG_CAPTURED : EFLUSH_TAG_ANONYMOUS);
13998 +               ON_DEBUG(ef->hadatom ? (info->captured_eflushed --) : (info->anonymous_eflushed --));
13999 +
14000 +               assert("nikita-3355", ergo(jnode_tree_by_reiser4_inode(info)->rnode == NULL,
14001 +                                          (info->captured_eflushed == 0 && info->anonymous_eflushed == 0)));
14002 +
14003 +               WUNLOCK_TREE(tree);
14004 +
14005 +               /*XXXX*/
14006 +               dec_unfm_ef();
14007 +
14008 +       }
14009 +       UNLOCK_JNODE(node); /* jnode lock is dropped here and re-taken at the end of the function */
14010 +
14011 +#if REISER4_DEBUG
14012 +       if (blocknr_is_fake(jnode_get_block(node)))
14013 +               assert ("zam-817", ef->initial_stage == BLOCK_UNALLOCATED);
14014 +       else
14015 +               assert ("zam-818", ef->initial_stage == BLOCK_GRABBED);
14016 +#endif
14017 +
14018 +       jput(node); /* drops the reference taken by jref() in eflush_add(); @node is still used below */
14019 +
14020 +       ef_free_block(node, &blk,
14021 +                     blocknr_is_fake(jnode_get_block(node)) ?
14022 +                     BLOCK_UNALLOCATED : BLOCK_GRABBED, ef);
14023 +
14024 +       kmem_cache_free(eflush_slab, ef);
14025 +
14026 +       LOCK_JNODE(node);
14027 +}
14028 +/* undo emergency flush of @node (no-op unless JNODE_EFLUSH is set). Called with jnode spin lock held; it may be dropped and re-taken. @page_locked: caller already holds the page lock. */
14029 +reiser4_internal void eflush_del (jnode * node, int page_locked)
14030 +{
14031 +        struct page * page;
14032 +
14033 +        assert("nikita-2743", node != NULL);
14034 +        assert("nikita-2770", spin_jnode_is_locked(node));
14035 +
14036 +        if (!JF_ISSET(node, JNODE_EFLUSH))
14037 +                return;
14038 +
14039 +        if (page_locked) {
14040 +                page = jnode_page(node);
14041 +                assert("nikita-2806", page != NULL);
14042 +                assert("nikita-2807", PageLocked(page));
14043 +        } else {
14044 +                UNLOCK_JNODE(node); /* jnode lock is released around jnode_get_page_locked() */
14045 +                page = jnode_get_page_locked(node, GFP_NOFS);
14046 +                LOCK_JNODE(node);
14047 +                if (page == NULL) {
14048 +                        warning ("zam-1025", "eflush_del failed to get page back\n");
14049 +                        return; /* JNODE_EFLUSH stays set in this case */
14050 +                }
14051 +                if (unlikely(!JF_ISSET(node, JNODE_EFLUSH)))
14052 +                        /* race: some other thread unflushed jnode. */
14053 +                        goto out;
14054 +        }
14055 +
14056 +        if (PageWriteback(page)) {
14057 +                UNLOCK_JNODE(node); /* jnode lock is released while waiting for writeback */
14058 +                page_cache_get(page);
14059 +                reiser4_wait_page_writeback(page);
14060 +                page_cache_release(page);
14061 +                LOCK_JNODE(node);
14062 +                if (unlikely(!JF_ISSET(node, JNODE_EFLUSH)))
14063 +                        /* race: some other thread unflushed jnode. */
14064 +                        goto out;
14065 +        }
14066 +
14067 +       if (JF_ISSET(node, JNODE_KEEPME))
14068 +               set_page_dirty(page);
14069 +       else
14070 +               /*
14071 +                * either jnode was dirty or page was dirtied through mmap. Page's dirty
14072 +                * bit was cleared before io was submitted. If page is left clean, we
14073 +                * would have dirty jnode with clean page. Neither ->writepage() nor
14074 +                * ->releasepage() can free it. Re-dirty page, so ->writepage() will be
14075 +                * called again if necessary.
14076 +                */
14077 +               set_page_dirty_internal(page, 0);
14078 +
14079 +        assert("nikita-2766", atomic_read(&node->x_count) > 1);
14080 +        /* release allocated disk block and in-memory structures  */
14081 +        eflush_free(node);
14082 +        JF_CLR(node, JNODE_EFLUSH);
14083 + out:
14084 +        if (!page_locked) /* unlock only a page that was locked in this function */
14085 +                unlock_page(page);
14086 +}
14087 +
14088 +reiser4_internal int
14089 +emergency_unflush(jnode *node)
14090 +{
14091 +       int result;
14092 +
14093 +       assert("nikita-2778", node != NULL);
14094 +       assert("nikita-3046", schedulable());
14095 +
14096 +       if (JF_ISSET(node, JNODE_EFLUSH)) {
14097 +               result = jload(node);
14098 +               if (result == 0) {
14099 +                       struct page *page;
14100 +
14101 +                       assert("nikita-2777", !JF_ISSET(node, JNODE_EFLUSH));
14102 +                       page = jnode_page(node);
14103 +                       assert("nikita-2779", page != NULL);
14104 +                       wait_on_page_writeback(page);
14105 +
14106 +                       jrelse(node);
14107 +               }
14108 +       } else
14109 +               result = 0;
14110 +       return result;
14111 +}
14112 +/* block allocator flags used for eflush blocks of @node: BA_FORMATTED for znodes, 0 for all other jnodes */
14113 +static reiser4_ba_flags_t
14114 +ef_block_flags(const jnode *node)
14115 +{
14116 +       return jnode_is_znode(node) ? BA_FORMATTED : 0;
14117 +}
14118 +/* deallocate eflush block *@blk of @node; for @stage == BLOCK_GRABBED also move the freed block out of grabbed space. Returns the result of reiser4_dealloc_block(). */
14119 +static int ef_free_block(jnode *node,
14120 +                        const reiser4_block_nr *blk,
14121 +                        block_stage_t stage, eflush_node_t *ef)
14122 +{
14123 +       int result = 0;
14124 +
14125 +       /* We cannot just ask block allocator to return block into flush
14126 +        * reserved space, because there is no current atom at this point. */
14127 +       result = reiser4_dealloc_block(blk, stage, ef_block_flags(node));
14128 +       if (result == 0 && stage == BLOCK_GRABBED) {
14129 +               txn_atom *atom;
14130 +
14131 +               if (ef->reserve) { /* set by ef_prepare() when the block came from the flush reserved space */
14132 +                       /* further, transfer block from grabbed into flush
14133 +                        * reserved space. */
14134 +                       LOCK_JNODE(node);
14135 +                       atom = jnode_get_atom(node);
14136 +                       assert("nikita-2785", atom != NULL);
14137 +                       grabbed2flush_reserved_nolock(atom, 1);
14138 +                       UNLOCK_ATOM(atom);
14139 +                       JF_SET(node, JNODE_FLUSH_RESERVED);
14140 +                       UNLOCK_JNODE(node);
14141 +               } else { /* otherwise one grabbed block is handed to grabbed2free() for the current context */
14142 +                       reiser4_context * ctx = get_current_context();
14143 +                       grabbed2free(ctx, get_super_private(ctx->super),
14144 +                                    (__u64)1);
14145 +               }
14146 +       }
14147 +       return result;
14148 +}
14149 +/* get disk space, an eflush_node_t (*@efnode) and a block (*@blk) for eflushing @node. Entered and left with jnode and jload locks held; they are dropped around the allocations. 0 on success. */
14150 +static int
14151 +ef_prepare(jnode *node, reiser4_block_nr *blk, eflush_node_t **efnode, reiser4_blocknr_hint * hint)
14152 +{
14153 +       int result;
14154 +       int usedreserve;
14155 +
14156 +       assert("nikita-2760", node != NULL);
14157 +       assert("nikita-2761", blk != NULL);
14158 +       assert("nikita-2762", efnode != NULL);
14159 +       assert("nikita-2763", spin_jnode_is_locked(node));
14160 +       assert("nikita-3387", spin_jload_is_locked(node));
14161 +
14162 +       hint->blk         = EFLUSH_START_BLOCK;
14163 +       hint->max_dist    = 0;
14164 +       hint->level       = jnode_get_level(node);
14165 +       usedreserve = 0;
14166 +       if (blocknr_is_fake(jnode_get_block(node)))
14167 +               hint->block_stage = BLOCK_UNALLOCATED;
14168 +       else {
14169 +               txn_atom *atom;
14170 +               switch (jnode_is_leaf(node)) { /* non-zero (leaf) enters at "default", zero at "case 0" */
14171 +               default:
14172 +                       /* We cannot just ask block allocator to take block from
14173 +                        * flush reserved space, because there is no current
14174 +                        * atom at this point. */
14175 +                       atom = jnode_get_atom(node);
14176 +                       if (atom != NULL) {
14177 +                               if (JF_ISSET(node, JNODE_FLUSH_RESERVED)) {
14178 +                                       usedreserve = 1;
14179 +                                       flush_reserved2grabbed(atom, 1);
14180 +                                       JF_CLR(node, JNODE_FLUSH_RESERVED);
14181 +                                       UNLOCK_ATOM(atom);
14182 +                                       break;
14183 +                               } else
14184 +                                       UNLOCK_ATOM(atom);
14185 +                       }
14186 +                       /* fall through */
14187 +                       /* node->atom == NULL if page was dirtied through
14188 +                        * mmap */
14189 +               case 0:
14190 +                       result = reiser4_grab_space_force((__u64)1, BA_RESERVED);
14191 +                       grab_space_enable();
14192 +                       if (result) {
14193 +                               warning("nikita-3323",
14194 +                                       "Cannot allocate eflush block");
14195 +                               return result; /* locks are still held here: they have not been dropped yet */
14196 +                       }
14197 +               }
14198 +
14199 +               hint->block_stage = BLOCK_GRABBED;
14200 +       }
14201 +
14202 +       /* XXX protect @node from being concurrently eflushed. Otherwise,
14203 +        * there is a danger of underflowing block space */
14204 +       UNLOCK_JLOAD(node);
14205 +       UNLOCK_JNODE(node);
14206 +
14207 +       *efnode = ef_alloc(GFP_NOFS | __GFP_HIGH);
14208 +       if (*efnode == NULL) {
14209 +               result = RETERR(-ENOMEM);
14210 +               goto out;
14211 +       }
14212 +
14213 +#if REISER4_DEBUG
14214 +       (*efnode)->initial_stage = hint->block_stage;
14215 +#endif
14216 +       (*efnode)->reserve = usedreserve; /* consulted later by ef_free_block() */
14217 +
14218 +       result = reiser4_alloc_block(hint, blk, ef_block_flags(node));
14219 +       if (result)
14220 +               kmem_cache_free(eflush_slab, *efnode); /* NOTE(review): *efnode is left pointing at the freed record */
14221 + out:
14222 +       LOCK_JNODE(node);
14223 +       LOCK_JLOAD(node);
14224 +       return result;
14225 +}
14226 +
14227 +#endif /* REISER4_USE_EFLUSH */
14228 +
14229 +/* Make Linus happy.
14230 +   Local variables:
14231 +   c-indentation-style: "K&R"
14232 +   mode-name: "LC"
14233 +   c-basic-offset: 8
14234 +   tab-width: 8
14235 +   fill-column: 80
14236 +   LocalWords: " unflush eflushed LocalWords eflush writepage VM releasepage unflushing io "
14237 +   End:
14238 +*/
14239 diff -rupN linux-2.6.8-rc3/fs/reiser4/emergency_flush.h linux-2.6.8-rc3-a/fs/reiser4/emergency_flush.h
14240 --- linux-2.6.8-rc3/fs/reiser4/emergency_flush.h        1970-01-01 03:00:00.000000000 +0300
14241 +++ linux-2.6.8-rc3-a/fs/reiser4/emergency_flush.h      2004-08-05 21:20:53.477577342 +0400
14242 @@ -0,0 +1,75 @@
14243 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14244 +
14245 +/* Emergency flush */
14246 +
14247 +#ifndef __EMERGENCY_FLUSH_H__
14248 +#define __EMERGENCY_FLUSH_H__
14249 +
14250 +#if REISER4_USE_EFLUSH
14251 +
14252 +#include "block_alloc.h"
14253 +
14254 +struct eflush_node;
14255 +typedef struct eflush_node eflush_node_t;
14256 +
14257 +TYPE_SAFE_HASH_DECLARE(ef, eflush_node_t);
14258 +
14259 +struct eflush_node {
14260 +       jnode           *node;
14261 +       reiser4_block_nr blocknr;
14262 +       ef_hash_link     linkage;
14263 +       struct list_head inode_link; /* for per inode list of eflush nodes */
14264 +       struct list_head inode_anon_link;
14265 +       int              hadatom :1;
14266 +       int              incatom :1;
14267 +       int              reserve :1;
14268 +#if REISER4_DEBUG
14269 +       block_stage_t    initial_stage;
14270 +#endif
14271 +};
14272 +
14273 +int eflush_init(void); /* create the slab for eflush_node_t's */
14274 +int eflush_done(void); /* destroy that slab */
14275 +
14276 +extern int  eflush_init_at(struct super_block *super); /* set up the eflush hash table of @super */
14277 +extern void eflush_done_at(struct super_block *super); /* release the eflush hash table of @super */
14278 +
14279 +extern reiser4_block_nr *eflush_get(const jnode *node); /* block number recorded for eflushed @node */
14280 +extern void eflush_del(jnode *node, int page_locked); /* undo emergency flush of @node */
14281 +
14282 +extern int emergency_flush(struct page *page); /* 0: paged out, 1: busy, otherwise error */
14283 +extern int emergency_unflush(jnode *node); /* load eflushed @node back */
14284 +
14285 +/* eflushed jnodes are stored in reiser4_inode's radix tree. Eflushed jnodes may be either "captured" or
14286 + * "anonymous". Use existing tags to tag jnodes in reiser4_inode's tree of eflushed jnodes */
14287 +#define EFLUSH_TAG_ANONYMOUS PAGECACHE_TAG_DIRTY
14288 +#define EFLUSH_TAG_CAPTURED PAGECACHE_TAG_WRITEBACK
14289 +
14290 +#else /* REISER4_USE_EFLUSH */
14291 +/* eflush is compiled out: the interface collapses to no-op macros; emergency_flush() always evaluates to 1 */
14292 +#define eflush_init()  (0)
14293 +#define eflush_done()  (0)
14294 +
14295 +#define eflush_init_at(super) (0)
14296 +#define eflush_done_at(super) (0)
14297 +
14298 +#define eflush_get(node)  NULL
14299 +#define eflush_del(node, flag) do{}while(0)
14300 +
14301 +#define emergency_unflush(node) (0)
14302 +#define emergency_flush(page) (1)
14303 +
14304 +#endif  /* REISER4_USE_EFLUSH */
14305 +
14306 +/* __EMERGENCY_FLUSH_H__ */
14307 +#endif
14308 +
14309 +/* Make Linus happy.
14310 +   Local variables:
14311 +   c-indentation-style: "K&R"
14312 +   mode-name: "LC"
14313 +   c-basic-offset: 8
14314 +   tab-width: 8
14315 +   fill-column: 120
14316 +   End:
14317 +*/
14318 diff -rupN linux-2.6.8-rc3/fs/reiser4/entd.c linux-2.6.8-rc3-a/fs/reiser4/entd.c
14319 --- linux-2.6.8-rc3/fs/reiser4/entd.c   1970-01-01 03:00:00.000000000 +0300
14320 +++ linux-2.6.8-rc3-a/fs/reiser4/entd.c 2004-08-05 21:20:53.357602647 +0400
14321 @@ -0,0 +1,377 @@
14322 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
14323 + * reiser4/README */
14324 +
14325 +/* Ent daemon. */
14326 +
14327 +#include "debug.h"
14328 +#include "kcond.h"
14329 +#include "txnmgr.h"
14330 +#include "tree.h"
14331 +#include "entd.h"
14332 +#include "super.h"
14333 +#include "context.h"
14334 +#include "reiser4.h"
14335 +#include "vfs_ops.h"
14336 +#include "page_cache.h"
14337 +
14338 +#include <linux/sched.h>       /* struct task_struct */
14339 +#include <linux/suspend.h>
14340 +#include <linux/kernel.h>
14341 +#include <linux/writeback.h>
14342 +#include <linux/time.h>         /* INITIAL_JIFFIES */
14343 +#include <linux/backing-dev.h>  /* bdi_write_congested */
14344 +
14345 +TYPE_SAFE_LIST_DEFINE(wbq, struct wbq, link);
14346 +
14347 +#define DEF_PRIORITY 12
14348 +#define MAX_ENTD_ITERS 10
14349 +#define ENTD_ASYNC_REQUESTS_LIMIT 32
14350 +
14351 +static void entd_flush(struct super_block *super);
14352 +static int entd(void *arg);
14353 +
14354 +/*
14355 + * set ->comm field of ent thread to make its state visible to the user level.
14356 + * Uses a variable named "super" from the caller's scope (for ->s_id). */
14357 +#define entd_set_comm(state)                                   \
14358 +       snprintf(current->comm, sizeof(current->comm),  \
14359 +                "ent:%s%s", super->s_id, (state))
14360 +
14361 +/* get ent context for the @super: the entd member of its private data */
14362 +static inline entd_context *
14363 +get_entd_context(struct super_block *super)
14364 +{
14365 +       return &get_super_private(super)->entd;
14366 +}
14367 +
14368 +/* initialize ent context of @super, start the ent thread and wait for it */
14369 +reiser4_internal void
14370 +init_entd_context(struct super_block *super)
14371 +{
14372 +       entd_context * ctx;
14373 +
14374 +       assert("nikita-3104", super != NULL);
14375 +       ctx = get_entd_context(super);
14376 +       xmemset(ctx, 0, sizeof *ctx);
14377 +       kcond_init(&ctx->startup);
14378 +       kcond_init(&ctx->wait);
14379 +       init_completion(&ctx->finish);
14380 +       spin_lock_init(&ctx->guard);
14381 +       /* set up the lists before the thread exists, so that nothing can ever
14382 +        * see (or queue onto) a list head that is still zero-filled */
14383 +#if REISER4_DEBUG
14384 +       flushers_list_init(&ctx->flushers_list);
14385 +#endif
14386 +       wbq_list_init(&ctx->wbq_list);
14387 +
14388 +       /* start ent thread. NOTE(review): the result of kernel_thread() is
14389 +        * ignored; if it fails, the wait below never terminates. */
14390 +       kernel_thread(entd, super, CLONE_VM | CLONE_FS | CLONE_FILES);
14391 +       spin_lock(&ctx->guard);
14392 +       /* and wait for its initialization to finish: entd() sets ->tsk */
14393 +       while (ctx->tsk == NULL)
14394 +               kcond_wait(&ctx->startup, &ctx->guard, 0);
14395 +       spin_unlock(&ctx->guard);
14396 +}
14397 +
14398 +static void wakeup_wbq (entd_context * ent, struct wbq * rq)
14399 +{
14400 +       wbq_list_remove(rq);            /* callers in this file hold ent->guard */
14401 +       ent->nr_synchronous_requests --;
14402 +       rq->wbc->nr_to_write --;        /* charged to the waiter's own wbc */
14403 +       up(&rq->sem);                   /* releases down() in write_page_by_ent() */
14404 +}
14405 +
14406 +/* wake up every writer still queued on @ent. Each wakeup_wbq() call
14407 + * dequeues the front request, so the loop ends once the list is empty. */
14408 +static void wakeup_all_wbq (entd_context * ent)
14409 +{
14410 +       wbq_list_head * queue = &ent->wbq_list;
14411 +
14412 +       spin_lock(&ent->guard);
14413 +       while (!wbq_list_empty(queue))
14414 +               wakeup_wbq(ent, wbq_list_front(queue));
14415 +       spin_unlock(&ent->guard);
14416 +}
14417 +
14418 +/* ent thread function: @arg is the super block; runs until ent->done is set */
14419 +static int
14420 +entd(void *arg)
14421 +{
14422 +       struct super_block *super;
14423 +       struct task_struct *me;
14424 +       entd_context       *ent;
14425 +
14426 +       assert("vs-1655", list_empty(&current->private_pages));
14427 +
14428 +       super = arg;
14429 +       /* standard kernel thread prologue */
14430 +       me = current;
14431 +       /* reparent_to_init() is done by daemonize() */
14432 +       daemonize("ent:%s", super->s_id);
14433 +
14434 +       /* block all signals */
14435 +       spin_lock_irq(&me->sighand->siglock);
14436 +       siginitsetinv(&me->blocked, 0);
14437 +       recalc_sigpending();
14438 +       spin_unlock_irq(&me->sighand->siglock);
14439 +
14440 +       /* do_fork() just copies task_struct into the new
14441 +          thread, so ->journal_info inherited from the parent is reset
14442 +          here. This shouldn't be a problem for the rest of the code though.
14443 +       */
14444 +       me->journal_info = NULL;
14445 +
14446 +       ent = get_entd_context(super);
14447 +
14448 +       spin_lock(&ent->guard);
14449 +       ent->tsk = me;
14450 +       /* signal waiters that initialization is completed */
14451 +       kcond_broadcast(&ent->startup);
14452 +       spin_unlock(&ent->guard);
14453 +       while (1) {
14454 +               int result = 0;
14455 +
14456 +               if (me->flags & PF_FREEZE)
14457 +                       refrigerator(PF_FREEZE);
14458 +
14459 +               spin_lock(&ent->guard);
14460 +               /* serve requests until none are pending */
14461 +               while (ent->nr_all_requests != 0) {
14462 +                       assert("zam-1043", ent->nr_all_requests >= ent->nr_synchronous_requests);
14463 +                       if (ent->nr_synchronous_requests != 0) {
14464 +                               struct wbq * rq = wbq_list_front(&ent->wbq_list);
14465 +                               /* front waiter sat through too many passes: wake it anyway */
14466 +                               if (++ rq->nr_entd_iters > MAX_ENTD_ITERS) {
14467 +                                       ent->nr_all_requests --;
14468 +                                       wakeup_wbq(ent, rq);
14469 +                                       continue;
14470 +                               }
14471 +                       } else {
14472 +                               /* endless loop avoidance: no waiters, retire one request per pass */
14473 +                               ent->nr_all_requests --;
14474 +                       }
14475 +                       /* the flush itself runs without the guard held */
14476 +                       spin_unlock(&ent->guard);
14477 +                       entd_set_comm("!");
14478 +                       entd_flush(super);
14479 +                       spin_lock(&ent->guard);
14480 +               }
14481 +
14482 +               entd_set_comm(".");
14483 +
14484 +               /* wait for work; the guard is held again when the ->done test runs */
14485 +               result = kcond_wait(&ent->wait, &ent->guard, 1);
14486 +               if (result != -EINTR && result != 0)
14487 +                       /* some other error */
14488 +                       warning("nikita-3099", "Error: %i", result);
14489 +
14490 +               /* we are asked to exit (set by done_entd_context()) */
14491 +               if (ent->done) {
14492 +                       spin_unlock(&ent->guard);
14493 +                       break;
14494 +               }
14495 +
14496 +               spin_unlock(&ent->guard);
14497 +       }
14498 +       wakeup_all_wbq(ent);    /* do not leave writers blocked behind us */
14499 +       complete_and_exit(&ent->finish, 0);
14500 +       /* not reached. */
14501 +       return 0;
14502 +}
14503 +
14504 +/* called by umount: ask the ent thread to exit and wait until it has done so */
14505 +reiser4_internal void
14506 +done_entd_context(struct super_block *super)
14507 +{
14508 +       entd_context * ent;
14509 +
14510 +       assert("nikita-3103", super != NULL);
14511 +
14512 +       ent = get_entd_context(super);
14513 +       /* ->done is tested by entd() after it returns from waiting on ->wait */
14514 +       spin_lock(&ent->guard);
14515 +       ent->done = 1;
14516 +       kcond_signal(&ent->wait);
14517 +       spin_unlock(&ent->guard);
14518 +
14519 +       /* wait until daemon finishes: entd() completes ->finish on its way out */
14520 +       wait_for_completion(&ent->finish);
14521 +}
14522 +
14523 +/* called at the beginning of jnode_flush to register flusher thread with ent
14524 + * daemon: bumps ent->flushers and, in debug builds, lists the current context */
14525 +reiser4_internal void enter_flush (struct super_block * super)
14526 +{
14527 +       entd_context * ent;
14528 +
14529 +       assert ("zam-1029", super != NULL);
14530 +       ent = get_entd_context(super);
14531 +
14532 +       assert ("zam-1030", ent != NULL);
14533 +
14534 +       spin_lock(&ent->guard);
14535 +       ent->flushers ++;       /* write_page_by_ent() kicks entd only while this is 0 */
14536 +#if REISER4_DEBUG
14537 +       flushers_list_push_front(&ent->flushers_list, get_current_context());
14538 +#endif
14539 +       spin_unlock(&ent->guard);
14540 +}
14541 +
14542 +/* called at the end of jnode_flush: undoes the registration done by enter_flush() */
14543 +reiser4_internal void leave_flush (struct super_block * super)
14544 +{
14545 +       entd_context * ent;
14546 +
14547 +       assert ("zam-1027", super != NULL);
14548 +       ent = get_entd_context(super);
14549 +
14550 +       assert ("zam-1028", ent != NULL);
14551 +       /* the last flusher to leave wakes entd if writers are still waiting */
14552 +       spin_lock(&ent->guard);
14553 +       ent->flushers --;
14554 +       if (ent->flushers == 0 && ent->nr_synchronous_requests != 0)
14555 +               kcond_signal(&ent->wait);
14556 +#if REISER4_DEBUG
14557 +       flushers_list_remove_clean(get_current_context());
14558 +#endif
14559 +       spin_unlock(&ent->guard);
14560 +}
14561 +
14562 +/* signal to ent thread that it has more work to do (wakes a waiter on ->wait) */
14563 +static void kick_entd(entd_context * ent)
14564 +{
14565 +       kcond_signal(&ent->wait);
14566 +}
14567 +
14568 +static void entd_capture_anonymous_pages(
14569 +       struct super_block * super, struct writeback_control * wbc)
14570 +{
14571 +       spin_lock(&inode_lock);         /* capture_reiser4_inodes() runs under inode_lock */
14572 +       capture_reiser4_inodes(super, wbc);
14573 +       spin_unlock(&inode_lock);
14574 +}
14575 +
14576 +static void entd_flush(struct super_block *super)
14577 +{
14578 +       long            nr_submitted = 0;
14579 +       int             result;
14580 +       reiser4_context ctx;
14581 +       struct writeback_control wbc = {
14582 +               .bdi            = NULL,
14583 +               .sync_mode      = WB_SYNC_NONE,
14584 +               .older_than_this = NULL,
14585 +               .nr_to_write    = 32,
14586 +               .nonblocking    = 0,
14587 +       };
14588 +       /* one flush pass of the ent thread, done in a reiser4 context of its own */
14589 +       init_context(&ctx, super);
14590 +       /* flag the context as belonging to the ent thread */
14591 +       ctx.entd = 1;
14592 +
14593 +       entd_capture_anonymous_pages(super, &wbc);
14594 +       result = flush_some_atom(&nr_submitted, &wbc, 0);
14595 +       if (result != 0)
14596 +               warning("nikita-3100", "Flush failed: %i", result);
14597 +       /* a failed flush is only logged; the context is left the same way either way */
14598 +       context_set_commit_async(&ctx);
14599 +       reiser4_exit_context(&ctx);
14600 +}
14601 +
14602 +void write_page_by_ent (struct page * page, struct writeback_control * wbc)
14603 +{
14604 +       struct super_block * sb;
14605 +       entd_context * ent;
14606 +       struct wbq rq;
14607 +       int phantom;
14608 +
14609 +       sb = page->mapping->host->i_sb;
14610 +       ent = get_entd_context(sb);
14611 +       /* the page is not written here: it is re-dirtied and left to the ent thread */
14612 +       phantom = jprivate(page) == NULL || !jnode_check_dirty(jprivate(page));
14613 +       /* re-dirty page (phantom: no jnode attached, or its jnode is not dirty) */
14614 +       set_page_dirty_internal(page, phantom);
14615 +       /* unlock it to avoid deadlocks with the thread which will do actual i/o  */
14616 +       unlock_page(page);
14617 +       /* entd is not running. NOTE(review): ent is the address of a member of
14618 +          the super private data, so the NULL test looks redundant. */
14619 +       if (ent == NULL || ent->done)
14620 +               return;
14621 +
14622 +       /* init wbq: the request lives on this stack frame */
14623 +       wbq_list_clean(&rq);
14624 +       rq.nr_entd_iters = 0;
14625 +       rq.page = page;
14626 +       rq.wbc = wbc;
14627 +       /* kick entd unless a flusher is active */
14628 +       spin_lock(&ent->guard);
14629 +       if (ent->flushers == 0)
14630 +               kick_entd(ent);
14631 +       ent->nr_all_requests ++;
14632 +       if (ent->nr_all_requests <= ent->nr_synchronous_requests + ENTD_ASYNC_REQUESTS_LIMIT) {
14633 +               spin_unlock(&ent->guard);       /* asynchronous request: do not wait */
14634 +               return;
14635 +       }
14636 +       sema_init(&rq.sem, 0);          /* synchronous request: queue rq and sleep */
14637 +       wbq_list_push_back(&ent->wbq_list, &rq);
14638 +       ent->nr_synchronous_requests ++;
14639 +       spin_unlock(&ent->guard);
14640 +       down(&rq.sem);
14641 +       /* don't release rq until wakeup_wbq stops using it: its callers hold
14642 +          ent->guard, so taking and dropping the guard waits for them. */
14643 +       spin_lock(&ent->guard);
14644 +       spin_unlock(&ent->guard);
14645 +       /* wbq was dequeued by the thread that woke us (not by the current thread). */
14646 +}
14647 +
14648 +/* caller holds ent->guard; the guard is dropped here when NULL is returned */
14649 +static struct wbq * get_wbq (entd_context * ent)
14650 +{
14651 +       if (!wbq_list_empty(&ent->wbq_list))
14652 +               return wbq_list_front(&ent->wbq_list);
14653 +       /* nothing queued: release the guard on behalf of the caller */
14654 +       spin_unlock(&ent->guard);
14655 +       return NULL;
14656 +}
14657 +
14658 +/* retire one pending ent request on account of @page and wake the oldest waiter, if any */
14659 +void ent_writes_page (struct super_block * sb, struct page * page)
14660 +{
14661 +       entd_context * ent = get_entd_context(sb);
14662 +       struct wbq * rq;
14663 +
14664 +       assert("zam-1041", ent != NULL);
14665 +       /* unlocked peek at the counter; it is re-checked under the guard below */
14666 +       if (PageActive(page) || ent->nr_all_requests == 0)
14667 +               return;
14668 +
14669 +       SetPageReclaim(page);
14670 +
14671 +       spin_lock(&ent->guard);
14672 +       if (ent->nr_all_requests > 0) {
14673 +               ent->nr_all_requests --;
14674 +               rq = get_wbq(ent);
14675 +               if (rq == NULL)
14676 +                       /* get_wbq() releases entd->guard spinlock if NULL is
14677 +                        * returned. */
14678 +                       return;
14679 +               wakeup_wbq(ent, rq);
14680 +       }
14681 +       spin_unlock(&ent->guard);
14682 +}
14683 +
14684 +int wbq_available (void)
14685 +{
14686 +       /* unlocked read of the pending request counter of the current super block */
14687 +       return get_entd_context(reiser4_get_current_sb())->nr_all_requests;
14688 +}
14689 +
14690 +/* Make Linus happy.
14691 +   Local variables:
14692 +   c-indentation-style: "K&R"
14693 +   mode-name: "LC"
14694 +   c-basic-offset: 8
14695 +   tab-width: 8
14696 +   fill-column: 80
14697 +   End:
14698 +*/
14699 diff -rupN linux-2.6.8-rc3/fs/reiser4/entd.h linux-2.6.8-rc3-a/fs/reiser4/entd.h
14700 --- linux-2.6.8-rc3/fs/reiser4/entd.h   1970-01-01 03:00:00.000000000 +0300
14701 +++ linux-2.6.8-rc3-a/fs/reiser4/entd.h 2004-08-05 21:20:53.041669285 +0400
14702 @@ -0,0 +1,83 @@
14703 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
14704 +
14705 +/* Ent daemon. */
14706 +
14707 +#ifndef __ENTD_H__
14708 +#define __ENTD_H__
14709 +
14710 +#include "kcond.h"
14711 +#include "context.h"
14712 +
14713 +#include <linux/fs.h>
14714 +#include <linux/completion.h>
14715 +#include <linux/spinlock.h>
14716 +#include <linux/sched.h>       /* for struct task_struct */
14717 +#include "type_safe_list.h"
14718 +
14719 +TYPE_SAFE_LIST_DECLARE(wbq);
14720 +
14721 +/* write-back request: one per writer that waits in write_page_by_ent(). */
14722 +struct wbq {
14723 +       wbq_list_link link;             /* linkage into entd_context.wbq_list */
14724 +       struct writeback_control * wbc; /* wbc passed in by the waiting writer */
14725 +       struct page * page;             /* page the request was made for */
14726 +       struct semaphore sem;           /* the writer sleeps on this */
14727 +       int    nr_entd_iters;           /* ent passes this request has waited through */
14728 +};
14729 +
14730 +/* ent-thread context. This is used to synchronize starting/stopping ent
14731 + * threads and to pass write-back requests to the ent thread. */
14732 +typedef struct entd_context {
14733 +       /*
14734 +        * condition variable that is signaled by ent thread after it
14735 +        * successfully started up.
14736 +        */
14737 +       kcond_t             startup;
14738 +       /*
14739 +        * completion that is signaled by ent thread just before it
14740 +        * terminates.
14741 +        */
14742 +       struct completion   finish;
14743 +       /*
14744 +        * condition variable that ent thread waits on for more work. It's
14745 +        * signaled by write_page_by_ent(), leave_flush(), done_entd_context().
14746 +        */
14747 +       kcond_t             wait;
14748 +       /* spinlock protecting other fields */
14749 +       spinlock_t          guard;
14750 +       /* ent thread */
14751 +       struct task_struct *tsk;
14752 +       /* set to indicate that ent thread should leave. */
14753 +       int                 done;
14754 +       /* counter of active flushers */
14755 +       int                 flushers;
14756 +#if REISER4_DEBUG
14757 +       /* list of all active flushers */
14758 +       flushers_list_head  flushers_list;
14759 +#endif
14760 +       int                 nr_all_requests;         /* pending requests, waiting or not */
14761 +       int                 nr_synchronous_requests; /* requests queued on wbq_list */
14762 +       wbq_list_head       wbq_list;                /* struct wbq of sleeping writers */
14763 +} entd_context;
14764 +
14765 +extern void init_entd_context(struct super_block *super); /* starts the ent thread */
14766 +extern void done_entd_context(struct super_block *super); /* stops it and waits */
14767 +
14768 +extern void enter_flush(struct super_block *super); /* register a flusher */
14769 +extern void leave_flush(struct super_block *super); /* unregister it */
14770 +
14771 +extern void write_page_by_ent(struct page *, struct writeback_control *);
14772 +extern int  wbq_available (void); /* pending request count of the current sb */
14773 +extern void ent_writes_page (struct super_block *, struct page *);
14774 +/* __ENTD_H__ */
14775 +#endif
14776 +
14777 +/* Make Linus happy.
14778 +   Local variables:
14779 +   c-indentation-style: "K&R"
14780 +   mode-name: "LC"
14781 +   c-basic-offset: 8
14782 +   tab-width: 8
14783 +   fill-column: 120
14784 +   End:
14785 +*/
14786 diff -rupN linux-2.6.8-rc3/fs/reiser4/eottl.c linux-2.6.8-rc3-a/fs/reiser4/eottl.c
14787 --- linux-2.6.8-rc3/fs/reiser4/eottl.c  1970-01-01 03:00:00.000000000 +0300
14788 +++ linux-2.6.8-rc3-a/fs/reiser4/eottl.c        2004-08-05 21:20:52.852709142 +0400
14789 @@ -0,0 +1,372 @@
14790 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14791 +
14792 +#include "forward.h"
14793 +#include "debug.h"
14794 +#include "key.h"
14795 +#include "coord.h"
14796 +#include "plugin/item/item.h"
14797 +#include "plugin/node/node.h"
14798 +#include "znode.h"
14799 +#include "block_alloc.h"
14800 +#include "tree_walk.h"
14801 +#include "tree_mod.h"
14802 +#include "carry.h"
14803 +#include "tree.h"
14804 +#include "super.h"
14805 +
14806 +#include <linux/types.h>       /* for __u??  */
14807 +
14808 +/* Extents on the twig level (EOTTL) handling.
14809 +
14810 +   EOTTL poses some problems to the tree traversal, that are better
14811 +   explained by example.
14812 +
14813 +   Suppose we have block B1 on the twig level with the following items:
14814 +
14815 +   0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id, offset)
14816 +   1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
14817 +   2. internal item I2 with key (10:0:0:0)
14818 +
14819 +   We are trying to insert item with key (5:0:0:0). Lookup finds node
14820 +   B1, and then intra-node lookup is done. This lookup finished on the
14821 +   E1, because the key we are looking for is larger than the key of E1
14822 +   and is smaller than key the of I2.
14823 +
14824 +   Here search is stuck.
14825 +
14826 +   After some thought it is clear what is wrong here: extents on the
14827 +   twig level break some basic property of the *search* tree (on the
14828 +   pretext, that they restore property of balanced tree).
14829 +
14830 +   Said property is the following: if in the internal node of the search
14831 +   tree we have [ ... Key1 Pointer Key2 ... ] then, all data that are or
14832 +   will be keyed in the tree with the Key such that Key1 <= Key < Key2
14833 +   are accessible through the Pointer.
14834 +
14835 +   This is not true, when Pointer is Extent-Pointer, simply because
14836 +   extent cannot expand indefinitely to the right to include any item
14837 +   with
14838 +
14839 +     Key1 <= Key <= Key2.
14840 +
14841 +   For example, our E1 extent is only responsible for the data with keys
14842 +
14843 +     (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
14844 +
14845 +   so, key range
14846 +
14847 +     ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
14848 +
14849 +   is orphaned: there is no way to get there from the tree root.
14850 +
14851 +   In other words, extent pointers are different than normal child
14852 +   pointers as far as search tree is concerned, and this creates such
14853 +   problems.
14854 +
14855 +   Possible solution for this problem is to insert our item into node
14856 +   pointed to by I2. There are some problems though:
14857 +
14858 +   (1) I2 can be in a different node.
14859 +   (2) E1 can be immediately followed by another extent E2.
14860 +
14861 +   (1) is solved by calling reiser4_get_right_neighbor() and accounting
14862 +   for locks/coords as necessary.
14863 +
14864 +   (2) is more complex. Solution here is to insert new empty leaf node
14865 +   and insert internal item between E1 and E2 pointing to said leaf
14866 +   node. This is further complicated by possibility that E2 is in a
14867 +   different node, etc.
14868 +
14869 +   Problems:
14870 +
14871 +   (1) if there was internal item I2 immediately on the right of an
14872 +   extent E1 and we decided to insert new item S1 into node N2
14873 +   pointed to by I2, then key of S1 will be less than smallest key in
14874 +   the N2. Normally, search checks that the key we are looking for is in
14875 +   the range of keys covered by the node the key is being looked in. To
14876 +   work around this situation, while preserving useful consistency check,
14877 +   new flag CBK_TRUST_DK was added to the cbk flags bitmask. This flag
14878 +   is automatically set on entrance to the coord_by_key() and is only
14879 +   cleared when we are about to enter situation described above.
14880 +
14881 +   (2) If extent E1 is immediately followed by another extent E2 and we
14882 +   are searching for the key that is between E1 and E2 we only have to
14883 +   insert new empty leaf node when coord_by_key was called for
14884 +   insertion, rather than just for lookup. To distinguish these cases,
14885 +   new flag CBK_FOR_INSERT was added to the cbk flags bitmask. This flag
14886 +   is automatically set by coord_by_key calls performed by
14887 +   insert_by_key() and friends.
14888 +
14889 +   (3) Insertion of new empty leaf node (possibly) requires
14890 +   balancing. In any case it requires modification of node content which
14891 +   is only possible under write lock. It may well happen that we only
14892 +   have read lock on the node where new internal pointer is to be
14893 +   inserted (common case: lookup of non-existent stat-data that falls
14894 +   between two extents). If only read lock is held, tree traversal is
14895 +   restarted with lock_level modified so that next time we hit this
14896 +   problem, write lock will be held. Once we have write lock, balancing
14897 +   will be performed.
14898 +
14899 +
14900 +
14901 +
14902 +
14903 +
14904 +*/
14905 +
14906 +/* look to the right of @coord within its own node. If there is a next item
14907 +   and it is of internal type, @coord is moved onto it and 1 is returned.
14908 +   Otherwise (next item is not internal, or @coord is the last item of the
14909 +   node) @coord is left untouched and 0 is returned.
14910 +*/
14911 +/* Audited by: green(2002.06.15) */
14912 +static int
14913 +is_next_item_internal(coord_t * coord)
14914 +{
14915 +       coord_t next;
14916 +
14917 +       if (coord->item_pos == node_num_items(coord->node) - 1)
14918 +               /* there is no next item in this node */
14919 +               return 0;
14920 +
14921 +       coord_dup(&next, coord);
14922 +       check_me("vs-742", coord_next_item(&next) == 0);
14923 +       if (!item_is_internal(&next))
14924 +               return 0;
14925 +       coord_dup(coord, &next);
14926 +       return 1;
14927 +}
14928 +
14929 +/* find the right delimiting key for an empty leaf that is to be inserted at
14930 +   @coord (which must be between items); it is stored in, and returned as, @key */
14931 +static reiser4_key *
14932 +rd_key(coord_t * coord, reiser4_key * key)
14933 +{
14934 +       coord_t right;
14935 +
14936 +       assert("nikita-2281", coord_is_between_items(coord));
14937 +       coord_dup(&right, coord);
14938 +
14939 +       RLOCK_DK(current_tree);
14940 +       if (coord_set_to_right(&right) != 0) {
14941 +               /* coord_set_to_right() failed: fall back to the right
14942 +                  delimiting key of @coord's own node */
14943 +               *key = *znode_get_rd_key(coord->node);
14944 +       } else {
14945 +               /* take the key of what is to the right of @coord */
14946 +               unit_key_by_coord(&right, key);
14947 +       }
14948 +       RUNLOCK_DK(current_tree);
14949 +       return key;
14950 +}
14951 +
14952 +
14953 +ON_DEBUG(void check_dkeys(const znode *);)
14954 +
14955 +/* this is used to insert empty node into leaf level if tree lookup can not go
14956 +   further down because it stopped between items of not internal type: a new
14957 +   leaf with left/right delimiting keys @key/@rdkey is created and an item
14958 +   pointing to it is inserted at @insert_coord. On success 0 is returned and
14959 +   @lh and @insert_coord refer to the new leaf; otherwise an error code. */
14960 +static int
14961 +add_empty_leaf(coord_t * insert_coord, lock_handle * lh, const reiser4_key * key, const reiser4_key * rdkey)
14962 +{
14963 +       int result;
14964 +       carry_pool pool;
14965 +       carry_level todo;
14966 +       carry_op *op;
14967 +       znode *node;
14968 +       reiser4_item_data item;
14969 +       carry_insert_data cdata;
14970 +       reiser4_tree *tree;
14971 +
14972 +       init_carry_pool(&pool);
14973 +       init_carry_level(&todo, &pool);
14974 +       ON_STATS(todo.level_no = TWIG_LEVEL);
14975 +       assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
14976 +
14977 +       tree = znode_get_tree(insert_coord->node);
14978 +       node = new_node(insert_coord->node, LEAF_LEVEL);
14979 +       if (IS_ERR(node)) {
14980 +               done_carry_pool(&pool); /* pairs with init_carry_pool() above */
14981 +               return PTR_ERR(node);
14982 +       }
14983 +
14984 +       /* setup delimiting keys for node being inserted */
14985 +       WLOCK_DK(tree);
14986 +       znode_set_ld_key(node, key);
14987 +       znode_set_rd_key(node, rdkey);
14988 +       ON_DEBUG(node->creator = current);
14989 +       ON_DEBUG(node->first_key = *key);
14990 +       WUNLOCK_DK(tree);
14991 +
14992 +       ZF_SET(node, JNODE_ORPHAN);
14993 +       op = post_carry(&todo, COP_INSERT, insert_coord->node, 0);
14994 +       if (!IS_ERR(op)) {
14995 +               cdata.coord = insert_coord;
14996 +               cdata.key = key;
14997 +               cdata.data = &item;
14998 +               op->u.insert.d = &cdata;
14999 +               op->u.insert.type = COPT_ITEM_DATA;
15000 +               build_child_ptr_data(node, &item);
15001 +               item.arg = NULL;
15002 +               /* have @insert_coord to be set at inserted item after
15003 +                  insertion is done */
15004 +               todo.track_type = CARRY_TRACK_CHANGE;
15005 +               todo.tracked = lh;
15006 +
15007 +               result = carry(&todo, 0);
15008 +               if (result == 0) {
15009 +                       /* pin node in memory. This is necessary for
15010 +                        * znode_make_dirty() below. */
15011 +                       result = zload(node);
15012 +                       if (result == 0) {
15013 +                               lock_handle local_lh;
15014 +
15015 +                               /* if we inserted new child into tree we have
15016 +                                * to mark it dirty so that flush will be able
15017 +                                * to process it. */
15018 +                               init_lh(&local_lh);
15019 +                               result = longterm_lock_znode(&local_lh, node,
15020 +                                                            ZNODE_WRITE_LOCK,
15021 +                                                            ZNODE_LOCK_LOPRI);
15022 +                               if (result == 0) {
15023 +                                       znode_make_dirty(node);
15024 +
15025 +                                       /* when internal item pointing to @node
15026 +                                          was inserted into twig node
15027 +                                          create_hook_internal did not connect
15028 +                                          it properly because its right
15029 +                                          neighbor was not known. Do it
15030 +                                          here */
15031 +                                       WLOCK_TREE(tree);
15032 +                                       assert("nikita-3312", znode_is_right_connected(node));
15033 +                                       assert("nikita-2984", node->right == NULL);
15034 +                                       ZF_CLR(node, JNODE_RIGHT_CONNECTED);
15035 +                                       WUNLOCK_TREE(tree);
15036 +                                       result = connect_znode(insert_coord, node);
15037 +                                       if (result == 0)
15038 +                                               ON_DEBUG(check_dkeys(node));
15039 +
15040 +                                       done_lh(lh);
15041 +                                       move_lh(lh, &local_lh);
15042 +                                       assert("vs-1676", node_is_empty(node));
15043 +                                       coord_init_first_unit(insert_coord, node);
15044 +                               } else {
15045 +                                       warning("nikita-3136",
15046 +                                               "Cannot lock child");
15047 +                                       print_znode("child", node);
15048 +                               }
15049 +                               done_lh(&local_lh);
15050 +                               zrelse(node);
15051 +                       }
15052 +               }
15053 +       } else
15054 +               result = PTR_ERR(op);
15055 +       zput(node);
15056 +       done_carry_pool(&pool);
15057 +       return result;
15058 +}
15059 +
15060 +/* handle extent-on-the-twig-level cases in tree traversal: returns 0 to descend further, 1 when @outcome was set */
15061 +reiser4_internal int
15062 +handle_eottl(cbk_handle * h /* cbk handle */ ,
15063 +            int *outcome /* in: node lookup result; out: how traversal should proceed */ )
15064 +{
15065 +       int result;
15066 +       reiser4_key key;
15067 +       coord_t *coord;
15068 +
15069 +       coord = h->coord;
15070 +
15071 +       if (h->level != TWIG_LEVEL || (coord_is_existing_item(coord) && item_is_internal(coord))) {
15072 +               /* Continue to traverse tree downward. */
15073 +               return 0;
15074 +       }
15075 +       /* strange item type found on non-stop level?!  Twig
15076 +          horrors? */
15077 +       assert("vs-356", h->level == TWIG_LEVEL);
15078 +       assert("vs-357", ( {
15079 +                         coord_t lcoord;
15080 +                         coord_dup(&lcoord, coord);
15081 +                         check_me("vs-733", coord_set_to_left(&lcoord) == 0);
15082 +                         item_is_extent(&lcoord);}
15083 +              ));
15084 +
15085 +       if (*outcome == NS_FOUND) {
15086 +               /* we have found desired key on twig level in extent item */
15087 +               h->result = CBK_COORD_FOUND;
15088 +               reiser4_stat_inc(tree.cbk_found);
15089 +               *outcome = LOOKUP_DONE;
15090 +               return 1;
15091 +       }
15092 +
15093 +       if (!(h->flags & CBK_FOR_INSERT)) {
15094 +               /* tree traversal is not for insertion. Just return
15095 +                  CBK_COORD_NOTFOUND. */
15096 +               h->result = CBK_COORD_NOTFOUND;
15097 +               *outcome = LOOKUP_DONE;
15098 +               return 1;
15099 +       }
15100 +
15101 +       /* take a look at the item to the right of h -> coord */
15102 +       result = is_next_item_internal(coord);
15103 +       if (result == 0) {
15104 +               /* item to the right is not internal (or h -> coord is the last
15105 +                  item of its node). Allocate a new node and insert pointer to it.
15106 +
15107 +                  This is a result of extents being located at the twig
15108 +                  level. For explanation, see comment just above
15109 +                  is_next_item_internal().
15110 +               */
15111 +               if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
15112 +                       /* we got node read locked, restart coord_by_key to
15113 +                          have write lock on twig level */
15114 +                       h->lock_level = TWIG_LEVEL;
15115 +                       h->lock_mode = ZNODE_WRITE_LOCK;
15116 +                       *outcome = LOOKUP_REST;
15117 +                       return 1;
15118 +               }
15119 +
15120 +               result = add_empty_leaf(coord, h->active_lh, h->key, rd_key(coord, &key));
15121 +               if (result) {
15122 +                       h->error = "could not add empty leaf";
15123 +                       h->result = result;
15124 +                       *outcome = LOOKUP_DONE;
15125 +                       return 1;
15126 +               }
15127 +               /* added empty leaf is locked, its parent node is unlocked,
15128 +                  coord is set as EMPTY */
15129 +               *outcome = LOOKUP_DONE;
15130 +               h->result = CBK_COORD_NOTFOUND;
15131 +               return 1;
15132 +               /*assert("vs-358", keyeq(h->key, item_key_by_coord(coord, &key)));*/
15133 +       } else {
15134 +               /* this is special case mentioned in the comment on
15135 +                  tree.h:cbk_flags. We have found internal item immediately
15136 +                  on the right of extent, and we are going to insert new item
15137 +                  there. Key of item we are going to insert is smaller than
15138 +                  leftmost key in the node pointed to by said internal item
15139 +                  (otherwise search wouldn't come to the extent in the first
15140 +                  place).
15141 +
15142 +                  This is a result of extents being located at the twig
15143 +                  level. For explanation, see comment just above
15144 +                  is_next_item_internal().
15145 +               */
15146 +               h->flags &= ~CBK_TRUST_DK;
15147 +       }
15148 +       assert("vs-362", item_is_internal(coord));
15149 +       return 0;
15150 +}
15151 +
15152 +/* Make Linus happy.
15153 +   Local variables:
15154 +   c-indentation-style: "K&R"
15155 +   mode-name: "LC"
15156 +   c-basic-offset: 8
15157 +   tab-width: 8
15158 +   fill-column: 120
15159 +   scroll-step: 1
15160 +   End:
15161 +*/
15162 diff -rupN linux-2.6.8-rc3/fs/reiser4/estimate.c linux-2.6.8-rc3-a/fs/reiser4/estimate.c
15163 --- linux-2.6.8-rc3/fs/reiser4/estimate.c       1970-01-01 03:00:00.000000000 +0300
15164 +++ linux-2.6.8-rc3-a/fs/reiser4/estimate.c     2004-08-05 21:20:52.767727067 +0400
15165 @@ -0,0 +1,107 @@
15166 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
15167 +
15168 +#include "debug.h"
15169 +#include "dformat.h"
15170 +#include "tree.h"
15171 +#include "carry.h"
15172 +#include "inode.h"
15173 +#include "cluster.h"
15174 +#include "plugin/item/ctail.h"
15175 +
15176 +/* Returns how many nodes might get dirty or be added on internal levels if @children nodes are dirtied.
15177 +
15178 +   The estimate is a share of the children (103/1024 of them, i.e. about 10%) plus 1 balancing. 1 balancing
15179 +   is 2 neighbours, 2 new blocks and the current block on the leaf level, 2 neighbour nodes + the current (or 1
15180 +   neighbour and 1 new and the current) on twig level, 2 neighbour nodes on upper levels and 1 for a new root. So 5 for
15181 +   leaf level, 3 for twig level, 2 on upper + 1 for root.
15182 +
15183 +   The current node of the lowest level is not counted here - this is overhead only.
15184 +
15185 +   @children is almost always 1 here. The exception is flow insertion.
15186 +*/
15187 +static reiser4_block_nr
15188 +max_balance_overhead(reiser4_block_nr children, tree_level tree_height)
15189 +{
15190 +       reiser4_block_nr overhead = 4 + ((103 * children) >> 10);
15191 +
15192 +       /* If too many balancings happen at a time, the tree height can grow by more than 1, so trees lower
15193 +          than 5 are costed as if their height were 5. A tree of height 5 or more is assumed to grow by 1 only. */
15194 +       if (tree_height < 5)
15195 +               return 5 * 2 + overhead;
15196 +       return tree_height * 2 + overhead;
15197 +}
15198 +
15199 +/* Maximal possible number of nodes which can be modified, plus the number of new nodes which can be required, to
15200 +   insert one item into a tree of height @height: 1 plus max_balance_overhead() for a single child. */
15201 +/* it is only called when tree height changes, or gets initialized */
15202 +reiser4_internal reiser4_block_nr
15203 +calc_estimate_one_insert(tree_level height)
15204 +{
15205 +       return max_balance_overhead(1, height) + 1;
15206 +}
15207 +
15208 +/* non-static entry point to max_balance_overhead(): overhead for @children dirtied nodes in a tree of @tree_height */
15209 +reiser4_internal reiser4_block_nr estimate_internal_amount(reiser4_block_nr children, tree_level tree_height)
15210 +{
15211 +       return max_balance_overhead(children, tree_height);
15212 +}
15213 +
15214 +/* estimate for insertion of one item: returns the value stored in @tree->estimate_one_insert */
15215 +reiser4_internal reiser4_block_nr estimate_one_insert_item(reiser4_tree *tree)
15216 +{
15217 +       return tree->estimate_one_insert;
15218 +}
15219 +
15220 +/* Maximal possible number of nodes which can be modified, plus the number of new nodes which can be required, to
15221 +   insert one unit into an item in @tree. */
15222 +reiser4_internal reiser4_block_nr
15223 +estimate_one_insert_into_item(reiser4_tree *tree)
15224 +{
15225 +       /* insertion of a unit into an item is estimated exactly like insertion of an item */
15226 +       return estimate_one_insert_item(tree);
15227 +}
15228 +
15229 +reiser4_internal reiser4_block_nr
15230 +estimate_one_item_removal(reiser4_tree *tree)
15231 +{
15232 +       /* on item removal reiser4 does not try to pack nodes more compactly, so only one node may be dirtied on
15233 +          the leaf level; the estimate for insertion of one item is reused here */
15234 +       return estimate_one_insert_item(tree);
15235 +}
15236 +
15237 +/* On leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes (insert point and
15238 +   both its neighbors). max_balance_overhead() estimates blocks which may change or get added on internal levels. */
15239 +reiser4_internal reiser4_block_nr
15240 +estimate_insert_flow(tree_level height)
15241 +{
15242 +       const reiser4_block_nr leaves = 3 + CARRY_FLOW_NEW_NODES_LIMIT;
15243 +       return leaves + max_balance_overhead(leaves, height);
15244 +}
15245 +
15246 +/* returns the maximal number of nodes a disk cluster of @inode can occupy: its cluster pages plus 2 */
15247 +reiser4_internal reiser4_block_nr
15248 +estimate_disk_cluster(struct inode * inode)
15249 +{
15250 +       return inode_cluster_pages(inode) + 2;
15251 +}
15252 +
15253 +/* how many nodes might get dirty or be added when a disk cluster is inserted; an @unprepped cluster counts as 1 node */
15254 +reiser4_internal reiser4_block_nr
15255 +estimate_insert_cluster(struct inode * inode, int unprepped)
15256 +{
15257 +       int per_cluster = 1;
15258 +       if (!unprepped)
15259 +               per_cluster = inode_cluster_pages(inode);
15260 +       return 3 + per_cluster + max_balance_overhead(3 + per_cluster, REISER4_MAX_ZTREE_HEIGHT);
15261 +}
15262 +
15263 +/* Make Linus happy.
15264 +   Local variables:
15265 +   c-indentation-style: "K&R"
15266 +   mode-name: "LC"
15267 +   c-basic-offset: 8
15268 +   tab-width: 8
15269 +   fill-column: 120
15270 +   scroll-step: 1
15271 +   End:
15272 +*/
15273 diff -rupN linux-2.6.8-rc3/fs/reiser4/file_ops.c linux-2.6.8-rc3-a/fs/reiser4/file_ops.c
15274 --- linux-2.6.8-rc3/fs/reiser4/file_ops.c       1970-01-01 03:00:00.000000000 +0300
15275 +++ linux-2.6.8-rc3-a/fs/reiser4/file_ops.c     2004-08-05 21:20:52.998678353 +0400
15276 @@ -0,0 +1,458 @@
15277 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
15278 +
15279 +/*
15280 + * Interface to VFS. Reiser4 file_operations are defined here.
15281 + *
15282 + * This file contains definitions of functions that are installed into ->i_fop
15283 + * field of reiser4 inodes.
15284 + *
15285 + * By the most part these functions simply find object plugin of inode
15286 + * involved, and call appropriate plugin method to do the actual work.
15287 + */
15288 +
15289 +#include "forward.h"
15290 +#include "debug.h"
15291 +#include "dformat.h"
15292 +#include "coord.h"
15293 +#include "plugin/item/item.h"
15294 +#include "plugin/file/file.h"
15295 +#include "plugin/security/perm.h"
15296 +#include "plugin/disk_format/disk_format.h"
15297 +#include "plugin/plugin.h"
15298 +#include "plugin/plugin_set.h"
15299 +#include "plugin/object.h"
15300 +#include "txnmgr.h"
15301 +#include "jnode.h"
15302 +#include "znode.h"
15303 +#include "block_alloc.h"
15304 +#include "tree.h"
15305 +#include "log.h"
15306 +#include "vfs_ops.h"
15307 +#include "inode.h"
15308 +#include "page_cache.h"
15309 +#include "ktxnmgrd.h"
15310 +#include "super.h"
15311 +#include "reiser4.h"
15312 +#include "kattr.h"
15313 +#include "entd.h"
15314 +#include "emergency_flush.h"
15315 +
15316 +#include <linux/profile.h>
15317 +#include <linux/types.h>
15318 +#include <linux/mount.h>
15319 +#include <linux/vfs.h>
15320 +#include <linux/mm.h>
15321 +#include <linux/buffer_head.h>
15322 +#include <linux/dcache.h>
15323 +#include <linux/list.h>
15324 +#include <linux/pagemap.h>
15325 +#include <linux/slab.h>
15326 +#include <linux/seq_file.h>
15327 +#include <linux/init.h>
15328 +#include <linux/module.h>
15329 +#include <linux/writeback.h>
15330 +#include <linux/mpage.h>
15331 +#include <linux/backing-dev.h>
15332 +#include <linux/quotaops.h>
15333 +#include <linux/security.h>
15334 +
15335 +
15336 +/* file operations */
15337 +
15338 +static loff_t reiser4_llseek(struct file *, loff_t, int);
15339 +static ssize_t reiser4_read(struct file *, char *, size_t, loff_t *);
15340 +static ssize_t reiser4_write(struct file *, const char *, size_t, loff_t *);
15341 +static int reiser4_readdir(struct file *, void *, filldir_t);
15342 +static int reiser4_ioctl(struct inode *, struct file *, unsigned int cmd, unsigned long arg);
15343 +static int reiser4_mmap(struct file *, struct vm_area_struct *);
15344 +static int reiser4_release(struct inode *, struct file *);
15345 +static int reiser4_fsync(struct file *, struct dentry *, int datasync);
15346 +static int reiser4_open(struct inode *, struct file *);
15347 +static ssize_t reiser4_sendfile(struct file *, loff_t *, size_t, read_actor_t, void __user *);
15348 +
15349 +#if 0
15350 +static unsigned int reiser4_poll(struct file *, struct poll_table_struct *);
15351 +static int reiser4_flush(struct file *);
15352 +static int reiser4_fasync(int, struct file *, int);
15353 +static int reiser4_lock(struct file *, int, struct file_lock *);
15354 +static ssize_t reiser4_readv(struct file *, const struct iovec *, unsigned long, loff_t *);
15355 +static ssize_t reiser4_writev(struct file *, const struct iovec *, unsigned long, loff_t *);
15356 +static ssize_t reiser4_sendpage(struct file *, struct page *, int, size_t, loff_t *, int);
15357 +static unsigned long reiser4_get_unmapped_area(struct file *, unsigned long,
15358 +                                              unsigned long, unsigned long, unsigned long);
15359 +#endif
15360 +
15361 +/*
15362 + * reiser4 ->llseek(): delegates to the ->seek() method of the inode's file
15363 + * plugin, or to generic_file_llseek() when the plugin does not define one.
15364 + */
15365 +static loff_t
15366 +reiser4_llseek(struct file *file, loff_t off, int origin)
15367 +{
15368 +       reiser4_context ctx;
15369 +       struct inode *inode;
15370 +       file_plugin *fplug;
15371 +       loff_t new_pos;
15372 +
15373 +       inode = file->f_dentry->d_inode;
15374 +       init_context(&ctx, inode->i_sb);
15375 +       reiser4_stat_inc(vfs_calls.llseek);
15376 +
15377 +       ON_TRACE(TRACE_VFS_OPS,
15378 +                "llseek: (i_ino %li, size %lld): off %lli, origin %d\n", inode->i_ino, inode->i_size, off, origin);
15379 +
15380 +       fplug = inode_file_plugin(inode);
15381 +       assert("nikita-2291", fplug != NULL);
15382 +       new_pos = fplug->seek ? fplug->seek(file, off, origin) : generic_file_llseek(file, off, origin);
15383 +       /* the context set up above is left before the result is returned */
15384 +       reiser4_exit_context(&ctx);
15385 +       return new_pos;
15386 +}
15387 +
15388 +/* reiser4_readdir() - our readdir() method.
15389 +
15390 +   readdir(2)/getdents(2) interface is based on implicit assumption that
15391 +   readdir can be restarted from any particular point by supplying file
15392 +   system with off_t-full of data. That is, file system fill ->d_off
15393 +   field in struct dirent and later user passes ->d_off to the
15394 +   seekdir(3), which is, actually, implemented by glibc as lseek(2) on
15395 +   directory.
15396 +
15397 +   Reiser4 cannot restart readdir from 64 bits of data, because the last two
15398 +   components of the key of a directory entry are unknown, and together they
15399 +   take 128 bits: the locality and type fields of a directory entry key are
15400 +   always known, but to start readdir() from a given point the objectid and
15401 +   offset fields have to be filled in.
15402 +
15403 +   See plugin/dir/dir.c:readdir_common() for the details of our solution.
15404 +*/
15405 +static int
15406 +reiser4_readdir(struct file *f /* directory file being read */ ,
15407 +               void *dirent /* opaque data passed to us by VFS */ ,
15408 +               filldir_t filldir       /* filler function passed to us
15409 +                                        * by VFS */ )
15410 +{
15411 +       reiser4_context ctx;
15412 +       struct inode *dir;
15413 +       dir_plugin *dplug;
15414 +       int ret;
15415 +
15416 +       dir = f->f_dentry->d_inode;
15417 +       init_context(&ctx, dir->i_sb);
15418 +       write_syscall_log("%s", f->f_dentry->d_name.name);
15419 +       reiser4_stat_inc(vfs_calls.readdir);
15420 +
15421 +       /* the actual work is done by ->readdir() of the directory plugin, if there is one */
15422 +       dplug = inode_dir_plugin(dir);
15423 +       if (dplug == NULL || dplug->readdir == NULL)
15424 +               ret = RETERR(-ENOTDIR);
15425 +       else
15426 +               ret = dplug->readdir(f, dirent, filldir);
15427 +
15428 +       /* directory st_atime is updated by callers (if necessary),
15429 +          so nothing is done about it here */
15430 +       write_syscall_log("ex");
15431 +       context_set_commit_async(&ctx);
15432 +       reiser4_exit_context(&ctx);
15433 +       return ret;
15434 +}
15435 +
15436 +/*
15437 +  reiser4_ioctl - ->ioctl() file operation: passes the command to ->ioctl() of the inode's file plugin
15438 +*/
15439 +static int
15440 +reiser4_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg)
15441 +{
15442 +       reiser4_context ctx;
15443 +       int result = -ENOSYS;
15444 +
15445 +       init_context(&ctx, inode->i_sb);
15446 +       write_syscall_log("%s", filp->f_dentry->d_name.name);
15447 +       reiser4_stat_inc(vfs_calls.ioctl);
15448 +
15449 +       /* result stays -ENOSYS when the file plugin
15450 +          does not define an ->ioctl() method */
15451 +       if (inode_file_plugin(inode)->ioctl != NULL)
15452 +               result = inode_file_plugin(inode)->ioctl(inode, filp, cmd, arg);
15453 +
15454 +       write_syscall_log("ex");
15455 +       reiser4_exit_context(&ctx);
15456 +       return result;
15457 +}
15458 +
15459 +/* ->mmap() VFS method in reiser4 file_operations: calls ->mmap() of the file plugin of the inode behind @file */
15460 +static int
15461 +reiser4_mmap(struct file *file, struct vm_area_struct *vma)
15462 +{
15463 +       struct inode *inode = file->f_dentry->d_inode;
15464 +       reiser4_context ctx;
15465 +       int result;
15466 +
15467 +       init_context(&ctx, inode->i_sb);
15468 +       write_syscall_log("%s", file->f_dentry->d_name.name);
15469 +       reiser4_stat_inc(vfs_calls.mmap);
15470 +
15471 +       ON_TRACE(TRACE_VFS_OPS, "MMAP: (i_ino %lli, size %lld)\n",
15472 +                get_inode_oid(inode), inode->i_size);
15473 +
15474 +       /* there is no fallback here: the file plugin is
15475 +          asserted to define the ->mmap() method */
15476 +       assert("nikita-2936", inode_file_plugin(inode)->mmap != NULL);
15477 +       result = inode_file_plugin(inode)->mmap(file, vma);
15478 +       write_syscall_log("ex");
15479 +       reiser4_exit_context(&ctx);
15480 +       return result;
15481 +}
15482 +
15483 +/* reiser4 implementation of ->read() VFS method, member of reiser4 struct file_operations
15484 +
15485 + Reads some part of a file from the filesystem into the user space buffer.
15486 +
15487 + Checks permissions, then gets the file plugin of the inode and calls its ->read() method, which does everything
15488 + except some initialization. Returns what perm_chk() returned if that is non-zero, the plugin's result otherwise.
15489 +*/
15490 +static ssize_t
15491 +reiser4_read(struct file *file /* file to read from */ ,
15492 +            char *buf          /* user-space buffer to put data read
15493 +                                * from the file */ ,
15494 +            size_t count /* bytes to read */ ,
15495 +            loff_t * off       /* current position within the file, which needs to be increased by the act of reading. Reads
15496 +                                * start from here. */ )
15497 +{
15498 +       ssize_t result;
15499 +       struct inode *inode;
15500 +       reiser4_context ctx;
15501 +
15502 +       assert("umka-072", file != NULL);
15503 +       assert("umka-073", buf != NULL);
15504 +       assert("umka-074", off != NULL);
15505 +
15506 +       inode = file->f_dentry->d_inode;
15507 +       init_context(&ctx, inode->i_sb);
15508 +       write_syscall_log("%s", file->f_dentry->d_name.name);
15509 +       reiser4_stat_inc(vfs_calls.read);
15510 +
15511 +       ON_TRACE(TRACE_VFS_OPS,
15512 +                "READ: (i_ino %li, size %lld): %zu bytes from pos %lli\n",
15513 +                inode->i_ino, inode->i_size, count, *off);
15514 +
15515 +       result = perm_chk(inode, read, file, buf, count, off);
15516 +       if (likely(result == 0)) {
15517 +               file_plugin *fplug;
15518 +
15519 +               fplug = inode_file_plugin(inode);
15520 +               assert("nikita-417", fplug != NULL);
15521 +               assert("nikita-2935", fplug->read != NULL);
15522 +
15523 +               /* unix_file_read is one method that might be invoked below */
15524 +               result = fplug->read(file, buf, count, off);
15525 +       }
15526 +       write_syscall_log("ex");
15527 +       reiser4_exit_context(&ctx);
15528 +       return result;
15529 +}
15530 +
15531 +/* ->write() VFS method in reiser4 file_operations: checks permissions, then calls ->write() of the file plugin */
15532 +static ssize_t
15533 +reiser4_write(struct file *file /* file to write on */ ,
15534 +             const char *buf   /* user-space buffer to get data
15535 +                                * to write into the file */ ,
15536 +             size_t size /* bytes to write */ ,
15537 +             loff_t * off      /* offset to start writing
15538 +                                * from. This is updated to indicate
15539 +                                * actual number of bytes written */ )
15540 +{
15541 +       struct inode *inode;
15542 +       ssize_t result;
15543 +       reiser4_context ctx;
15544 +
15545 +       assert("nikita-1421", file != NULL);
15546 +       assert("nikita-1422", buf != NULL);
15547 +       assert("nikita-1424", off != NULL);
15548 +
15549 +       inode = file->f_dentry->d_inode;
15550 +       init_context(&ctx, inode->i_sb);
15551 +       write_syscall_log("%s", file->f_dentry->d_name.name);
15552 +       reiser4_stat_inc(vfs_calls.write);
15553 +
15554 +       ON_TRACE(TRACE_VFS_OPS,
15555 +                "WRITE: (i_ino %li, size %lld): %zu bytes to pos %lli\n", inode->i_ino, inode->i_size, size, *off);
15556 +
15557 +       result = perm_chk(inode, write, file, buf, size, off);
15558 +       if (likely(result == 0)) {
15559 +               file_plugin *fplug;
15560 +
15561 +               fplug = inode_file_plugin(inode);
15562 +               assert("nikita-2934", fplug->write != NULL);
15563 +
15564 +               result = fplug->write(file, buf, size, off);
15565 +       }
15566 +       write_syscall_log("ex");
15567 +       context_set_commit_async(&ctx);
15568 +       reiser4_exit_context(&ctx);
15569 +       return result;
15570 +}
15571 +
15572 +/* f_op->release() method of reiser4: releases reiser4 file. Called when last
15573 +   holder closes a file */
15574 +static int
15575 +reiser4_release(struct inode *i /* inode released */ ,
15576 +               struct file *f /* file released */ )
15577 +{
15578 +       reiser4_context ctx;
15579 +       file_plugin *fplug;
15580 +       int result;
15581 +
15582 +       assert("umka-081", i != NULL);
15583 +       assert("nikita-1447", f != NULL);
15584 +
15585 +       init_context(&ctx, i->i_sb);
15586 +       fplug = inode_file_plugin(i);
15587 +       assert("umka-082", fplug != NULL);
15588 +
15589 +       ON_TRACE(TRACE_VFS_OPS,
15590 +                "RELEASE: (i_ino %li, size %lld)\n", i->i_ino, i->i_size);
15591 +
15592 +       /*
15593 +         Result is 0 if no ->release method is defined, or if we are within
15594 +         reiser4 context already (current context is not the one set up
15595 +         above). How latter is possible? Simple:
15596 +
15597 +         (gdb) bt
15598 +         #0  get_exclusive_access ()
15599 +         #2  0xc01e56d3 in release_unix_file ()
15600 +         #3  0xc01c3643 in reiser4_release ()
15601 +         #4  0xc014cae0 in __fput ()
15602 +         #5  0xc013ffc3 in remove_vm_struct ()
15603 +         #6  0xc0141786 in exit_mmap ()
15604 +         #7  0xc0118480 in mmput ()
15605 +         #8  0xc0133205 in oom_kill ()
15606 +         #9  0xc01332d1 in out_of_memory ()
15607 +         #10 0xc013bc1d in try_to_free_pages ()
15608 +         #11 0xc013427b in __alloc_pages ()
15609 +         #12 0xc013f058 in do_anonymous_page ()
15610 +         #13 0xc013f19d in do_no_page ()
15611 +         #14 0xc013f60e in handle_mm_fault ()
15612 +         #15 0xc01131e5 in do_page_fault ()
15613 +         #16 0xc0104935 in error_code ()
15614 +         #17 0xc025c0c6 in __copy_to_user_ll ()
15615 +         #18 0xc01d496f in read_tail ()
15616 +         #19 0xc01e4def in read_unix_file ()
15617 +         #20 0xc01c3504 in reiser4_read ()
15618 +         #21 0xc014bd4f in vfs_read ()
15619 +         #22 0xc014bf66 in sys_read ()
15620 +       */
15621 +       result = 0;
15622 +       if (fplug->release != NULL && get_current_context() == &ctx)
15623 +               result = fplug->release(i, f);
15624 +
15625 +       reiser4_free_file_fsdata(f);
15626 +
15627 +       reiser4_exit_context(&ctx);
15628 +       return result;
15629 +}
15630 +
15631 +/*
15632 + * ->open file operation for reiser4. This method is optional: it is only
15633 + * present for mounts that support pseudo files. When "nopseudo" mount option
15634 + * is used, this method is zeroed, which speeds open(2) system call a bit.
15635 + */
15636 +static int
15637 +reiser4_open(struct inode * inode, struct file * file)
15638 +{
15639 +       reiser4_context ctx;
15640 +       file_plugin *fplug;
15641 +       int result;
15642 +
15643 +       init_context(&ctx, inode->i_sb);
15644 +       reiser4_stat_inc(vfs_calls.open);
15645 +
15646 +       /* a file plugin without an ->open() method makes this return 0 */
15647 +       fplug = inode_file_plugin(inode);
15648 +       if (fplug->open == NULL)
15649 +               result = 0;
15650 +       else
15651 +               result = fplug->open(inode, file);
15652 +
15653 +       reiser4_exit_context(&ctx);
15654 +       return result;
15655 +}
15656 +
15657 +/* ->fsync file operation for reiser4: calls ->sync() of the file plugin of @dentry's inode; @file is not used */
15658 +static int
15659 +reiser4_fsync(struct file *file, struct dentry *dentry, int datasync)
15660 +{
15661 +       struct inode *inode = dentry->d_inode;
15662 +       file_plugin *fplug;
15663 +       reiser4_context ctx;
15664 +       int result = 0;
15665 +
15666 +       init_context(&ctx, inode->i_sb);
15667 +       fplug = inode_file_plugin(inode);
15668 +
15669 +       /* result stays 0 when the file plugin has no ->sync() method */
15670 +       if (fplug->sync != NULL)
15671 +               result = fplug->sync(inode, datasync);
15672 +
15673 +       context_set_commit_async(&ctx);
15674 +       reiser4_exit_context(&ctx);
15675 +       return result;
15676 +}
15677 +
15678 +/* ->sendfile(): reads @count bytes from @file and calls @actor for every read page (needed for loop back
15679 +   devices support). The work is done by ->sendfile() of the file plugin; without one, -EINVAL is returned. */
15680 +static ssize_t reiser4_sendfile(struct file *file, loff_t *ppos,
15681 +                               size_t count, read_actor_t actor,
15682 +                               void __user *target)
15683 +{
15684 +       ssize_t result; /* same type as the return value, so nothing is truncated */
15685 +       file_plugin *fplug;
15686 +       reiser4_context ctx;
15687 +       struct inode *inode;
15688 +
15689 +       inode = file->f_dentry->d_inode;
15690 +       init_context(&ctx, inode->i_sb);
15691 +
15692 +       fplug = inode_file_plugin(inode);
15693 +
15694 +       if (fplug->sendfile != NULL)
15695 +               result = fplug->sendfile(file, ppos, count, actor, target);
15696 +       else
15697 +               result = RETERR(-EINVAL);
15698 +
15699 +       reiser4_exit_context(&ctx);
15700 +       return result;
15701 +}
15702 +
15703 +/* file operations of reiser4 defined in this file; entries in comments have no implementation (see #if 0 above) */
15704 +struct file_operations reiser4_file_operations = {
15705 +       .llseek = reiser4_llseek,       /* d */
15706 +       .read = reiser4_read,           /* d */
15707 +       .write = reiser4_write,         /* d */
15708 +       .readdir = reiser4_readdir,     /* d */
15709 +/*     .poll = reiser4_poll, */
15710 +       .ioctl = reiser4_ioctl,
15711 +       .mmap = reiser4_mmap,           /* d */
15712 +       .open = reiser4_open,
15713 +/*     .flush = reiser4_flush, */
15714 +       .release = reiser4_release,     /* d */
15715 +       .fsync = reiser4_fsync,         /* d */
15716 +       .sendfile = reiser4_sendfile,
15717 +/*     .fasync = reiser4_fasync, */
15718 +/*     .lock = reiser4_lock, */
15719 +/*     .readv = reiser4_readv, */
15720 +/*     .writev = reiser4_writev, */
15721 +/*     .sendpage = reiser4_sendpage, */
15722 +/*     .get_unmapped_area = reiser4_get_unmapped_area */
15723 +};
15724 +
15725 +
15726 +/* Make Linus happy.
15727 +   Local variables:
15728 +   c-indentation-style: "K&R"
15729 +   mode-name: "LC"
15730 +   c-basic-offset: 8
15731 +   tab-width: 8
15732 +   fill-column: 120
15733 +   End:
15734 +*/
15735 diff -rupN linux-2.6.8-rc3/fs/reiser4/flush.c linux-2.6.8-rc3-a/fs/reiser4/flush.c
15736 --- linux-2.6.8-rc3/fs/reiser4/flush.c  1970-01-01 03:00:00.000000000 +0300
15737 +++ linux-2.6.8-rc3-a/fs/reiser4/flush.c        2004-08-05 21:20:53.241627109 +0400
15738 @@ -0,0 +1,3832 @@
15739 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
15740 +
15741 +/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
15742 +
15743 +#include "forward.h"
15744 +#include "debug.h"
15745 +#include "dformat.h"
15746 +#include "key.h"
15747 +#include "coord.h"
15748 +#include "type_safe_list.h"
15749 +#include "plugin/item/item.h"
15750 +#include "plugin/plugin.h"
15751 +#include "plugin/object.h"
15752 +#include "txnmgr.h"
15753 +#include "jnode.h"
15754 +#include "znode.h"
15755 +#include "block_alloc.h"
15756 +#include "tree_walk.h"
15757 +#include "carry.h"
15758 +#include "tree.h"
15759 +#include "vfs_ops.h"
15760 +#include "inode.h"
15761 +#include "page_cache.h"
15762 +#include "wander.h"
15763 +#include "super.h"
15764 +#include "log.h"
15765 +#include "entd.h"
15766 +#include "reiser4.h"
15767 +#include "prof.h"
15768 +#include "flush.h"
15769 +#include "writeout.h"
15770 +
15771 +#include <asm/atomic.h>
15772 +#include <linux/fs.h>          /* for struct super_block  */
15773 +#include <linux/mm.h>          /* for struct page */
15774 +#include <linux/bio.h>         /* for struct bio */
15775 +#include <linux/pagemap.h>
15776 +#include <linux/blkdev.h>
15777 +
15778 +/* IMPLEMENTATION NOTES */
15779 +
15780 +/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total
15781 +   order to the nodes of the tree in which the parent is placed before its children, which
15782 +   are ordered (recursively) in left-to-right order.  When we speak of a "parent-first preceder", it
15783 +   describes the node that "came before in forward parent-first order".  When we speak of a
15784 +   "parent-first follower", it describes the node that "comes next in parent-first
15785 +   order" (alternatively the node that "came before in reverse parent-first order").
15786 +
15787 +   The following pseudo-code prints the nodes of a tree in forward parent-first order:
15788 +
15789 +   void parent_first (node)
15790 +   {
15791 +     print_node (node);
15792 +     if (node->level > leaf) {
15793 +       for (i = 0; i < num_children; i += 1) {
15794 +         parent_first (node->child[i]);
15795 +       }
15796 +     }
15797 +   }
15798 +*/
15799 +
15800 +/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE?  The idea is to optimize block allocation so
15801 +   that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order)
15802 +   can be accomplished with sequential reads, which results in reading nodes in their
15803 +   parent-first order.  This is a read-optimization aspect of the flush algorithm, and
15804 +   there is also a write-optimization aspect, which is that we wish to make large
15805 +   sequential writes to the disk by allocating or reallocating blocks so that they can be
15806 +   written in sequence.  Sometimes the read-optimization and write-optimization goals
15807 +   conflict with each other, as we discuss in more detail below.
15808 +*/
15809 +
15810 +/* STATE BITS: The flush code revolves around the state of the jnodes it covers.  Here are
15811 +   the relevant jnode->state bits and their relevance to flush:
15812 +
15813 +     JNODE_DIRTY: If a node is dirty, it must be flushed.  But in order to be written it
15814 +     must be allocated first.  In order to be considered allocated, the jnode must have
15815 +     exactly one of { JNODE_OVRWR, JNODE_RELOC } set.  These two bits are exclusive, and
15816 +     all dirtied jnodes eventually have one of these bits set during each transaction.
15817 +
15818 +     JNODE_CREATED: The node was freshly created in its transaction and has no previous
15819 +     block address, so it is unconditionally assigned to be relocated, although this is
15820 +     mainly for code-convenience.  It is not being 'relocated' from anything, but in
15821 +     almost every regard it is treated as part of the relocate set.  The JNODE_CREATED bit
15822 +     remains set even after JNODE_RELOC is set, so the actual relocate can be
15823 +     distinguished from the created-and-allocated set easily: relocate-set members
15824 +     (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which
15825 +     have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
15826 +
15827 +     JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the
15828 +     decision to maintain the pre-existing location for this node and it will be written
15829 +     to the wandered-log.
15830 +
15831 +     JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was
15832 +     not created, see note above).  A block with JNODE_RELOC set is eligible for
15833 +     early-flushing and may be submitted during flush_empty_queues.  When the JNODE_RELOC
15834 +     bit is set on a znode, the parent node's internal item is modified and the znode is
15835 +     rehashed.
15836 +
15837 +     JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node
15838 +     and calls plugin->f.squeeze() method for its items. This is how we update disk
15839 +     clusters of cryptcompress objects. Also, if the leftmost point found by flush scan
15840 +     has this flag (races with write(), rare case), the flush algorithm makes the decision
15841 +     to pass it to squalloc() in spite of its flushprepped status: for squeezing, not for
15842 +     repeated allocation.
15843 +
15844 +     JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its
15845 +     flush queue.  This means the jnode is not on any clean or dirty list, instead it is
15846 +     moved to the private list of one of the flush queue objects (see flush_queue.h). This
15847 +     prevents multiple concurrent flushes from attempting to start flushing from the
15848 +     same node.
15849 +
15850 +     (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
15851 +     squeeze-and-allocate on a node while its children are actively being squeezed and
15852 +     allocated.  This flag was created to avoid submitting a write request for a node
15853 +     while its children are still being allocated and squeezed. Then flush queue was
15854 +     re-implemented to allow unlimited number of nodes be queued. This flag support was
15855 +     commented out in source code because we decided that there was no reason to submit
15856 +     queued nodes before jnode_flush() finishes.  However, current code calls fq_write()
15857 +     during a slum traversal and may submit "busy nodes" to disk. Probably we can
15858 +     re-enable the JNODE_FLUSH_BUSY bit support in future.
15859 +
15860 +   With these state bits, we describe a test used frequently in the code below,
15861 +   jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()).  The
15862 +   test for "flushprepped" returns true if any of the following are true:
15863 +
15864 +     - The node is not dirty
15865 +     - The node has JNODE_RELOC set
15866 +     - The node has JNODE_OVRWR set
15867 +
15868 +   If either the node is not dirty or it has already been processed by flush (and assigned
15869 +   JNODE_OVRWR or JNODE_RELOC), then it is prepped.  If jnode_is_flushprepped() returns
15870 +   false then flush has work to do on that node.
15871 +*/
15872 +
15873 +/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
15874 +   flushprepped twice (unless an explicit call to flush_unprep is made as described in
15875 +   detail below).  For example a node is dirtied, allocated, and then early-flushed to
15876 +   disk and set clean.  Before the transaction commits, the page is dirtied again and, due
15877 +   to memory pressure, the node is flushed again.  The flush algorithm will not relocate
15878 +   the node to a new disk location, it will simply write it to the same, previously
15879 +   relocated position again.
15880 +*/
15881 +
15882 +/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we
15883 +   start at a leaf node and allocate in parent-first order by iterating to the right.  At
15884 +   each step of the iteration, we check for the right neighbor.  Before advancing to the
15885 +   right neighbor, we check if the current position and the right neighbor share the same
15886 +   parent.  If they do not share the same parent, the parent is allocated before the right
15887 +   neighbor.
15888 +
15889 +   This process goes recursively up the tree and squeezes nodes level by level as long as
15890 +   the right neighbor and the current position have different parents, then it allocates
15891 +   the right-neighbors-with-different-parents on the way back down.  This process is
15892 +   described in more detail in flush_squalloc_changed_ancestor and the recursive function
15893 +   squalloc_one_changed_ancestor.  But the purpose here is not so much to discuss the
15894 +   specifics of the bottom-up approach as it is to contrast the bottom-up and top-down
15895 +   approaches.
15896 +
15897 +   The top-down algorithm was implemented earlier (April-May 2002).  In the top-down
15898 +   approach, we find a starting point by scanning left along each level past dirty nodes,
15899 +   then going up and repeating the process until the left node and the parent node are
15900 +   clean.  We then perform a parent-first traversal from the starting point, which makes
15901 +   allocating in parent-first order trivial.  After one subtree has been allocated in this
15902 +   manner, we move to the right, try moving upward, then repeat the parent-first
15903 +   traversal.
15904 +
15905 +   Both approaches have problems that need to be addressed.  Both are approximately the
15906 +   same amount of code, but the bottom-up approach has advantages in the order it acquires
15907 +   locks which, at the very least, make it the better approach.  At first glance each one
15908 +   makes the other one look simpler, so it is important to remember a few of the problems
15909 +   with each one.
15910 +
15911 +   Main problem with the top-down approach: When you encounter a clean child during the
15912 +   parent-first traversal, what do you do?  You would like to avoid searching through a
15913 +   large tree of nodes just to find a few dirty leaves at the bottom, and there is not an
15914 +   obvious solution.  One of the advantages of the top-down approach is that during the
15915 +   parent-first traversal you check every child of a parent to see if it is dirty.  In
15916 +   this way, the top-down approach easily handles the main problem of the bottom-up
15917 +   approach: unallocated children.
15918 +
15919 +   The unallocated children problem is that before writing a node to disk we must make
15920 +   sure that all of its children are allocated.  Otherwise, writing the node means
15921 +   extra I/O because the node will have to be written again when the child is finally
15922 +   allocated.
15923 +
15924 +   WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM.  Except for bugs, this
15925 +   should not cause any file system corruption, it only degrades I/O performance because a
15926 +   node may be written when it is sure to be written at least one more time in the same
15927 +   transaction when the remaining children are allocated.  What follows is a description
15928 +   of how we will solve the problem.
15929 +*/
15930 +
15931 +/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then,
15932 +   proceeding in parent first order, allocate some of its left-children, then encounter a
15933 +   clean child in the middle of the parent.  We do not allocate the clean child, but there
15934 +   may remain unallocated (dirty) children to the right of the clean child.  If we were to
15935 +   stop flushing at this moment and write everything to disk, the parent might still
15936 +   contain unallocated children.
15937 +
15938 +   We could try to allocate all the descendents of every node that we allocate, but this
15939 +   is not necessary.  Doing so could result in allocating the entire tree: if the root
15940 +   node is allocated then every unallocated node would have to be allocated before
15941 +   flushing.  Actually, we do not have to write a node just because we allocate it.  It is
15942 +   possible to allocate but not write a node during flush, when it still has unallocated
15943 +   children.  However, this approach is probably not optimal for the following reason.
15944 +
15945 +   The flush algorithm is designed to allocate nodes in parent-first order in an attempt
15946 +   to optimize reads that occur in the same order.  Thus we are read-optimizing for a
15947 +   left-to-right scan through all the leaves in the system, and we are hoping to
15948 +   write-optimize at the same time because those nodes will be written together in batch.
15949 +   What happens, however, if we assign a block number to a node in its read-optimized
15950 +   order but then avoid writing it because it has unallocated children?  In that
15951 +   situation, we lose out on the write-optimization aspect because a node will have to be
15952 +   written again to its location on the device, later, which likely means seeking back
15953 +   to that location.
15954 +
15955 +   So there are tradeoffs. We can choose either:
15956 +
15957 +   A. Allocate all unallocated children to preserve both write-optimization and
15958 +   read-optimization, but this is not always desirable because it may mean having to
15959 +   allocate and flush very many nodes at once.
15960 +
15961 +   B. Defer writing nodes with unallocated children, keep their read-optimized locations,
15962 +   but sacrifice write-optimization because those nodes will be written again.
15963 +
15964 +   C. Defer writing nodes with unallocated children, but do not keep their read-optimized
15965 +   locations.  Instead, choose to write-optimize them later, when they are written.  To
15966 +   facilitate this, we "undo" the read-optimized allocation that was given to the node so
15967 +   that later it can be write-optimized, thus "unpreparing" the flush decision.  This is a
15968 +   case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above.  By a
15969 +   call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
15970 +   if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block
15971 +   location, and set the JNODE_CREATED bit, effectively setting the node back to an
15972 +   unallocated state.
15973 +
15974 +   We will take the following approach in v4.0: for twig nodes we will always finish
15975 +   allocating unallocated children (A).  For nodes with (level > TWIG) we will defer
15976 +   writing and choose write-optimization (C).
15977 +
15978 +   To summarize, there are several parts to a solution that avoids the problem with
15979 +   unallocated children:
15980 +
15981 +   FIXME-ZAM: None of these approaches is implemented yet to eliminate the "UNALLOCATED CHILDREN"
15982 +   problem, because an experiment showed that we have 1-2 nodes
15983 +   with unallocated children for thousands of written nodes.  The experiment was a simple one,
15984 +   like copying / deletion of linux kernel sources.  However the problem can arise in more
15985 +   complex tests.  I think we have jnode_io_hook to insert a check for unallocated
15986 +   children and see what kind of problem we have.
15987 +
15988 +   1. When flush reaches a stopping point (e.g., a clean node), it should continue calling
15989 +   squeeze-and-allocate on any remaining unallocated children.  FIXME: Difficulty to
15990 +   implement: should be simple -- amounts to adding a while loop to jnode_flush, see
15991 +   comments in that function.
15992 +
15993 +   2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still
15994 +   have unallocated children.  If the twig level has unallocated children it is an
15995 +   assertion failure.  If a higher-level node has unallocated children, then it should be
15996 +   explicitly de-allocated by a call to flush_unprep().  FIXME: Difficulty to implement:
15997 +   should be simple.
15998 +
15999 +   3. (CPU-Optimization) Checking whether a node has unallocated children may consume more
16000 +   CPU cycles than we would like, and it is possible (but medium complexity) to optimize
16001 +   this somewhat in the case where large sub-trees are flushed.  The following observation
16002 +   helps: if both the left- and right-neighbor of a node are processed by the flush
16003 +   algorithm then the node itself is guaranteed to have all of its children allocated.
16004 +   However, the cost of this check may not be so expensive after all: it is not needed for
16005 +   leaves and flush can guarantee this property for twigs.  That leaves only (level >
16006 +   TWIG) nodes that have to be checked, so this optimization only helps if at least three
16007 +   (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless
16008 +   there are many more (level > TWIG) nodes.  But if there are many (level > TWIG) nodes
16009 +   then the number of blocks being written will be very large, so the savings may be
16010 +   insignificant.  That said, the idea is to maintain both the left and right edges of
16011 +   nodes that are processed in flush.  When flush_empty_queue() is called, a relatively
16012 +   simple test will tell whether the (level > TWIG) node is on the edge.  If it is on the
16013 +   edge, the slow check is necessary, but if it is in the interior then it can be assumed
16014 +   to have all of its children allocated.  FIXME: medium complexity to implement, but
16015 +   simple to verify given that we must have a slow check anyway.
16016 +
16017 +   4. (Optional) This part is optional, not for v4.0--flush should work independently of
16018 +   whether this option is used or not.  Called RAPID_SCAN, the idea is to amend the
16019 +   left-scan operation to take unallocated children into account.  Normally, the left-scan
16020 +   operation goes left as long as adjacent nodes are dirty up until some large maximum
16021 +   value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing.  But scan-left
16022 +   may stop at a position where there are unallocated children to the left with the same
16023 +   parent.  When RAPID_SCAN is enabled, the ordinary scan-left operation stops after
16024 +   FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then proceeds
16025 +   with a rapid scan.  The rapid scan skips all the interior children of a node--if the
16026 +   leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the
16027 +   twig to the left).  If the left neighbor of the leftmost child is also dirty, then
16028 +   continue the scan at the left twig and repeat.  This option will cause flush to
16029 +   allocate more twigs in a single pass, but it also has the potential to write many more
16030 +   nodes than would otherwise be written without the RAPID_SCAN option.  RAPID_SCAN
16031 +   was partially implemented, code removed August 12, 2002 by JMACD.
16032 +*/
16033 +
16034 +/* FLUSH CALLED ON NON-LEAF LEVEL.  Most of our design considerations assume that the
16035 +   starting point for flush is a leaf node, but actually the flush code cares very little
16036 +   about whether or not this is true.  It is possible that all the leaf nodes are flushed
16037 +   and dirty parent nodes still remain, in which case jnode_flush() is called on a
16038 +   non-leaf argument.  Flush doesn't care--it treats the argument node as if it were a
16039 +   leaf, even when it is not.  This is a simple approach, and there may be a more optimal
16040 +   policy but until a problem with this approach is discovered, simplest is probably best.
16041 +
16042 +   NOTE: In this case, the ordering produced by flush is parent-first only if you ignore
16043 +   the leaves.  This is done as a matter of simplicity and there is only one (shaky)
16044 +   justification.  When an atom commits, it flushes all leaf level nodes first, followed
16045 +   by twigs, and so on.  With flushing done in this order, if flush is eventually called
16046 +   on a non-leaf node it means that (somehow) we reached a point where all leaves are
16047 +   clean and only internal nodes need to be flushed.  If that is the case, then it means
16048 +   there were no leaves that were the parent-first preceder/follower of the parent.  This
16049 +   is expected to be a rare case, which is why we do nothing special about it.  However,
16050 +   memory pressure may pass an internal node to flush when there are still dirty leaf
16051 +   nodes that need to be flushed, which could prove our original assumptions
16052 +   "inoperative".  If this needs to be fixed, then scan_left/right should have
16053 +   special checks for the non-leaf levels.  For example, instead of passing from a node to
16054 +   the left neighbor, it should pass from the node to the left neighbor's rightmost
16055 +   descendent (if dirty).
16056 +
16057 +*/
16058 +
16059 +/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING.  We walk the tree in 4MB-16MB chunks, dirtying everything and putting
16060 +   it into a transaction.  We tell the allocator to allocate the blocks as far as possible towards one end of the
16061 +   logical device--the left (starting) end of the device if we are walking from left to right, the right end of the
16062 +   device if we are walking from right to left.  We then make passes in alternating directions, and as we do this the
16063 +   device becomes sorted such that tree order and block number order fully correlate.
16064 +
16065 +   Resizing is done by shifting everything either all the way to the left or all the way
16066 +   to the right, and then reporting the last block.
16067 +*/
16068 +
16069 +/* RELOCATE DECISIONS: The code makes a decision to relocate in several places.  This
16070 +   describes the policy from the highest level:
16071 +
16072 +   The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the
16073 +   leaf level during flush-scan (right, left), then we unconditionally decide to relocate
16074 +   leaf nodes.
16075 +
16076 +   Otherwise, there are two contexts in which we make a decision to relocate:
16077 +
16078 +   1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
16079 +   During the initial stages of flush, after scan-right completes, we want to ask the
16080 +   question: should we relocate this leaf node and thus dirty the parent node.  Then if
16081 +   the node is a leftmost child its parent is its own parent-first preceder, thus we repeat
16082 +   the question at the next level up, and so on.  In these cases we are moving in the
16083 +   reverse-parent first direction.
16084 +
16085 +   There is another case which is considered the reverse direction, which comes at the end
16086 +   of a twig in reverse_relocate_end_of_twig().  As we finish processing a twig we may
16087 +   reach a point where there is a clean twig to the right with a dirty leftmost child.  In
16088 +   this case, we may wish to relocate the child by testing if it should be relocated
16089 +   relative to its parent.
16090 +
16091 +   2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in
16092 +   allocate_znode.  What distinguishes the forward parent-first case from the
16093 +   reverse-parent first case is that the preceder has already been allocated in the
16094 +   forward case, whereas in the reverse case we don't know what the preceder is until we
16095 +   finish "going in reverse".  That simplifies the forward case considerably, and there we
16096 +   actually use the block allocator to determine whether, e.g., a block closer to the
16097 +   preceder is available.
16098 +*/
16099 +
16100 +/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration.  The idea is, once we
16101 +   finish scan-left and find a starting point, if the parent's left neighbor is dirty then
16102 +   squeeze the parent's left neighbor and the parent.  This may change the
16103 +   flush-starting-node's parent.  Repeat until the child's parent is stable.  If the child
16104 +   is a leftmost child, repeat this left-edge squeezing operation at the next level up.
16105 +   Note that we cannot allocate extents during this or they will be out of parent-first
16106 +   order.  There are also some difficult coordinate maintenance issues.  We can't do a tree
16107 +   search to find coordinates again (because we hold locks), we have to determine them
16108 +   from the two nodes being squeezed.  Looks difficult, but has potential to increase
16109 +   space utilization. */
16110 +
16111 +/* Flush-scan helper functions. */
16112 +static void scan_init(flush_scan * scan);
16113 +static void scan_done(flush_scan * scan);
16114 +
16115 +/* Flush-scan algorithm. */
16116 +static int scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit);
16117 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
16118 +static int scan_common(flush_scan * scan, flush_scan * other);
16119 +static int scan_formatted(flush_scan * scan);
16120 +static int scan_unformatted(flush_scan * scan, flush_scan * other);
16121 +static int scan_by_coord(flush_scan * scan);
16122 +
16123 +/* Initial flush-point ancestor allocation. */
16124 +static int alloc_pos_and_ancestors(flush_pos_t * pos);
16125 +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos);
16126 +static int set_preceder(const coord_t * coord_in, flush_pos_t * pos);
16127 +
16128 +/* Main flush algorithm.  Note on abbreviation: "squeeze and allocate" == "squalloc". */
16129 +static int squalloc(flush_pos_t * pos);
16130 +
16131 +/* Flush squeeze implementation. */
16132 +static int squeeze_right_non_twig(znode * left, znode * right);
16133 +static int shift_one_internal_unit(znode * left, znode * right);
16134 +
16135 +/* Flush reverse parent-first relocation routines. */
16136 +static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk, const reiser4_block_nr * nblk);
16137 +static int reverse_relocate_test(jnode * node, const coord_t * parent_coord, flush_pos_t * pos);
16138 +static int reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord, flush_pos_t * pos);
16139 +
16140 +/* Flush allocate write-queueing functions: */
16141 +static int allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos);
16142 +static int allocate_znode_update(znode * node, const coord_t * parent_coord, flush_pos_t * pos);
16143 +static int lock_parent_and_allocate_znode (znode *, flush_pos_t *);
16144 +
16145 +/* Flush helper functions: */
16146 +static int jnode_lock_parent_coord(jnode         * node,
16147 +                                  coord_t       * coord,
16148 +                                  lock_handle   * parent_lh,
16149 +                                  load_count    * parent_zh,
16150 +                                  znode_lock_mode mode, int try);
16151 +static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side, znode_lock_mode mode);
16152 +static int znode_same_parents(znode * a, znode * b);
16153 +
16154 +/* flushprepped test for a znode: forwards its jnode to jnode_check_flushprepped() */
16155 +static int znode_check_flushprepped(znode * node)
16156 +{
16157 +       return jnode_check_flushprepped(ZJNODE(node));
16158 +}
16159 +
16160 +/* Flush position functions */
16161 +static void pos_init(flush_pos_t * pos);
16162 +static int pos_valid(flush_pos_t * pos);
16163 +static void pos_done(flush_pos_t * pos);
16164 +static int pos_stop(flush_pos_t * pos);
16165 +
16166 +/* check that @scan->node is the first jnode of its extent unit, if the extent is
16167 + * unallocated, because all jnodes of an unallocated extent are dirty and of the same atom. */
16168 +#define checkchild(scan)                                               \
16169 +assert("nikita-3435",                                                  \
16170 +       ergo((scan)->direction == LEFT_SIDE &&                          \
16171 +            ((scan)->parent_coord.node->level == TWIG_LEVEL) &&        \
16172 +            jnode_is_unformatted((scan)->node) &&                      \
16173 +            extent_is_unallocated(&(scan)->parent_coord),              \
16174 +            extent_unit_index(&(scan)->parent_coord) == index_jnode((scan)->node)))
16175 +
16176 +/* Flush debug functions */
16177 +#if REISER4_DEBUG_OUTPUT
16178 +#else
16179 +#endif
16180 +
16181 +const char *pos_tostring(flush_pos_t * pos);
16182 +
16183 +/* This flush_cnt variable is used to track the number of concurrent flush operations,
16184 +   useful for debugging.  It is initialized in txnmgr.c out of laziness (because flush has
16185 +   no static initializer function...) */
16186 +ON_DEBUG(atomic_t flush_cnt;)
16187 +
16188 +
16189 +/* FIXME: remove me */#define FLUSH_CHECKS_CONGESTION 1
16190 +
16191 +#if defined (FLUSH_CHECKS_CONGESTION)
16192 +/* Tell whether the backing device of the current reiser4 super block is write
16193 +   congested.  The device info is reached through the i_mapping of the inode that
16194 +   get_super_fake() returns.  The result of bdi_write_congested() is returned as is. */
16195 +static int check_write_congestion (void)
16196 +{
16197 +       struct super_block *super = reiser4_get_current_sb();
16198 +       struct backing_dev_info *device = get_super_fake(super)->i_mapping->backing_dev_info;
16199 +
16200 +       return bdi_write_congested(device);
16201 +}
16202 +#endif /* FLUSH_CHECKS_CONGESTION */
16203 +
16204 +/* Submit the flush queue of @pos for writing, if allowed.  Returns 0 without writing
16205 +   when JNODE_FLUSH_WRITE_BLOCKS is not set in @pos->flags, or when @check_congestion is
16206 +   non-zero and the backing device is congested; otherwise returns write_fq()'s result. */
16207 +static int write_prepped_nodes (flush_pos_t * pos, int check_congestion)
16208 +{
16209 +       assert("zam-831", pos);
16210 +       assert("zam-832", pos->fq);
16211 +
16212 +       if ((pos->flags & JNODE_FLUSH_WRITE_BLOCKS) == 0)
16213 +               return 0;
16214 +
16215 +#if defined (FLUSH_CHECKS_CONGESTION)
16216 +       if (check_congestion && check_write_congestion())
16217 +               return 0;
16218 +#endif /* FLUSH_CHECKS_CONGESTION */
16219 +
16220 +       /* leave a mark in the I/O log before the queue is submitted */
16221 +       write_current_logf(WRITE_IO_LOG, "mark=flush\n");
16222 +
16223 +       /* hand the queue to write_fq() and propagate its status */
16224 +       return write_fq(pos->fq, pos->nr_written,
16225 +                       WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
16226 +}
16227 +
16228 +/* Properly release the resources held by flush position @pos, then move it to the
16229 +   new locked and loaded node given by @new_lock / @new_load (@new_coord may be NULL). */
16230 +static void move_flush_pos (flush_pos_t * pos, lock_handle * new_lock,
16231 +                           load_count * new_load, const coord_t * new_coord)
16232 +{
16233 +       assert ("zam-857", new_lock->node == new_load->node);
16234 +
16235 +       if (new_coord) {
16236 +               assert ("zam-858", new_coord->node == new_lock->node);
16237 +               coord_dup(&pos->coord, new_coord);
16238 +       } else { /* no coord supplied: start at the first unit of the new node */
16239 +               coord_init_first_unit(&pos->coord, new_lock->node);
16240 +       }
16241 +
16242 +       if (pos->child) { /* release the remembered child via jput() and forget it */
16243 +               jput(pos->child);
16244 +               pos->child = NULL;
16245 +       }
16246 +
16247 +       move_load_count(&pos->load, new_load);
16248 +       done_lh(&pos->lock); /* the old lock is released before the new one is taken over */
16249 +       move_lh(&pos->lock, new_lock);
16250 +}
16251 +
16252 +/* Delete an empty, write-locked node whose link from the parent still exists. */
16253 +static int delete_empty_node (znode * node)
16254 +{
16255 +       reiser4_key smallest_removed; /* passed to delete_node(); its value is ignored here */
16256 +
16257 +       assert("zam-1019", node != NULL);
16258 +       assert("zam-1020", node_is_empty(node));
16259 +       assert("zam-1023", znode_is_wlocked(node));
16260 +
16261 +       return delete_node(node, &smallest_removed, NULL); /* result is returned unchanged */
16262 +}
16263 +
16264 +/* Prepare flush position @pos for alloc_pos_and_ancestors() and squalloc(), starting
16265 +   from @org.  A znode is write-locked and loaded itself; for any other jnode the parent
16266 +   is locked and @pos is put at the parent coord.  Returns 0, -EAGAIN or a lock/load error. */
16267 +static int prepare_flush_pos(flush_pos_t *pos, jnode * org)
16268 +{
16269 +       int ret;
16270 +       load_count load;
16271 +       lock_handle lock;
16272 +
16273 +       init_lh(&lock);
16274 +       init_load_count(&load);
16275 +
16276 +       if (jnode_is_znode(org)) {
16277 +               ret = longterm_lock_znode(&lock, JZNODE(org), ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
16278 +               if (ret)
16279 +                       return ret; /* nothing has been acquired yet */
16280 +
16281 +               ret = incr_load_count_znode(&load, JZNODE(org));
16282 +               if (ret)
16283 +                       goto done; /* the long term lock is held and must be released */
16284 +
16285 +               pos->state = (jnode_get_level(org) == LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
16286 +               move_flush_pos(pos, &lock, &load, NULL);
16287 +       } else {
16288 +               coord_t parent_coord;
16289 +               ret = jnode_lock_parent_coord(org, &parent_coord, &lock, &load, ZNODE_WRITE_LOCK, 0);
16290 +               if (ret)
16291 +                       goto done;
16292 +
16293 +               pos->state = POS_ON_EPOINT;
16294 +               move_flush_pos(pos, &lock, &load, &parent_coord);
16295 +               pos->child = jref(org);
16296 +               if (extent_is_unallocated(&parent_coord) && extent_unit_index(&parent_coord) != index_jnode(org)) {
16297 +                       /* @org is not first child of its parent unit. This may happen
16298 +                          because longterm lock of its parent node was released between
16299 +                          scan_left and scan_right. For now work around this having flush to repeat */
16300 +                       ret = -EAGAIN;
16301 +               }
16302 +       }
16303 +
16304 + done:
16305 +       done_load_count(&load);
16306 +       done_lh(&lock);
16307 +       return ret;
16308 +}
16309 +
16310 +#if REISER4_DEBUG
16311 +/* debug check: if the node of @pos->lock is locked at all, znode_at_read() must hold */
16312 +void check_pos(flush_pos_t *pos)
16313 +{
16314 +       znode *node = pos->lock.node;
16315 +       if (node == NULL || !znode_is_any_locked(node))
16316 +               return;
16317 +       assert("nikita-3562", znode_at_read(node));
16318 +}
16319 +#endif
16320 +
16321 +#if REISER4_TRACE
16322 +
16323 +const char *coord_tween_tostring(between_enum n);
16324 +
16325 +
16326 +
16327 +/* Append a one-line description of @node to the NUL-terminated string already in @buf.
16328 +   @buf carries no size, so the caller must leave enough room for the description. */
16329 +reiser4_internal void
16330 +jnode_tostring_internal(jnode * node, char *buf)
16331 +{
16332 +       const char *state;
16333 +       char atom[32];
16334 +       char block[80]; /* 48 was too small for an address plus two pointer-sized hex fields */
16335 +       char items[32];
16336 +       int fmttd, dirty, lockit;
16337 +
16338 +       lockit = spin_trylock_jnode(node); /* try-lock only: the node is printed either way */
16339 +
16340 +       fmttd = jnode_is_znode(node);
16341 +       dirty = JF_ISSET(node, JNODE_DIRTY);
16342 +
16343 +       snprintf(block, sizeof(block), " block=%s page=%p state=%lx", sprint_address(jnode_get_block(node)), node->pg, node->state);
16344 +
16345 +       if (JF_ISSET(node, JNODE_OVRWR)) {
16346 +               state = dirty ? "wandr,dirty" : "wandr";
16347 +       } else if (JF_ISSET(node, JNODE_RELOC) && JF_ISSET(node, JNODE_CREATED)) {
16348 +               state = dirty ? "creat,dirty" : "creat";
16349 +       } else if (JF_ISSET(node, JNODE_RELOC)) {
16350 +               state = dirty ? "reloc,dirty" : "reloc";
16351 +       } else if (JF_ISSET(node, JNODE_CREATED)) {
16352 +               assert("jmacd-61554", dirty);
16353 +               state = "fresh";
16354 +               block[0] = 0;
16355 +       } else {
16356 +               state = dirty ? "dirty" : "clean";
16357 +       }
16358 +
16359 +       if (node->atom == NULL) {
16360 +               atom[0] = 0;
16361 +       } else {
16362 +               snprintf(atom, sizeof(atom), " atom=%u", node->atom->atom_id);
16363 +       }
16364 +
16365 +       items[0] = 0;
16366 +       if (!fmttd) {
16367 +               snprintf(items, sizeof(items), " index=%lu", index_jnode(node));
16368 +       }
16369 +
16370 +       sprintf(buf + strlen(buf),
16371 +               "%s=%p [%s%s%s level=%u%s%s]",
16372 +               fmttd ? "z" : "j",
16373 +               node,
16374 +               state, atom, block, jnode_get_level(node), items, JF_ISSET(node, JNODE_FLUSH_QUEUED) ? " fq" : "");
16375 +
16376 +       if (lockit == 1) {
16377 +               UNLOCK_JNODE(node);
16378 +       }
16379 +}
16380 +
16381 +reiser4_internal const char * /* result points into a static buffer reused by every call */
16382 +jnode_tostring(jnode * node)
16383 +{
16384 +       static char fmtbuf[256];
16385 +       fmtbuf[0] = 0; /* start from an empty string; the helper appends to it */
16386 +       jnode_tostring_internal(node, fmtbuf);
16387 +       return fmtbuf;
16388 +}
16389 +
16390 +/* Printable name of flush flags value @flags.  Only an exact match with one of the
16391 +   four values tested below is named; anything else yields "(unknown)". */
16392 +static const char *flags_tostring(int flags)
16393 +{
16394 +       if (flags == JNODE_FLUSH_WRITE_BLOCKS)
16395 +               return "(write blocks)";
16396 +       if (flags == JNODE_FLUSH_COMMIT)
16397 +               return "(commit)";
16398 +       if (flags == JNODE_FLUSH_MEMORY_FORMATTED)
16399 +               return "(memory-z)";
16400 +       if (flags == JNODE_FLUSH_MEMORY_UNFORMATTED)
16401 +               return "(memory-j)";
16402 +
16403 +       /* a combination of flags, or a value not listed above */
16404 +       return "(unknown)";
16405 +}
16406 +reiser4_internal const char * /* returns what jnode_tostring() returns for the jnode of @node */
16407 +znode_tostring(znode * node)
16408 +{
16409 +       return jnode_tostring(ZJNODE(node));
16410 +}
16411 +
16412 +
16413 +reiser4_internal const char * /* describes @pos; result is a static buffer or "*error*" */
16414 +pos_tostring(flush_pos_t * pos)
16415 +{
16416 +       static char fmtbuf[256]; /* shared by all calls: the result is overwritten by the next call */
16417 +       load_count load;
16418 +       fmtbuf[0] = 0;
16419 +
16420 +       init_load_count(&load);
16421 +
16422 +       if (pos->state == POS_ON_EPOINT) {
16423 +               assert("jmacd-79123", pos->lock.node == pos->load.node);
16424 +
16425 +               strcat(fmtbuf, "par:");
16426 +               jnode_tostring_internal(ZJNODE(pos->lock.node), fmtbuf);
16427 +
16428 +               if (incr_load_count_znode(&load, pos->lock.node)) { /* node is loaded before its coord is examined */
16429 +                       return "*error*";
16430 +               }
16431 +
16432 +               if (coord_is_before_leftmost(&pos->coord)) {
16433 +                       sprintf(fmtbuf + strlen(fmtbuf), "[left]");
16434 +               } else if (coord_is_after_rightmost(&pos->coord)) {
16435 +                       sprintf(fmtbuf + strlen(fmtbuf), "[right]");
16436 +               } else { /* coord is inside the node: print item position, then unit position */
16437 +                       sprintf(fmtbuf + strlen(fmtbuf), "[%s i=%u/%u",
16438 +                               coord_tween_tostring(pos->coord.between),
16439 +                               pos->coord.item_pos, node_num_items(pos->coord.node));
16440 +
16441 +                       if (!coord_is_existing_item(&pos->coord)) {
16442 +                               sprintf(fmtbuf + strlen(fmtbuf), "]");
16443 +                       } else {
16444 +
16445 +                               sprintf(fmtbuf + strlen(fmtbuf), ",u=%u/%u %s]",
16446 +                                       pos->coord.unit_pos,
16447 +                                       coord_num_units(&pos->coord), coord_is_existing_unit(&pos->coord)
16448 +                                       ? (item_is_extent(&pos->coord) ?
16449 +                                          "ext" : (item_is_internal(&pos->coord) ? "int" : "other"))
16450 +                                       : "tween");
16451 +                       }
16452 +               }
16453 +       } else if (pos->lock.node != NULL) { /* not on an extent point: describe the locked node only */
16454 +               strcat(fmtbuf, "pt:");
16455 +               jnode_tostring_internal(ZJNODE(pos->lock.node), fmtbuf);
16456 +       }
16457 +
16458 +       done_load_count(&load);
16459 +       return fmtbuf;
16460 +}
16461 +
16462 +#endif /* REISER4_TRACE */
16463 +
16464 +/* TODO LIST (no particular order): */
16465 +/* I have labelled most of the legitimate FIXME comments in this file with letters to
16466 +   indicate which issue they relate to.  There are a few miscellaneous FIXMEs with
16467 +   specific names mentioned instead that need to be inspected/resolved. */
16468 +/* B. There is an issue described in reverse_relocate_test having to do with an
16469 +   imprecise is_preceder? check having to do with partially-dirty extents.  The code that
16470 +   sets preceder hints and computes the preceder is basically untested.  Careful testing
16471 +   needs to be done that preceder calculations are done correctly, since if it doesn't
16472 +   affect correctness we will not catch this stuff during regular testing. */
16473 +/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling.  It is unclear which of these are
16474 +   considered expected but unlikely conditions.  Flush currently returns 0 (i.e., success
16475 +   but no progress, i.e., restart) whenever it receives any of these in jnode_flush().
16476 +   Many of the calls that may produce one of these return values (i.e.,
16477 +   longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these
16478 +   values themselves and, for instance, stop flushing instead of resulting in a restart.
16479 +   If any of these results are true error conditions then flush will go into a busy-loop,
16480 +   as we noticed during testing when a corrupt tree caused find_child_ptr to return
16481 +   ENOENT.  It needs careful thought and testing of corner conditions.
16482 +*/
16483 +/* D. Atomicity of flush_prep against deletion and flush concurrency.  Suppose a created
16484 +   block is assigned a block number then early-flushed to disk.  It is dirtied again and
16485 +   flush is called again.  Concurrently, that block is deleted, and the de-allocation of
16486 +   its block number does not need to be deferred, since it is not part of the preserve set
16487 +   (i.e., it didn't exist before the transaction).  I think there may be a race condition
16488 +   where flush writes the dirty, created block after the non-deferred deallocated block
16489 +   number is re-allocated, making it possible to write deleted data on top of non-deleted
16490 +   data.  It's just a theory, but it needs to be thought out. */
16491 +/* F. bio_alloc() failure is not handled gracefully. */
16492 +/* G. Unallocated children. */
16493 +/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
16494 +/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
16495 +
16496 +/* JNODE_FLUSH: MAIN ENTRY POINT */
16497 +/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty
16498 +   neighborhood is named "slum").  Jnode_flush() is called if reiser4 has to write dirty
16499 +   blocks to disk, it happens when Linux VM decides to reduce number of dirty pages or as
16500 +   a part of transaction commit.
16501 +
16502 +   Our objective here is to prep and flush the slum the jnode belongs to. We want to
16503 +   squish the slum together, and allocate the nodes in it as we squish because allocation
16504 +   of children affects squishing of parents.
16505 +
16506 +   The "argument" @node tells flush where to start.  From there, flush finds the left edge
16507 +   of the slum, and calls squalloc (in which nodes are squeezed and allocated).  To find a
16508 +   "better place" to start squalloc first we perform a flush_scan.
16509 +
16510 +   Flush-scanning may be performed in both left and right directions, but for different
16511 +   purposes.  When scanning to the left, we are searching for a node that precedes a
16512 +   sequence of parent-first-ordered nodes which we will then flush in parent-first order.
16513 +   During flush-scanning, we also take the opportunity to count the number of consecutive
16514 +   leaf nodes.  If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we
16515 +   make a decision to reallocate leaf nodes (thus favoring write-optimization).
16516 +
16517 +   Since the flush argument node can be anywhere in a sequence of dirty leaves, there may
16518 +   also be dirty nodes to the right of the argument.  If the scan-left operation does not
16519 +   count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan
16520 +   operation to see whether there are, in fact, enough nodes to meet the relocate
16521 +   threshold.  Each right- and left-scan operation uses a single flush_scan object.
16522 +
16523 +   After left-scan and possibly right-scan, we prepare a flush_position object with the
16524 +   starting flush point or parent coordinate, which was determined using scan-left.
16525 +
16526 +   Next we call the main flush routine, squalloc, which iterates along the
16527 +   leaf level, squeezing and allocating nodes (and placing them into the flush queue).
16528 +
16529 +   After squalloc returns we take extra steps to ensure that all the children
16530 +   of the final twig node are allocated--this involves repeating squalloc
16531 +   until we finish at a twig with no unallocated children.
16532 +
16533 +   Finally, we call flush_empty_queue to submit write-requests to disk.  If we encounter
16534 +   any above-twig nodes during flush_empty_queue that still have unallocated children, we
16535 +   flush_unprep them.
16536 +
16537 +   Flush treats several "failure" cases as non-failures, essentially causing them to start
16538 +   over.  E_DEADLOCK is one example.  FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should
16539 +   probably be handled properly rather than restarting, but there are a bunch of cases to
16540 +   audit.
16541 +*/
16542 +
16543 +static int jnode_flush(jnode * node, long *nr_to_flush, long * nr_written, flush_queue_t * fq, int flags)
16544 +{
16545 +       long ret = 0;
16546 +       flush_scan right_scan;
16547 +       flush_scan left_scan;
16548 +       flush_pos_t flush_pos;
16549 +       int todo;
16550 +       struct super_block *sb;
16551 +       reiser4_super_info_data *sbinfo;
16552 +       jnode * leftmost_in_slum = NULL;
16553 +
16554 +       assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
16555 +       assert("nikita-3022", schedulable());
16556 +
16557 +       /* lock ordering: delete_sema and flush_sema are unordered */
16558 +       assert("nikita-3185",
16559 +              get_current_super_private()->delete_sema_owner != current);
16560 +
16561 +       sb = reiser4_get_current_sb();
16562 +       sbinfo = get_super_private(sb);
16563 +       if (!reiser4_is_set(sb, REISER4_MTFLUSH)) {
16564 +#if REISER4_STATS
16565 +               unsigned long sleep_start = jiffies;
16566 +#endif
16567 +               down(&sbinfo->flush_sema);
16568 +#if REISER4_STATS
16569 +               reiser4_stat_add(flush.slept_in_mtflush_sem , jiffies - sleep_start);
16570 +#endif
16571 +       }
16572 +
16573 +       /* Flush-concurrency debug code */
16574 +#if REISER4_DEBUG
16575 +       atomic_inc(&flush_cnt);
16576 +       ON_TRACE(TRACE_FLUSH,
16577 +                "flush enter: pid %ul %u concurrent procs\n",
16578 +                current->pid, atomic_read(&flush_cnt));
16579 +       IF_TRACE(TRACE_FLUSH,
16580 +                if (atomic_read(&flush_cnt) > 1) printk("flush concurrency\n"););
16581 +#endif
16582 +
16583 +       enter_flush(sb);
16584 +
16585 +       ON_TRACE(TRACE_FLUSH, "flush squalloc %s %s\n", jnode_tostring(node), flags_tostring(flags));
16586 +
16587 +       /* Initialize a flush position. */
16588 +       pos_init(&flush_pos);
16589 +
16590 +       flush_pos.nr_to_flush = nr_to_flush;
16591 +       flush_pos.nr_written = nr_written;
16592 +       flush_pos.fq = fq;
16593 +       flush_pos.flags = flags;
16594 +
16595 +       scan_init(&right_scan);
16596 +       scan_init(&left_scan);
16597 +
16598 +       /* init linkage status of the node */
16599 +       if (jnode_is_znode(node)) {
16600 +               /* if jnode is unformatted this status will be set in scan_unformatted */
16601 +               set_flush_scan_nstat(&left_scan, LINKED);
16602 +               set_flush_scan_nstat(&right_scan, LINKED);
16603 +       }
16604 +
16605 +       /*IF_TRACE (TRACE_FLUSH_VERB, print_tree_rec ("parent_first", current_tree, REISER4_TREE_BRIEF)); */
16606 +       /*IF_TRACE (TRACE_FLUSH_VERB, print_tree_rec ("parent_first", current_tree, REISER4_TREE_CHECK)); */
16607 +
16608 +       /* First scan left and remember the leftmost scan position.  If the leftmost
16609 +          position is unformatted we remember its parent_coord.  We scan until counting
16610 +          FLUSH_SCAN_MAXNODES.
16611 +
16612 +          If starting @node is unformatted, at the beginning of left scan its
16613 +          parent (twig level node, containing extent item) will be long term
16614 +          locked and lock handle will be stored in the
16615 +          @right_scan->parent_lock. This lock is used to start the rightward
16616 +          scan without redoing the tree traversal (necessary to find parent)
16617 +          and, hence, is kept during leftward scan. As a result, we have to
16618 +          use try-lock when taking long term locks during the leftward scan.
16619 +       */
16620 +       ret = scan_left(&left_scan, &right_scan,
16621 +                       node, sbinfo->flush.scan_maxnodes);
16622 +       if (ret != 0)
16623 +               goto failed;
16624 +
16625 +       leftmost_in_slum = jref(left_scan.node);
16626 +       scan_done(&left_scan);
16627 +
16628 +       /* Then possibly go right to decide if we will use a policy of relocating leaves.
16629 +          This is only done if we did not scan past (and count) enough nodes during the
16630 +          leftward scan.  If we do scan right, we only care to go far enough to establish
16631 +          that at least FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed.  The
16632 +          scan limit is the difference between left_scan.count and the threshold. */
16633 +       reiser4_stat_add(flush.left, left_scan.count);
16634 +
16635 +       todo = sbinfo->flush.relocate_threshold - left_scan.count;
16636 +       /* scan right is inherently deadlock prone, because we are
16637 +        * (potentially) holding a lock on the twig node at this moment.
16638 +        * FIXME: this is incorrect comment: lock is not held */
16639 +       if (todo > 0 && (get_flush_scan_nstat(&right_scan) == LINKED)) {
16640 +               ret = scan_right(&right_scan, node, (unsigned)todo);
16641 +               if (ret != 0)
16642 +                       goto failed;
16643 +       }
16644 +
16645 +       /* Only the right-scan count is needed, release any rightward locks right away. */
16646 +       scan_done(&right_scan);
16647 +
16648 +       ON_TRACE(TRACE_FLUSH, "flush: left: %i, right: %i\n",
16649 +                left_scan.count, right_scan.count);
16650 +
16651 +       reiser4_stat_add(flush.right, right_scan.count);
16652 +
16653 +       /* ... and the answer is: we should relocate leaf nodes if at least
16654 +          FLUSH_RELOCATE_THRESHOLD nodes were found. */
16655 +       flush_pos.leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
16656 +               (left_scan.count + right_scan.count >= sbinfo->flush.relocate_threshold);
16657 +
16658 +       /*assert ("jmacd-6218", jnode_check_dirty (left_scan.node)); */
16659 +
16660 +       /* Funny business here.  We set the 'point' in the flush_position at prior to
16661 +          starting squalloc regardless of whether the first point is
16662 +          formatted or unformatted.  Without this there would be an invariant, in the
16663 +          rest of the code, that if the flush_position is unformatted then
16664 +          flush_position->point is NULL and flush_position->parent_{lock,coord} is set,
16665 +          and if the flush_position is formatted then flush_position->point is non-NULL
16666 +          and no parent info is set.
16667 +
16668 +          This seems lazy, but it makes the initial calls to reverse_relocate_test
16669 +          (which ask "is it the pos->point the leftmost child of its parent") much easier
16670 +          because we know the first child already.  Nothing is broken by this, but the
16671 +          reasoning is subtle.  Holding an extra reference on a jnode during flush can
16672 +          cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not
16673 +          removed from sibling lists until they have zero reference count.  Flush would
16674 +          never observe a HEARD_BANSHEE node on the left-edge of flush, nodes are only
16675 +          deleted to the right.  So if nothing is broken, why fix it?
16676 +
16677 +          NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
16678 +          point and in any moment, because of the concurrent file system
16679 +          activity (for example, truncate). */
16680 +
16681 +       /* Check jnode state after flush_scan completed. Having a lock on this
16682 +          node or its parent (in case of unformatted) helps us in case of
16683 +          concurrent flushing. */
16684 +       if (jnode_check_flushprepped(leftmost_in_slum) && !jnode_squeezable(leftmost_in_slum)) {
16685 +               ON_TRACE(TRACE_FLUSH_VERB, "flush concurrency: %s already allocated\n", pos_tostring(&flush_pos));
16686 +               ret = 0;
16687 +               goto failed;
16688 +       }
16689 +
16690 +       /* Now setup flush_pos using scan_left's endpoint. */
16691 +       ret = prepare_flush_pos(&flush_pos, leftmost_in_slum);
16692 +       if (ret)
16693 +               goto failed;
16694 +
16695 +       if (znode_get_level(flush_pos.coord.node) == LEAF_LEVEL
16696 +           && node_is_empty(flush_pos.coord.node)) {
16697 +               znode * empty = flush_pos.coord.node;
16698 +
16699 +               assert ("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
16700 +               ret = delete_empty_node(empty);
16701 +               goto failed;
16702 +       }
16703 +
16704 +       if (jnode_check_flushprepped(leftmost_in_slum) && !jnode_squeezable(leftmost_in_slum)) {
16705 +               ON_TRACE(TRACE_FLUSH_VERB, "flush concurrency: %s already allocated\n", pos_tostring(&flush_pos));
16706 +               ret = 0;
16707 +               goto failed;
16708 +       }
16709 +
16710 +       /* Set pos->preceder and (re)allocate pos and its ancestors if it is needed  */
16711 +       ret = alloc_pos_and_ancestors(&flush_pos);
16712 +       if (ret)
16713 +               goto failed;
16714 +
16715 +       /* Do the main rightward-bottom-up squeeze and allocate loop. */
16716 +       check_pos(&flush_pos);
16717 +       ret = squalloc(&flush_pos);
16718 +       check_pos(&flush_pos);
16719 +       pos_stop(&flush_pos);
16720 +       if (ret)
16721 +               goto failed;
16722 +
16723 +       /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children.
16724 +          First, the pos_stop() and pos_valid() routines should be modified
16725 +          so that pos_stop() sets a flush_position->stop flag to 1 without
16726 +          releasing the current position immediately--instead release it in
16727 +          pos_done().  This is a better implementation than the current one anyway.
16728 +
16729 +          It is not clear that all fields of the flush_position should not be released,
16730 +          but at the very least the parent_lock, parent_coord, and parent_load should
16731 +          remain held because they are hold the last twig when pos_stop() is
16732 +          called.
16733 +
16734 +          When we reach this point in the code, if the parent_coord is set to after the
16735 +          last item then we know that flush reached the end of a twig (and according to
16736 +          the new flush queueing design, we will return now).  If parent_coord is not
16737 +          past the last item, we should check if the current twig has any unallocated
16738 +          children to the right (we are not concerned with unallocated children to the
16739 +          left--in that case the twig itself should not have been allocated).  If the
16740 +          twig has unallocated children to the right, set the parent_coord to that
16741 +          position and then repeat the call to squalloc.
16742 +
16743 +          Testing for unallocated children may be defined in two ways: if any internal
16744 +          item has a fake block number, it is unallocated; if any extent item is
16745 +          unallocated then all of its children are unallocated.  But there is a more
16746 +          aggressive approach: if there are any dirty children of the twig to the right
16747 +          of the current position, we may wish to relocate those nodes now.  Checking for
16748 +          potential relocation is more expensive as it requires knowing whether there are
16749 +          any dirty children that are not unallocated.  The extent_needs_allocation
16750 +          should be used after setting the correct preceder.
16751 +
16752 +          When we reach the end of a twig at this point in the code, if the flush can
16753 +          continue (when the queue is ready) it will need some information on the future
16754 +          starting point.  That should be stored away in the flush_handle using a seal, I
16755 +          believe.  Holding a jref() on the future starting point may break other code
16756 +          that deletes that node.
16757 +       */
16758 +
16759 +       /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called
16760 +          above the twig level.  If the VM calls flush above the twig level, do nothing
16761 +          and return (but figure out why this happens).  The txnmgr should be modified to
16762 +          only flush its leaf-level dirty list.  This will do all the necessary squeeze
16763 +          and allocate steps but leave unallocated branches and possibly unallocated
16764 +          twigs (when the twig's leftmost child is not dirty).  After flushing the leaf
16765 +          level, the remaining unallocated nodes should be given write-optimized
16766 +          locations.  (Possibly, the remaining unallocated twigs should be allocated just
16767 +          before their leftmost child.)
16768 +       */
16769 +
16770 +       /* Any failure reaches this point. */
16771 +failed:
16772 +
16773 +       if (nr_to_flush != NULL) {
16774 +               if (ret >= 0) {
16775 +                       ON_TRACE(TRACE_FLUSH, "flush_jnode wrote %u blocks\n", flush_pos.prep_or_free_cnt);
16776 +                       (*nr_to_flush) = flush_pos.prep_or_free_cnt;
16777 +               } else {
16778 +                       (*nr_to_flush) = 0;
16779 +               }
16780 +       }
16781 +
16782 +       switch (ret) {
16783 +           case -E_REPEAT:
16784 +           case -EINVAL:
16785 +           case -E_DEADLOCK:
16786 +           case -E_NO_NEIGHBOR:
16787 +           case -ENOENT:
16788 +               /* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly
16789 +                  in each case.  They already are handled in many cases. */
16790 +               /* Something bad happened, but difficult to avoid...  Try again! */
16791 +               ON_TRACE(TRACE_FLUSH, "flush restartable failure: %ld\n", ret);
16792 +               ret = 0;
16793 +       }
16794 +
16795 +       if (leftmost_in_slum)
16796 +               jput(leftmost_in_slum);
16797 +
16798 +       pos_done(&flush_pos);
16799 +       scan_done(&left_scan);
16800 +       scan_done(&right_scan);
16801 +
16802 +       ON_DEBUG(atomic_dec(&flush_cnt));
16803 +
16804 +       write_syscall_log("ex");
16805 +
16806 +       leave_flush(sb);
16807 +
16808 +       if (!reiser4_is_set(sb, REISER4_MTFLUSH))
16809 +               up(&sbinfo->flush_sema);
16810 +
16811 +       return ret;
16812 +}
16813 +
16814 +/* The reiser4 flush subsystem can be turned into "rapid flush mode", which means
16815 + * that the flusher should submit all prepped nodes immediately without keeping
16816 + * them in flush queues for a long time.  The reason for rapid flush mode is to
16817 + * free memory as fast as possible. */
16818 +
16819 +#if REISER4_USE_RAPID_FLUSH
16820 +
16821 +/**
16822 + * Submit all prepped nodes through write_prepped_nodes() when wbq_available() is true;
16823 + * otherwise return 0.  NOTE(review): said to also turn rapid flush mode off -- confirm.
16824 + */
16825 +
16826 +static int rapid_flush (flush_pos_t * pos)
16827 +{
16828 +       int result = 0;
16829 +       if (wbq_available())
16830 +               result = write_prepped_nodes(pos, 1);
16831 +       return result;
16832 +}
16833 +
16834 +#else
16835 +
16836 +#define rapid_flush(pos) (0)
16837 +
16838 +#endif /* REISER4_USE_RAPID_FLUSH */
16839 +
16840 +/* Flush some nodes of the current atom (usually a slum).  Returns 0, with *atom still
16841 + * locked, when the dirty lists are empty and nothing was queued; returns -E_REPEAT, with
16842 + * the atom unlocked, when write_fq() returned 0; other errors are returned as they are. */
16843 +reiser4_internal int
16844 +flush_current_atom (int flags, long *nr_submitted, txn_atom ** atom)
16845 +{
16846 +       reiser4_super_info_data * sinfo = get_current_super_private();
16847 +       flush_queue_t *fq = NULL;
16848 +       jnode * node;
16849 +       int nr_queued;  /* number of RELOC nodes moved to @fq by the dirty-list loop below */
16850 +       int ret;
16851 +
16852 +       assert ("zam-889", atom != NULL && *atom != NULL);
16853 +       assert ("zam-890", spin_atom_is_locked(*atom));
16854 +       assert ("zam-892", get_current_context()->trans->atom == *atom);
16855 +
16856 +       while(1) {
16857 +               ret = fq_by_atom(*atom, &fq);
16858 +               if (ret != -E_REPEAT)
16859 +                       break;
16860 +               *atom = get_current_atom_locked();
16861 +       }
16862 +        if (ret)
16863 +               return ret;
16864 +
16865 +       assert ("zam-891", spin_atom_is_locked(*atom));
16866 +
16867 +       /* parallel flushers limit */
16868 +       if (sinfo->tmgr.atom_max_flushers != 0) {
16869 +               while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
16870 +                       /* An atom_send_event() call is inside fq_put_nolock() which is
16871 +                          called when flush is finished and nr_flushers is
16872 +                          decremented. */
16873 +                       atom_wait_event(*atom);
16874 +                       *atom = get_current_atom_locked();
16875 +               }
16876 +       }
16877 +
16878 +       /* count ourself as a flusher */
16879 +       (*atom)->nr_flushers++;
16880 +
16881 +       if (REISER4_LOG) {
16882 +               UNLOCK_ATOM(*atom);
16883 +               write_syscall_log("in");
16884 +               *atom = get_current_atom_locked();
16885 +       }
16886 +       reiser4_stat_inc(flush.flush);
16887 +       writeout_mode_enable();
16888 +
16889 +       nr_queued = 0;
16890 +
16891 +       /* In this loop we process all already prepped (RELOC or OVRWR) and dirtied again
16892 +        * nodes. The atom spin lock is not released until all dirty nodes processed or
16893 +        * not prepped node found in the atom dirty lists. */
16894 +       while ((node = find_first_dirty_jnode(*atom, flags))) {
16895 +               LOCK_JNODE(node);
16896 +
16897 +               assert ("zam-881", jnode_is_dirty(node));
16898 +               assert ("zam-898", !JF_ISSET(node, JNODE_OVRWR));
16899 +
16900 +               if (JF_ISSET(node, JNODE_WRITEBACK)) {
16901 +                       capture_list_remove_clean(node);
16902 +                       capture_list_push_back(ATOM_WB_LIST(*atom), node);
16903 +                       /*XXXX*/ON_DEBUG(count_jnode(*atom, node, DIRTY_LIST, WB_LIST, 1));
16904 +
16905 +               } else if (jnode_is_znode(node) && znode_above_root(JZNODE(node))) {
16906 +                       /* A special case for znode-above-root.  The above-root (fake)
16907 +                          znode is captured and dirtied when the tree height changes or
16908 +                          when the root node is relocated.  This causes atoms to fuse so
16909 +                          that changes at the root are serialized.  However, this node is
16910 +                          never flushed.  This special case used to be in lock.c to
16911 +                          prevent the above-root node from ever being captured, but now
16912 +                          that it is captured we simply prevent it from flushing.  The
16913 +                          log-writer code relies on this to properly log superblock
16914 +                          modifications of the tree height. */
16915 +                       jnode_make_wander_nolock(node);
16916 +               } else if (JF_ISSET(node, JNODE_RELOC)) {
16917 +                       queue_jnode(fq, node);
16918 +                       ++ nr_queued;
16919 +               } else
16920 +                       break;  /* not prepped yet: leave the loop with @node still jnode-locked */
16921 +
16922 +               UNLOCK_JNODE(node);
16923 +       }
16924 +
16925 +       if (node == NULL) {
16926 +               if (nr_queued == 0) {
16927 +                       writeout_mode_disable();
16928 +                       (*atom)->nr_flushers --;
16929 +                       atom_send_event(*atom);
16930 +                       fq_put_nolock(fq);
16931 +                       /* current atom remains locked */
16932 +                       return 0;
16933 +               }
16934 +               UNLOCK_ATOM(*atom);
16935 +       } else {
16936 +               jref(node);
16937 +               UNLOCK_ATOM(*atom);
16938 +               UNLOCK_JNODE(node);
16939 +               ret = jnode_flush(node, NULL, nr_submitted, fq, flags);
16940 +               jput(node);
16941 +       }
16942 +       /* NOTE(review): the jnode_flush() result kept in ret above is overwritten by write_fq() below. */
16943 +       /* trace_mark(flush); */
16944 +       write_current_logf(WRITE_IO_LOG, "mark=flush\n");
16945 +
16946 +       ret = write_fq(fq, nr_submitted, WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
16947 +
16948 +       *atom = get_current_atom_locked();
16949 +       (*atom)->nr_flushers --;
16950 +       fq_put_nolock(fq);
16951 +       atom_send_event(*atom);
16952 +       UNLOCK_ATOM(*atom);
16953 +
16954 +       writeout_mode_disable();
16955 +       write_syscall_log("ex");
16956 +
16957 +       if (ret == 0)
16958 +               ret = -E_REPEAT;
16959 +
16960 +       return ret;
16961 +}
16962 +
16963 +/* REVERSE PARENT-FIRST RELOCATION POLICIES */
16964 +
16965 +/* Is-it-close-enough-to-its-preceder? test used when relocating in the reverse
16966 +   parent-first context.  Only the preceder block (@pblk) and the node's own block
16967 +   (@nblk) are known here.  Because we walk in reverse, the preceder itself may yet be
16968 +   relocated, so asking the block allocator for a closer free block makes no sense at
16969 +   this point; the _forward_ parent-first context (not here) does consult the allocator.
16970 +   Returns 0 when the blocks are close enough (do not relocate), 1 otherwise. */
16971 +static int
16972 +reverse_relocate_if_close_enough(const reiser4_block_nr * pblk, const reiser4_block_nr * nblk)
16973 +{
16974 +       reiser4_block_nr gap;
16975 +
16976 +       assert("jmacd-7710", *pblk != 0 && *nblk != 0);
16977 +       assert("jmacd-7711", !blocknr_is_fake(pblk));
16978 +       assert("jmacd-7712", !blocknr_is_fake(nblk));
16979 +
16980 +       /* Absolute distance between the two block numbers: always subtract the
16981 +          smaller from the larger. */
16982 +       if (*pblk > *nblk)
16983 +               gap = *pblk - *nblk;
16984 +       else
16985 +               gap = *nblk - *pblk;
16986 +
16987 +       /* Relocate only when the gap exceeds flush.relocate_distance; a gap exactly
16988 +          equal to that limit still counts as close enough. */
16989 +       return gap > get_current_super_private()->flush.relocate_distance;
16990 +}
16991 +
16992 +/* This function is a predicate that tests for relocation.  Always called in the
16993 +   reverse-parent-first context, when we are asking whether the current node should be
16994 +   relocated in order to expand the flush by dirtying the parent level (and thus
16995 +   proceeding to flush that level).  When traversing in the forward parent-first direction
16996 +   (not here), relocation decisions are handled in two places: allocate_znode() and
16997 +   extent_needs_allocation().  Returns 1 if @node should be relocated, 0 otherwise. */
16998 +static int
16999 +reverse_relocate_test(jnode * node, const coord_t * parent_coord, flush_pos_t * pos)
17000 +{
17001 +       reiser4_block_nr pblk = 0;      /* block number of the preceder; 0 means unknown */
17002 +       reiser4_block_nr nblk = 0;      /* block number of @node */
17003 +
17004 +       assert("jmacd-8989", !jnode_is_root(node));
17005 +
17006 +       /*
17007 +        * This function is called only from the
17008 +        * reverse_relocate_check_dirty_parent() and only if the parent
17009 +        * node is clean. This implies that the parent has the real (i.e., not
17010 +        * fake) block number, and, so does the child, because otherwise the
17011 +        * parent would be dirty.
17012 +        */
17013 +
17014 +       /* New nodes are treated as if they are being relocated. */
17015 +       if (jnode_created(node)
17016 +           || (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) {
17017 +               return 1;
17018 +       }
17019 +
17020 +       /* Find the preceder.  FIXME(B): When the child is an unformatted, previously
17021 +          existing node, the coord may be leftmost even though the child is not the
17022 +          parent-first preceder of the parent.  If the first dirty node appears somewhere
17023 +          in the middle of the first extent unit, this preceder calculation is wrong.
17024 +          Needs more logic in here. */
17025 +       if (coord_is_leftmost_unit(parent_coord)) {
17026 +               pblk = *znode_get_block(parent_coord->node);
17027 +       } else {
17028 +               pblk = pos->preceder.blk;
17029 +       }
17030 +       check_preceder(pblk);
17031 +
17032 +       /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */
17033 +       if (pblk == 0) {
17034 +               return 1;
17035 +       }
17036 +
17037 +       nblk = *jnode_get_block(node);
17038 +
17039 +       if (blocknr_is_fake(&nblk))
17040 +               /* child is unallocated, mark parent dirty */
17041 +               return 1;
17042 +       /* Both block numbers are real here: decide purely on their distance. */
17043 +       return reverse_relocate_if_close_enough(&pblk, &nblk);
17044 +}
17045 +
17046 +/* This function calls reverse_relocate_test to make a reverse-parent-first relocation
17047 +   decision and then, if yes, it marks the parent dirty.  A clean parent is required for
       any work to happen.  Returns 0, or a negative value passed up from the test. */
17048 +static int
17049 +reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord, flush_pos_t * pos)
17050 +{
17051 +       int ret;
17052 +
17053 +       if (!znode_check_dirty(parent_coord->node)) {
17054 +
17055 +               ret = reverse_relocate_test(node, parent_coord, pos);
17056 +               if (ret < 0) {  /* defensive: the test as written here only returns 0 or 1 */
17057 +                       return ret;
17058 +               }
17059 +
17060 +               /* FIXME-ZAM
17061 +                  if parent is already relocated - we do not want to grab space, right? */
17062 +               if (ret == 1) {
17063 +                       int grabbed;    /* grabbed_blocks counter sampled before the forced grab */
17064 +
17065 +                       grabbed = get_current_context()->grabbed_blocks;
17066 +                       if (reiser4_grab_space_force((__u64)1, BA_RESERVED) != 0)
17067 +                           reiser4_panic("umka-1250",
17068 +                                         "No space left during flush.");
17069 +
17070 +                       assert("jmacd-18923", znode_is_write_locked(parent_coord->node));
17071 +                       znode_make_dirty(parent_coord->node);
17072 +                       grabbed2free_mark(grabbed);     /* presumably returns what was grabbed above the mark -- confirm */
17073 +               }
17074 +       }
17075 +
17076 +       return 0;
17077 +}
17078 +
17079 +/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD
17080 +   PARENT-FIRST LOOP BEGINS) */
17081 +
17082 +/* Get the leftmost child of the unit at @coord into *@child.  Returns 0 on success, else an error code. */
17083 +static int get_leftmost_child_of_unit (const coord_t * coord, jnode ** child)
17084 +{
17085 +       int ret;
17086 +
17087 +       ret = item_utmost_child(coord, LEFT_SIDE, child);
17088 +
17089 +       if (ret)
17090 +               return ret;
17091 +       /* the child pointer itself may also carry an encoded error: unpack it into the return value */
17092 +       if (IS_ERR(*child))
17093 +               return PTR_ERR(*child);
17094 +
17095 +       return 0;
17096 +}
17097 +
17098 +/* This step occurs after the left- and right-scans are completed, before starting the
17099 +   forward parent-first traversal.  Here we attempt to allocate ancestors of the starting
17100 +   flush point, which means continuing in the reverse parent-first direction to the
17101 +   parent, grandparent, and so on (as long as the child is a leftmost child).  This
17102 +   routine calls a recursive process, alloc_one_ancestor, which does the real work,
17103 +   except there is special-case handling here for the first ancestor, which may be a twig.
17104 +   At each level (here and alloc_one_ancestor), we check for relocation and then, if
17105 +   the child is a leftmost child, repeat at the next level.  On the way back down (the
17106 +   recursion), we allocate the ancestors in parent-first order.  Returns 0 or the first non-zero helper result. */
17107 +static int alloc_pos_and_ancestors(flush_pos_t * pos)
17108 +{
17109 +       int ret = 0;
17110 +       lock_handle plock;
17111 +       load_count pload;
17112 +       coord_t pcoord;
17113 +
17114 +       if (znode_check_flushprepped(pos->lock.node))
17115 +               return 0; /* the node at the flush position is already flushprepped: nothing to do */
17116 +
17117 +       ON_TRACE(TRACE_FLUSH_VERB, "flush alloc ancestors: %s\n", pos_tostring(pos));
17118 +
17119 +       coord_init_invalid(&pcoord, NULL);
17120 +       init_lh(&plock);
17121 +       init_load_count(&pload);
17122 +
17123 +       if (pos->state == POS_ON_EPOINT) {
17124 +               /* a special case for pos on twig level, where we already have
17125 +                  a lock on parent node. */
17126 +               /* The parent may not be dirty, in which case we should decide
17127 +                  whether to relocate the child now. If decision is made to
17128 +                  relocate the child, the parent is marked dirty. */
17129 +               ret = reverse_relocate_check_dirty_parent(pos->child, &pos->coord, pos);
17130 +               if (ret)
17131 +                       goto exit;
17132 +
17133 +               /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
17134 +                  is leftmost) and the leaf/child, so recursion is not needed.
17135 +                  Levels above the twig will be allocated for
17136 +                  write-optimization before the transaction commits.  */
17137 +
17138 +               /* Do the recursive step, allocating zero or more of our
17139 +                * ancestors. */
17140 +               ret = alloc_one_ancestor(&pos->coord, pos);
17141 +
17142 +       } else {
17143 +               if (!znode_is_root(pos->lock.node)) {
17144 +                       /* all formatted nodes except tree root */
17145 +                       ret = reiser4_get_parent(&plock, pos->lock.node, ZNODE_WRITE_LOCK, 0);
17146 +                       if (ret)
17147 +                               goto exit;
17148 +
17149 +                       ret = incr_load_count_znode(&pload, plock.node);
17150 +                       if (ret)
17151 +                               goto exit;
17152 +
17153 +                       ret = find_child_ptr(plock.node, pos->lock.node, &pcoord);
17154 +                       if (ret)
17155 +                               goto exit;
17156 +
17157 +                       ret = reverse_relocate_check_dirty_parent(ZJNODE(pos->lock.node), &pcoord, pos);
17158 +                       if (ret)
17159 +                               goto exit;
17160 +
17161 +                       ret = alloc_one_ancestor(&pcoord, pos);
17162 +                       if (ret)
17163 +                               goto exit;
17164 +               }
17165 +               /* for the tree root the block above is skipped, so @pcoord is still the invalid coord set earlier */
17166 +               ret = allocate_znode(pos->lock.node, &pcoord, pos);
17167 +       }
17168 +exit: /* common cleanup for every path after the handles were initialized */
17169 +       done_load_count(&pload);
17170 +       done_lh(&plock);
17171 +       return ret;
17172 +}
17173 +
17174 +/* This is the recursive step described in alloc_pos_and_ancestors, above.  Ignoring the
17175 +   call to set_preceder, which is the next function described, this checks if the
17176 +   child is a leftmost child and returns if it is not.  If the child is a leftmost child
17177 +   it checks for relocation, possibly dirtying the parent.  Then it performs the recursive
17178 +   step and finally allocates coord->node.  Returns 0 or a non-zero helper result. */
17179 +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos)
17180 +{
17181 +       int ret = 0;
17182 +       lock_handle alock;
17183 +       load_count aload;
17184 +       coord_t acoord;
17185 +
17186 +       /* As we ascend at the left-edge of the region to flush, take this opportunity at
17187 +          the twig level to find our parent-first preceder unless we have already set
17188 +          it. */
17189 +       if (pos->preceder.blk == 0) {
17190 +               ret = set_preceder(coord, pos);
17191 +               if (ret != 0)
17192 +                       return ret;
17193 +       }
17194 +
17195 +       /* If the ancestor is clean or already allocated, or if the child is not a
17196 +          leftmost child, stop going up, even leaving coord->node not flushprepped. */
17197 +       if (znode_check_flushprepped(coord->node)|| !coord_is_leftmost_unit(coord))
17198 +               return 0; /* returned before the handles below are initialized, so no cleanup is needed */
17199 +
17200 +       init_lh(&alock);
17201 +       init_load_count(&aload);
17202 +       coord_init_invalid(&acoord, NULL);
17203 +
17204 +       /* Only ascend to the next level if it is a leftmost child, but write-lock the
17205 +          parent in case we will relocate the child. */
17206 +       if (!znode_is_root(coord->node)) {
17207 +
17208 +               ret = jnode_lock_parent_coord(
17209 +                       ZJNODE(coord->node), &acoord, &alock, &aload, ZNODE_WRITE_LOCK, 0);
17210 +               if (ret != 0) {
17211 +                       /* FIXME(C): check EINVAL, E_DEADLOCK */
17212 +                       goto exit;
17213 +               }
17214 +
17215 +               ret = reverse_relocate_check_dirty_parent(ZJNODE(coord->node), &acoord, pos);
17216 +               if (ret != 0) {
17217 +                       goto exit;
17218 +               }
17219 +
17220 +               /* Recursive call, skipped when the parent is already flushprepped. */
17221 +               if (!znode_check_flushprepped(acoord.node)) {
17222 +                       ret = alloc_one_ancestor(&acoord, pos);
17223 +                       if (ret)
17224 +                               goto exit;
17225 +               }
17226 +       }
17227 +
17228 +       /* Note: we call allocate with the parent write-locked (except at the root) in
17229 +          case we relocate the child, in which case it will modify the parent during this
17230 +          call.  At the root @acoord is still the invalid coord set above. */
17231 +       ret = allocate_znode(coord->node, &acoord, pos);
17232 +
17233 +exit: /* common cleanup for the parent's load count and lock handle */
17234 +       done_load_count(&aload);
17235 +       done_lh(&alock);
17236 +       return ret;
17237 +}
17238 +
17239 +/* During the reverse parent-first alloc_pos_and_ancestors process described above there is
17240 +   a call to this function at the twig level.  During alloc_pos_and_ancestors we may ask:
17241 +   should this node be relocated (in reverse parent-first context)?  We repeat this
17242 +   process as long as the child is the leftmost child, eventually reaching an ancestor of
17243 +   the flush point that is not a leftmost child.  The preceder of that ancestor, which is
17244 +   not a leftmost child, is actually on the leaf level.  The preceder of that block is the
17245 +   left-neighbor of the flush point.  The preceder of that block is the rightmost child of
17246 +   the twig on the left.  So, when alloc_pos_and_ancestors passes upward through the twig
17247 +   level, it stops momentarily to remember the block of the rightmost child of the twig on
17248 +   the left and sets it to the flush_position's preceder_hint.
17249 +
17250 +   There is one other place where we may set the flush_position's preceder hint, which is
17251 +   during scan-left.  Returns 0 or a non-zero error; expected left-neighbor lookup failures are mapped to 0.
17252 +*/
17253 +static int
17254 +set_preceder(const coord_t * coord_in, flush_pos_t * pos)
17255 +{
17256 +       int ret;
17257 +       coord_t coord;
17258 +       lock_handle left_lock;
17259 +       load_count  left_load;
17260 +
17261 +#if 0
17262 +       /* do not trust to allocation of nodes above twigs, use the block number of last
17263 +        * write (write optimized approach). */
17264 +       if (znode_get_level(coord_in->node) > TWIG_LEVEL + 1) {
17265 +               get_blocknr_hint_default(&pos->preceder.blk);
17266 +               reiser4_stat_inc(block_alloc.nohint);
17267 +               return 0;
17268 +       }
17269 +#endif
17270 +
17271 +       coord_dup(&coord, coord_in); /* work on a private copy: @coord_in is const and @coord is moved below */
17272 +
17273 +       init_lh(&left_lock);
17274 +       init_load_count(&left_load);
17275 +
17276 +       /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test.
17277 +          coord_is_leftmost_unit is not the right test if the unformatted child is in the
17278 +          middle of the first extent unit. */
17279 +       if (!coord_is_leftmost_unit(&coord)) {
17280 +               coord_prev_unit(&coord); /* the preceding unit is in the same node */
17281 +       } else {
17282 +               ret = reiser4_get_left_neighbor(&left_lock, coord.node, ZNODE_READ_LOCK, GN_SAME_ATOM);
17283 +               if (ret) {
17284 +                       /* If we fail for any reason it doesn't matter because the
17285 +                          preceder is only a hint.  We are low-priority at this point, so
17286 +                          this must be the case. */
17287 +                       if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
17288 +                           ret == -ENOENT || ret == -EINVAL || ret == -E_DEADLOCK)
17289 +                       {
17290 +                               ret = 0;
17291 +                       }
17292 +                       goto exit;
17293 +               }
17294 +
17295 +               ret = incr_load_count_znode(&left_load, left_lock.node);
17296 +               if (ret)
17297 +                       goto exit;
17298 +
17299 +               coord_init_last_unit(&coord, left_lock.node); /* use the last unit of the left neighbor instead */
17300 +       }
17301 +       /* store the block of the chosen unit's RIGHT_SIDE utmost child directly into the preceder hint */
17302 +       ret = item_utmost_child_real_block(&coord, RIGHT_SIDE, &pos->preceder.blk);
17303 +exit:
17304 +       check_preceder(pos->preceder.blk);
17305 +       done_load_count(&left_load);
17306 +       done_lh(&left_lock);
17307 +       return ret;
17308 +}
17309 +
17310 +/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
17311 +
17312 +/* This procedure implements the outer loop of the flush algorithm.  To put this in
17313 +   context, here is the general list of steps taken by the flush routine as a whole:
17314 +
17315 +   1. Scan-left
17316 +   2. Scan-right (maybe)
17317 +   3. Allocate initial flush position and its ancestors
17318 +   4. <handle extents>
17319 +   5. <squeeze and allocate the next position and its ancestors to-the-right,
17320 +       then update position to-the-right>
17321 +   6. <repeat from #4 until flush is stopped>
17322 +
17323 +   This procedure implements the loop in steps 4 through 6 in the above listing.
17324 +
17325 +   Step 4: if the current flush position is an extent item (position on the twig level),
17326 +   it allocates the extent (allocate_extent_item_in_place) then shifts to the next
17327 +   coordinate.  If the next coordinate's leftmost child needs flushprep, we will continue.
17328 +   If the next coordinate is an internal item, we descend back to the leaf level,
17329 +   otherwise we repeat step #4 (labeled ALLOC_EXTENTS below).  If the "next coordinate"
17330 +   brings us past the end of the twig level, then we call
17331 +   reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to
17332 +   step #5 which moves to the right.
17333 +
17334 +   Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the
17335 +   tree to allocate any ancestors of the next-right flush position that are not also
17336 +   ancestors of the current position.  Those ancestors (in top-down order) are the next in
17337 +   parent-first order.  We squeeze adjacent nodes on the way up until the right node and
17338 +   current node share the same parent, then allocate on the way back down.  Finally, this
17339 +   step sets the flush position to the next-right node.  Then repeat steps 4 and 5.
17340 +*/
17341 +
17342 +/* SQUEEZE CODE */
17343 +
17344 +
17345 +/* squeeze_right_twig helper function: cut content from node to->node, from its first unit
17346 +   (and that unit's item key) up to coord @to / key @to_key.  @left is not used.  Returns cut_node_content()'s result. */
17347 +static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key, znode * left)
17348 +{
17349 +       coord_t from;
17350 +       reiser4_key from_key;
17351 +
17352 +       coord_init_first_unit(&from, to->node);
17353 +       item_key_by_coord(&from, &from_key);
17354 +
17355 +       return cut_node_content(&from, to, &from_key, to_key, NULL);
17356 +}
17357 +
17358 +/* squeeze_right_twig: copy as many of the leading extents as possible from @right to @left,
17359 +   allocating unallocated extents as they are copied.  Returns SQUEEZE_TARGET_FULL or
17360 +   SQUEEZE_SOURCE_EMPTY when no more can be shifted.  If the next item is an
17361 +   internal item it calls shift_one_internal_unit and returns its result (SUBTREE_MOVED
17362 +   or a negative error, per the exit assertion).  A negative write_prepped_nodes() result is returned too. */
17363 +squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *, reiser4_key *stop_key);
17364 +#if REISER4_DEBUG
17365 +void *shift_check_prepare(const znode *left, const znode *right);
17366 +void shift_check(void *vp, const znode *left, const znode *right);
17367 +#endif
17368 +static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos)
17369 +{
17370 +       int ret = SUBTREE_MOVED;
17371 +       coord_t coord;          /* used to iterate over items */
17372 +       reiser4_key stop_key;
17373 +
17374 +       assert("jmacd-2008", !node_is_empty(right));
17375 +       coord_init_first_unit(&coord, right);
17376 +
17377 +       DISABLE_NODE_CHECK;
17378 +
17379 +       ON_TRACE(TRACE_FLUSH_VERB, "sq_twig before copy extents: left %s\n", znode_tostring(left));
17380 +       ON_TRACE(TRACE_FLUSH_VERB, "sq_twig before copy extents: right %s\n", znode_tostring(right));
17381 +
17382 +       /* FIXME: can be optimized to cut once */
17383 +       while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
17384 +               ON_DEBUG(void *vp);
17385 +
17386 +               assert("vs-1468", coord_is_leftmost_unit(&coord));
17387 +               ON_DEBUG(vp = shift_check_prepare(left, coord.node));
17388 +
17389 +               /* stop_key is used to find what was copied and what to cut */
17390 +               stop_key = *min_key();
17391 +               ret = squalloc_extent(left, &coord, pos, &stop_key);
17392 +               if (ret != SQUEEZE_CONTINUE) {
17393 +                       ON_DEBUG(reiser4_kfree(vp));
17394 +                       break;
17395 +               }
17396 +               assert("vs-1465", !keyeq(&stop_key, min_key()));
17397 +
17398 +               /* Step the stop key's offset back by one and cut up to it with the helper
                     (presumably stop_key is the first key not copied - confirm). */
17399 +               set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
17400 +               check_me("vs-1466", squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
17401 +
17402 +               ON_DEBUG(shift_check(vp, left, coord.node));
17403 +       }
17404 +
17405 +       if (node_is_empty(coord.node))
17406 +               ret = SQUEEZE_SOURCE_EMPTY;
17407 +
17408 +       ENABLE_NODE_CHECK;
17409 +       node_check(left, REISER4_NODE_DKEYS);
17410 +       node_check(right, REISER4_NODE_DKEYS);
17411 +
17412 +       if (ret == SQUEEZE_TARGET_FULL) {
17413 +               goto out;
17414 +       }
17415 +
17416 +       if (node_is_empty(right)) {
17417 +               /* The whole right node was copied into @left. */
17418 +               ON_TRACE(TRACE_FLUSH_VERB, "sq_twig right node empty: %s\n", znode_tostring(right));
17419 +               assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
17420 +               goto out;
17421 +       }
17422 +
17423 +       coord_init_first_unit(&coord, right);
17424 +
17425 +       if (!item_is_internal(&coord)) {
17426 +               /* we do not want to squeeze anything else to left neighbor because "slum"
17427 +                  is over */
17428 +               ret = SQUEEZE_TARGET_FULL;
17429 +               goto out;
17430 +       }
17431 +       assert("jmacd-433", item_is_internal(&coord));
17432 +
17433 +       /* Shift an internal unit.  The child must be allocated before shifting any more
17434 +          extents, so we stop here. */
17435 +       ret = shift_one_internal_unit(left, right);
17436 +
17437 +out:
17438 +       assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
17439 +              || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
17440 +
17441 +       if (ret == SQUEEZE_TARGET_FULL) {
17442 +               /* We submit prepped nodes here and expect that this @left twig
17443 +                * will not be modified again during this jnode_flush() call. */
17444 +               int ret1;
17445 +
17446 +               /* NOTE: seems like io is done under long term locks. */
17447 +               ret1 = write_prepped_nodes(pos, 1);
17448 +               if (ret1 < 0)
17449 +                       return ret1; /* a write error replaces SQUEEZE_TARGET_FULL as the result */
17450 +       }
17451 +
17452 +       return ret;
17453 +}
17454 +
17455 +/* This is a special node method which scans the items of a leaf-level @node and checks, for each
17456 +   one, whether the flush squeeze item method needs to be applied. This item method
17457 +   may resize/kill the item, and also may change the tree.  Every exit clears JNODE_SQUEEZABLE and dirties @node.
17458 +*/
17459 +static int squeeze_node(flush_pos_t * pos, znode * node)
17460 +{
17461 +       int ret = 0;
17462 +
17463 +       item_plugin * iplug;
17464 +
17465 +       assert("edward-304", pos != NULL);
17466 +       assert("edward-305", pos->child == NULL);
17467 +       assert("edward-475", znode_squeezable(node));
17468 +       assert("edward-669", znode_is_wlocked(node));
17469 +
17470 +       if (znode_get_level(node) != LEAF_LEVEL)
17471 +               /* do not squeeze this node */
17472 +               goto exit;
17473 +
17474 +       coord_init_first_unit(&pos->coord, node);
17475 +
17476 +       while (1) {
17477 +               ret = 0;
17478 +
17479 +               if (node_is_empty(node))
17480 +                       /* nothing to squeeze */
17481 +                       goto exit;
17482 +               if (pos->sq && item_squeeze_data(pos)) {
17483 +                       iplug = item_squeeze_plug(pos); /* squeeze data is pending: keep using its plugin */
17484 +                       assert("edward-476", iplug->f.squeeze != NULL);
17485 +               }
17486 +               else if (!coord_is_existing_item(&pos->coord))
17487 +                       /* finished this node */
17488 +                       break;
17489 +               else {
17490 +                       iplug = item_plugin_by_coord(&pos->coord);
17491 +                       if (pos->sq && item_squeeze_plug(pos) != iplug)
17492 +                               set_item_squeeze_count(pos, 0); /* the item plugin changed: reset the squeeze count */
17493 +               }
17494 +               assert("edward-844", iplug != NULL);
17495 +               if (iplug->f.squeeze == NULL)
17496 +                       /* unsqueezable */
17497 +                       goto next;
17498 +
17499 +               ret = iplug->f.squeeze(pos);
17500 +
17501 +               if (ret == -E_REPEAT)
17502 +                       continue; /* retry; ret is reset to 0 at the top of the loop */
17503 +               if (ret)
17504 +                       goto exit;
17505 +
17506 +               assert("edward-307", pos->child == NULL);
17507 +
17508 +               /* now we should check if item_squeeze_data is valid, and if so,
17509 +                  call previous method again, BUT if current item is last
17510 +                  and mergeable with the first item of slum right neighbor,
17511 +                  we set idata->mergeable = 1, go to slum right neighbor
17512 +                  and continue squeezing using this info
17513 +               */
17514 +       next:
17515 +               if (coord_next_item(&pos->coord)) {
17516 +                       /* node is over */
17517 +                       lock_handle right_lock;
17518 +                       load_count right_load;
17519 +                       coord_t coord;
17520 +
17521 +                       if (!pos->sq || !item_squeeze_data(pos))
17522 +                               break;
17523 +
17524 +                       init_lh(&right_lock);
17525 +                       init_load_count(&right_load);
17526 +
17527 +                       /* check for slum right neighbor */
17528 +                       ret = neighbor_in_slum(node, &right_lock, RIGHT_SIDE, ZNODE_WRITE_LOCK);
17529 +                       if (ret == -E_NO_NEIGHBOR)
17530 +                               /* no neighbor, repeat on this node */
17531 +                               continue;
17532 +                       else if (ret)
17533 +                               goto exit;
17534 +                       ret = incr_load_count_znode(&right_load, right_lock.node);
17535 +                       if (ret) {
17536 +                               done_lh(&right_lock);
17537 +                               break; /* leaves the loop with the non-zero ret, which is returned */
17538 +                       }
17539 +                       coord_init_after_item_end(&pos->coord);
17540 +                       coord_init_before_first_item(&coord, right_lock.node);
17541 +
17542 +                       if (iplug->b.mergeable(&pos->coord, &coord)) {
17543 +                               /* go to slum right neighbor */
17544 +                               item_squeeze_data(pos)->mergeable = 1;
17545 +                               done_load_count(&right_load);
17546 +                               done_lh(&right_lock);
17547 +                               break;
17548 +                       }
17549 +                       /* first item of right neighbor is not mergeable,
17550 +                          repeat this node */
17551 +                       done_load_count(&right_load);
17552 +                       done_lh(&right_lock);
17553 +               }
17554 +       }
17555 + exit: /* reached by every goto and every break above */
17556 +       JF_CLR(ZJNODE(node), JNODE_SQUEEZABLE);
17557 +       znode_make_dirty(node);
17558 +       return ret;
17559 +}
17560 +
17561 +/* Squeeze and allocate the right neighbor.  This is called after @left and
17562 +   its current children have been squeezed and allocated already.  This
17563 +   procedure's job is to squeeze items from @right to @left.
17564 +
17565 +   If at the leaf level, use the shift_everything_left memcpy-optimized
17566 +   version of shifting (in the code below: the default case, squeeze_right_non_twig).
17567 +
17568 +   If at the twig level, extents are allocated as they are shifted from @right
17569 +   to @left (squeeze_right_twig).
17570 +
17571 +   At any other level (also the default case, squeeze_right_non_twig), shift one
17572 +   internal item and return to the caller so that the shifted-subtree can be processed in
17573 +   parent-first order.
17574 +
17575 +   When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is
17576 +   returned.  When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
17577 +   returned.  If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
17578 +   is returned.  A negative value is an error.
17579 +*/
17580 +
17581 +static int squeeze_right_neighbor(flush_pos_t * pos, znode * left, znode * right)
17582 +{
17583 +       int ret;
17584 +
17585 +       /* FIXME it is possible to see empty hasn't-heard-banshee node in a
17586 +        * tree owing to error (for example, ENOSPC) in write */
17587 +       /* assert("jmacd-9321", !node_is_empty(left)); */
17588 +       assert("jmacd-9322", !node_is_empty(right));
17589 +       assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
17590 +
17591 +       ON_TRACE(TRACE_FLUSH_VERB, "sq_rn[%u] left  %s\n", znode_get_level(left), znode_tostring(left));
17592 +       ON_TRACE(TRACE_FLUSH_VERB, "sq_rn[%u] right %s\n", znode_get_level(left), znode_tostring(right));
17593 +
17594 +       switch (znode_get_level(left)) {
17595 +       case TWIG_LEVEL:
17596 +               /* Shift with extent allocating until either an internal item
17597 +                  is encountered or everything is shifted or no free space
17598 +                  left in @left */
17599 +               ret = squeeze_right_twig(left, right, pos);
17600 +               break;
17601 +
17602 +       default:
17603 +               /* All other levels can use shift_everything until we implement per-item
17604 +                  flush plugins. */
17605 +               ret = squeeze_right_non_twig(left, right);
17606 +               break;
17607 +       }
17608 +
17609 +       assert("jmacd-2011", (ret < 0 ||
17610 +                             ret == SQUEEZE_SOURCE_EMPTY || ret == SQUEEZE_TARGET_FULL || ret == SUBTREE_MOVED));
17611 +
17612 +       if (ret == SQUEEZE_SOURCE_EMPTY) {
17613 +               reiser4_stat_inc(flush.squeezed_completely);
17614 +       }
17615 +
17616 +       ON_TRACE(TRACE_FLUSH_VERB, "sq_rn[%u] returns %s: left %s\n",
17617 +                znode_get_level(left),
17618 +                (ret == SQUEEZE_SOURCE_EMPTY) ? "src empty" :
17619 +                ((ret == SQUEEZE_TARGET_FULL) ? "tgt full" :
17620 +                 ((ret == SUBTREE_MOVED) ? "tree moved" : "error")), znode_tostring(left));
17621 +       return ret;
17622 +}
17623 +/* Squeeze @right into the twig at pos->lock.node, then reposition pos->coord in that twig according to the result. */
17624 +static int squeeze_right_twig_and_advance_coord (flush_pos_t * pos, znode * right)
17625 +{
17626 +       int ret;
17627 +
17628 +       ret = squeeze_right_twig(pos->lock.node, right, pos);
17629 +       if (ret < 0)
17630 +               return ret; /* error: pos->coord is left untouched */
17631 +       if (ret > 0) {
17632 +               coord_init_after_last_item(&pos->coord, pos->lock.node);
17633 +               return ret; /* positive squeeze result is passed through to the caller */
17634 +       }
17635 +       /* ret == 0 (presumably SUBTREE_MOVED - confirm against the squeeze_result values) */
17636 +       coord_init_last_unit(&pos->coord, pos->lock.node);
17637 +       return 0;
17638 +}
17639 +
17640 +#if 0
17641 +/* "prepped" check for parent node without long-term locking it; compiled out by the enclosing "#if 0" */
17642 +static inline int fast_check_parent_flushprepped (znode * node)
17643 +{
17644 +       reiser4_tree * tree = current_tree;
17645 +       int prepped = 1;
17646 +
17647 +       RLOCK_TREE(tree);
17648 +
17649 +       if (node->in_parent.node || !jnode_is_flushprepped(ZJNODE(node)))
17650 +               prepped = 0;
17651 +
17652 +       RUNLOCK_TREE(tree);
17653 +
17654 +       return prepped;
17655 +}
17656 +#endif
17657 +
17658 +/* forward declaration */
17659 +static int squalloc_upper_levels (flush_pos_t *, znode *, znode *);
17660 +
17661 +/* Do a fast check for the "same parents" condition (znode_same_parents) before calling
17662 + * squalloc_upper_levels(); returns 0 when the parents are the same, else that call's result. */
17663 +static inline int check_parents_and_squalloc_upper_levels (flush_pos_t * pos, znode *left, znode * right)
17664 +{
17665 +       if (znode_same_parents(left, right))
17666 +               return 0;
17667 +
17668 +       return squalloc_upper_levels(pos, left, right);
17669 +}
17670 +
17671 +/* Check whether the parent of the given @right node needs to be processed
17672 +   ((re)allocated) prior to processing of the child.  If @left and @right do not
17673 +   share a parent, the parent of @right is after @left but before
17674 +   @right in parent-first order, so we have to (re)allocate it before @right
17675 +   gets (re)allocated.  Returns the last helper result: 0, a negative error, or a positive squeeze result. */
17676 +static int squalloc_upper_levels (flush_pos_t * pos, znode *left, znode * right)
17677 +{
17678 +       int ret;
17679 +
17680 +       lock_handle left_parent_lock;
17681 +       lock_handle right_parent_lock;
17682 +
17683 +       load_count left_parent_load;
17684 +       load_count right_parent_load;
17685 +
17686 +
17687 +       init_lh(&left_parent_lock);
17688 +       init_lh(&right_parent_lock);
17689 +
17690 +       init_load_count(&left_parent_load);
17691 +       init_load_count(&right_parent_load);
17692 +
17693 +       ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK, 0);
17694 +       if (ret)
17695 +               goto out;
17696 +
17697 +       ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK, 0);
17698 +       if (ret)
17699 +               goto out;
17700 +
17701 +       /* Check for same parents; ret is 0 here, so 0 is returned. */
17702 +       if (left_parent_lock.node == right_parent_lock.node)
17703 +               goto out;
17704 +
17705 +       if (znode_check_flushprepped(right_parent_lock.node)) {
17706 +               /* Keep parent-first order.  In the order, the right parent node stands
17707 +                  before the @right node.  If it is already allocated, we set the
17708 +                  preceder (next block search start point) to its block number, @right
17709 +                  node should be allocated after it.
17710 +
17711 +                  However, preceder is set only if the right parent is on twig level.
17712 +                  The explanation is the following: new branch nodes are allocated over
17713 +                  already allocated children while the tree grows, it is difficult to
17714 +                  keep tree ordered, we assume that only leaves and twigs are correctly
17715 +                  allocated.  So, only twigs are used as a preceder for allocating of the
17716 +                  rest of the slum. */
17717 +               if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
17718 +                       pos->preceder.blk = *znode_get_block(right_parent_lock.node);
17719 +                       check_preceder(pos->preceder.blk);
17720 +               }
17721 +               goto out;
17722 +       }
17723 +
17724 +       ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
17725 +       if (ret)
17726 +               goto out;
17727 +
17728 +       ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
17729 +       if (ret)
17730 +               goto out;
17731 +
17732 +       ret = squeeze_right_neighbor(pos, left_parent_lock.node, right_parent_lock.node);
17733 +       /* We stop if error. We stop if some items/units were shifted (ret == 0)
17734 +        * and thus @right changed its parent. It means we do not have to process
17735 +        * the right_parent node prior to processing of @right. Positive return
17736 +        * values say that shifting stopped because of "empty
17737 +        * source" or "target full" conditions. */
17738 +       if (ret <= 0)
17739 +               goto out;
17740 +
17741 +       /* parent(@left) and parent(@right) may have different parents also. We
17742 +        * do a recursive call for checking that. */
17743 +       ret = check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node, right_parent_lock.node);
17744 +       if (ret)
17745 +               goto out;
17746 +
17747 +       /* allocate znode when going down */
17748 +       ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
17749 +
17750 + out: /* common cleanup for both parents' load counts and lock handles */
17751 +       done_load_count(&left_parent_load);
17752 +       done_load_count(&right_parent_load);
17753 +
17754 +       done_lh(&left_parent_lock);
17755 +       done_lh(&right_parent_lock);
17756 +
17757 +       return ret;
17758 +}
17759 +
17760 +/* Check the "flushprepped" status of the leftmost child of the unit at @coord; also returns true (1) if the
17761 + * child node was not found in cache.  A non-zero get_leftmost_child_of_unit() result is returned as is.  */
17762 +static int leftmost_child_of_unit_check_flushprepped (const coord_t *coord)
17763 +{
17764 +       int ret;
17765 +       int prepped;
17766 +
17767 +       jnode * child;
17768 +
17769 +       ret = get_leftmost_child_of_unit(coord, &child);
17770 +
17771 +       if (ret)
17772 +               return ret;
17773 +
17774 +       if (child) {
17775 +               prepped = jnode_check_flushprepped(child);
17776 +               jput(child); /* drop the child once its status has been read */
17777 +       } else {
17778 +               /* We consider not existing child as a node which slum
17779 +                  processing should not continue to.  Not cached node is clean,
17780 +                  so it is flushprepped. */
17781 +               prepped = 1;
17782 +       }
17783 +
17784 +       return prepped;
17785 +}
17786 +
17787 +/* (Re)allocate @node for flush position @pos, finding the parent automatically: write-lock and load the parent, locate the pointer to @node in it and pass that coord to allocate_znode(). @node must already be write-locked (asserted). Returns the first non-zero result of a failed step, otherwise the result of allocate_znode(). */
17788 +static int lock_parent_and_allocate_znode (znode * node, flush_pos_t * pos)
17789 +{
17790 +       int ret;
17791 +       lock_handle parent_lock;
17792 +       load_count parent_load;
17793 +       coord_t pcoord;
17794 +
17795 +       assert ("zam-851", znode_is_write_locked(node));
17796 +
17797 +       init_lh(&parent_lock);
17798 +       init_load_count(&parent_load);
17799 +
17800 +       ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK, 0);
17801 +       if (ret)
17802 +               goto out;
17803 +
17804 +       ret = incr_load_count_znode(&parent_load, parent_lock.node);
17805 +       if (ret)
17806 +               goto out;
17807 +
17808 +       ret = find_child_ptr(parent_lock.node, node, &pcoord); /* pcoord: position of the pointer to @node inside the parent */
17809 +       if (ret)
17810 +               goto out;
17811 +
17812 +       ret = allocate_znode(node, &pcoord, pos);
17813 +
17814 + out: /* common exit: the parent load count and lock handle are released on every path */
17815 +       done_load_count(&parent_load);
17816 +       done_lh(&parent_lock);
17817 +       return ret;
17818 +}
17819 +
17820 +/* Process formatted nodes to the right of pos->lock.node on the current level; shared by
17821 + * handle_pos_on_leaf() and handle_pos_on_internal(). Returns the result of the step that ended the loop.  */
17822 +static int handle_pos_on_formatted (flush_pos_t * pos)
17823 +{
17824 +       int ret;
17825 +       lock_handle right_lock;
17826 +       load_count right_load;
17827 +
17828 +       init_lh(&right_lock);
17829 +       init_load_count(&right_load);
17830 +
17831 +       check_pos(pos);
17832 +       if (znode_squeezable(pos->lock.node)) {
17833 +               ret = squeeze_node(pos, pos->lock.node);
17834 +               check_pos(pos);
17835 +               if (ret)
17836 +                       return ret; /* right_lock/right_load have only been initialized at this point */
17837 +       }
17838 +
17839 +       while (1) {
17840 +               check_pos(pos);
17841 +               ret = neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE, ZNODE_WRITE_LOCK);
17842 +               if (ret)
17843 +                       break;
17844 +
17845 +               /* We don't prep (allocate) nodes for flushing twice.  This can be suboptimal, or it
17846 +                * can be optimal.  For now we choose to live with the risk that it will
17847 +                * be suboptimal because it would be quite complex to code it to be
17848 +                * smarter. */
17849 +               if (znode_check_flushprepped(right_lock.node) && !znode_squeezable(right_lock.node)) {
17850 +                       pos_stop(pos);
17851 +                       break;
17852 +               }
17853 +
17854 +               ret = incr_load_count_znode(&right_load, right_lock.node);
17855 +               if (ret)
17856 +                       break;
17857 +
17858 +               if (znode_squeezable(right_lock.node)) {
17859 +                       ret = squeeze_node(pos, right_lock.node);
17860 +                       check_pos(pos);
17861 +                       if (ret)
17862 +                               break;
17863 +                       if (node_is_empty(right_lock.node)) {
17864 +                               /* node was squeezed completely: release it and look up the right neighbor again */
17865 +                               done_load_count(&right_load);
17866 +                               done_lh(&right_lock);
17867 +                               continue;
17868 +                       }
17869 +               }
17870 +
17871 +                /* squeeze _before_ going upward. */
17872 +               ret = squeeze_right_neighbor(pos, pos->lock.node, right_lock.node);
17873 +               check_pos(pos);
17874 +               if (ret < 0) /* only negative values end the loop here; zero and positive results continue */
17875 +                       break;
17876 +
17877 +               if (znode_check_flushprepped(right_lock.node)) {
17878 +                       pos_stop(pos);
17879 +                       break;
17880 +               }
17881 +
17882 +               if (node_is_empty(right_lock.node)) {
17883 +                       /* repeat with a fresh right neighbor if the right node was squeezed completely */
17884 +                       done_load_count(&right_load);
17885 +                       done_lh(&right_lock);
17886 +                       continue;
17887 +               }
17888 +
17889 +               /* parent(right_lock.node) has to be processed before
17890 +                * (right_lock.node) due to "parent-first" allocation order. */
17891 +               ret = check_parents_and_squalloc_upper_levels(pos, pos->lock.node, right_lock.node);
17892 +               check_pos(pos);
17893 +               if (ret)
17894 +                       break;
17895 +               /* (re)allocate _after_ going upward */
17896 +               ret = lock_parent_and_allocate_znode(right_lock.node, pos);
17897 +               check_pos(pos);
17898 +               if (ret)
17899 +                       break;
17900 +
17901 +               if (should_terminate_squalloc(pos)) {
17902 +                       set_item_squeeze_count(pos, 0);
17903 +                       break;
17904 +               }
17905 +               /* advance the flush position to the right neighbor */
17906 +               move_flush_pos(pos, &right_lock, &right_load, NULL);
17907 +
17908 +               ret = rapid_flush(pos);
17909 +               check_pos(pos);
17910 +               if (ret)
17911 +                       break;
17912 +       }
17913 +       check_pos(pos);
17914 +
17915 +       done_load_count(&right_load);
17916 +       done_lh(&right_lock);
17917 +
17918 +       /* This function indicates via pos whether to stop, go to twig level, or continue on the
17919 +        * current level. */
17920 +       return ret;
17921 +
17922 +}
17923 +
17924 +/* Handler for POS_ON_LEAF: process formatted nodes on leaf level via handle_pos_on_formatted().
17925 + * If that reports -E_NO_NEIGHBOR, switch pos->state to POS_TO_TWIG and return 0; any other result is returned unchanged.  */
17926 +static int handle_pos_on_leaf (flush_pos_t * pos)
17927 +{
17928 +       int ret;
17929 +
17930 +       assert ("zam-845", pos->state == POS_ON_LEAF);
17931 +
17932 +       ret = handle_pos_on_formatted(pos);
17933 +
17934 +       if (ret == -E_NO_NEIGHBOR) {
17935 +               /* cannot get right neighbor, go to the twig level to process extents. */
17936 +               pos->state = POS_TO_TWIG;
17937 +               return 0;
17938 +       }
17939 +
17940 +       return ret;
17941 +}
17942 +
17943 +/* Handler for POS_ON_INTERNAL: process the slum on level > 1 by delegating to handle_pos_on_formatted() and returning its result. */
17944 +static int handle_pos_on_internal (flush_pos_t * pos)
17945 +{
17946 +       assert ("zam-850", pos->state == POS_ON_INTERNAL);
17947 +       return handle_pos_on_formatted(pos);
17948 +}
17949 +
17950 +/* Check whether squalloc should stop before processing the extent at pos->coord. Also sets pos->pos_in_unit and consumes pos->child if it is set. Returns non-zero when processing should stop (or when the leftmost-child lookup fails), 0 to continue. */
17951 +static int squalloc_extent_should_stop (flush_pos_t * pos)
17952 +{
17953 +       assert("zam-869", item_is_extent(&pos->coord));
17954 +
17955 +       /* pos->child, when set, is the jnode handle_pos_on_extent() should start with
17956 +        * instead of the first child of the first extent unit. */
17957 +       if (pos->child) {
17958 +               int prepped;
17959 +
17960 +               assert("vs-1383", jnode_is_unformatted(pos->child));
17961 +               prepped = jnode_check_flushprepped(pos->child);
17962 +               pos->pos_in_unit = jnode_get_index(pos->child) - extent_unit_index(&pos->coord); /* offset of the child within the current extent unit */
17963 +               assert("vs-1470", pos->pos_in_unit < extent_unit_width(&pos->coord));
17964 +               assert("nikita-3434", ergo(extent_is_unallocated(&pos->coord),
17965 +                                          pos->pos_in_unit == 0));
17966 +               jput(pos->child); /* the child is consumed here: put it and clear the pointer */
17967 +               pos->child = NULL;
17968 +
17969 +               return prepped;
17970 +       }
17971 +
17972 +       pos->pos_in_unit = 0;
17973 +       if (extent_is_unallocated(&pos->coord)) /* an unallocated extent never stops squalloc here */
17974 +               return 0;
17975 +
17976 +       return leftmost_child_of_unit_check_flushprepped(&pos->coord);
17977 +}
17978 +
17979 +int alloc_extent(flush_pos_t *flush_pos); /* forward declaration, used by handle_pos_on_twig() below */
17980 +
17981 +/* Handler for POS_ON_EPOINT. Handles the case when the regular reiser4 tree (znodes
17982 + * connected to their neighbors by sibling pointers) is interrupted on leaf level by one or more
17983 + * unformatted nodes.  By holding a lock on twig level and using extent code
17984 + * routines to process unformatted nodes we swim around an irregular part of
17985 + * the reiser4 tree. */
17986 +static int handle_pos_on_twig (flush_pos_t * pos)
17987 +{
17988 +       int ret;
17989 +
17990 +       assert ("zam-844", pos->state == POS_ON_EPOINT);
17991 +       assert ("zam-843", item_is_extent(&pos->coord));
17992 +
17993 +       check_pos(pos);
17994 +       /* Decide whether to continue slum processing with the current extent
17995 +          unit: if the leftmost child of the current extent unit is flushprepped
17996 +          (i.e. clean or already processed by flush) we stop squalloc().  There
17997 +          is a fast check for unallocated extents, which we assume contain only
17998 +          nodes that are not flushprepped. */
17999 +       /* FIXME: Only a simple check is implemented here; we look only at the
18000 +          leftmost child. */
18001 +       ret = squalloc_extent_should_stop(pos);
18002 +       if (ret != 0) {
18003 +               pos_stop(pos);
18004 +               return ret;
18005 +       }
18006 +
18007 +       while (pos_valid(pos) && coord_is_existing_unit(&pos->coord) && item_is_extent(&pos->coord)) {
18008 +               check_pos(pos);
18009 +               ret = alloc_extent(pos);
18010 +               check_pos(pos);
18011 +               if (ret) {
18012 +                       break; /* NOTE(review): every path below returns 0, so this non-zero ret is never returned to the caller -- confirm this is intended */
18013 +               }
18014 +               coord_next_unit(&pos->coord);
18015 +       }
18016 +
18017 +       if (coord_is_after_rightmost(&pos->coord)) {
18018 +               pos->state = POS_END_OF_TWIG;
18019 +               return 0;
18020 +       }
18021 +       if (item_is_internal(&pos->coord)) {
18022 +               pos->state = POS_TO_LEAF;
18023 +               return 0;
18024 +       }
18025 +
18026 +       assert ("zam-860", item_is_extent(&pos->coord));
18027 +
18028 +       check_pos(pos);
18029 +       /* "slum" is over */
18030 +       pos->state = POS_INVALID;
18031 +       return 0;
18032 +}
18033 +
18034 +/* Handler for POS_END_OF_TWIG. When we are about to return the flush position from twig to leaf
18035 + * level we can process the right twig node or move the position to the leaf.  This processes the
18036 + * right twig if it is possible and jumps to leaf level if not. */
18037 +static int handle_pos_end_of_twig (flush_pos_t * pos)
18038 +{
18039 +       int ret;
18040 +       lock_handle right_lock;
18041 +       load_count right_load;
18042 +       coord_t at_right;
18043 +       jnode * child = NULL; /* set only in the clean-twig branch; put at "out" when non-NULL */
18044 +
18045 +
18046 +       assert ("zam-848", pos->state == POS_END_OF_TWIG);
18047 +       assert ("zam-849", coord_is_after_rightmost(&pos->coord));
18048 +
18049 +       init_lh(&right_lock);
18050 +       init_load_count(&right_load);
18051 +
18052 +       check_pos(pos);
18053 +       /* We get a lock on the right twig node even if it is not dirty because
18054 +        * the slum continues or discontinues on leaf level, not on the next twig. This
18055 +        * lock on the right twig is needed for getting its leftmost child. */
18056 +       ret = reiser4_get_right_neighbor(&right_lock, pos->lock.node, ZNODE_WRITE_LOCK, GN_SAME_ATOM);
18057 +       if (ret)
18058 +               goto out;
18059 +
18060 +       ret = incr_load_count_znode(&right_load, right_lock.node);
18061 +       if (ret)
18062 +               goto out;
18063 +
18064 +       /* the right twig may not be dirty */
18065 +       if (znode_check_dirty(right_lock.node)) {
18066 +               /* If the right twig node is dirty we always attempt to squeeze its
18067 +                * content to the left... */
18068 +became_dirty:
18069 +               check_pos(pos);
18070 +               ret = squeeze_right_twig_and_advance_coord(pos, right_lock.node);
18071 +               check_pos(pos);
18072 +               if (ret <=0) {
18073 +                       /* pos->coord is on an internal item, go to leaf level, or
18074 +                        * we have an error which will be caught in squalloc() */
18075 +                       pos->state = POS_TO_LEAF;
18076 +                       goto out;
18077 +               }
18078 +
18079 +               /* If the right twig was squeezed completely we have to re-lock
18080 +                * the right twig. For now this is done through the top-level squalloc
18081 +                * routine. */
18082 +               if (node_is_empty(right_lock.node))
18083 +                       goto out;
18084 +
18085 +               /* ... and prep it if it is not yet prepped */
18086 +               if (!znode_check_flushprepped(right_lock.node)) {
18087 +                       /* As usual, process parent before ...*/
18088 +                       ret = check_parents_and_squalloc_upper_levels(pos, pos->lock.node, right_lock.node);
18089 +                       check_pos(pos);
18090 +                       if (ret)
18091 +                               goto out;
18092 +
18093 +                       /* ... processing the child */
18094 +                       ret = lock_parent_and_allocate_znode(right_lock.node, pos);
18095 +                       check_pos(pos);
18096 +                       if (ret)
18097 +                               goto out;
18098 +               }
18099 +       } else {
18100 +               coord_init_first_unit(&at_right, right_lock.node);
18101 +
18102 +               /* check the first child of the next twig: should we continue there? */
18103 +               ret = get_leftmost_child_of_unit(&at_right, &child);
18104 +               if (ret || child == NULL || jnode_check_flushprepped(child)) {
18105 +                       pos_stop(pos);
18106 +                       goto out;
18107 +               }
18108 +
18109 +               /* check the clean twig for possible relocation */
18110 +               if (!znode_check_flushprepped(right_lock.node)) {
18111 +                       check_pos(pos);
18112 +                       ret = reverse_relocate_check_dirty_parent(child, &at_right, pos);
18113 +                       check_pos(pos);
18114 +                       if (ret)
18115 +                               goto out;
18116 +                       if (znode_check_dirty(right_lock.node))
18117 +                               goto became_dirty; /* the twig is dirty now: handle it in the dirty branch above */
18118 +               }
18119 +       }
18120 +
18121 +       assert ("zam-875", znode_check_flushprepped(right_lock.node));
18122 +
18123 +       /* Update the preceder with the block number of the just processed right twig
18124 +        * node. The code above could miss the preceder update because
18125 +        * allocate_znode() may not have been called for this node. */
18126 +       pos->preceder.blk = *znode_get_block(right_lock.node);
18127 +       check_preceder(pos->preceder.blk);
18128 +
18129 +       coord_init_first_unit(&at_right, right_lock.node);
18130 +       assert("zam-868", coord_is_existing_unit(&at_right));
18131 +
18132 +       pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF; /* next state depends on the first item of the right twig */
18133 +       move_flush_pos(pos, &right_lock, &right_load, &at_right);
18134 +
18135 + out:
18136 +       check_pos(pos);
18137 +       done_load_count(&right_load);
18138 +       done_lh(&right_lock);
18139 +
18140 +       if (child)
18141 +               jput(child);
18142 +
18143 +       return ret;
18144 +}
18145 +
18146 +/* Handler for POS_TO_LEAF: move pos->lock to the leaf node pointed to by pos->coord and decide
18147 + * whether to continue there. pos->coord must be on an internal item (asserted). */
18148 +static int handle_pos_to_leaf (flush_pos_t * pos)
18149 +{
18150 +       int ret;
18151 +       lock_handle child_lock;
18152 +       load_count child_load;
18153 +       jnode * child;
18154 +
18155 +       assert ("zam-846", pos->state == POS_TO_LEAF);
18156 +       assert ("zam-847", item_is_internal(&pos->coord));
18157 +
18158 +       init_lh(&child_lock);
18159 +       init_load_count(&child_load);
18160 +
18161 +       check_pos(pos);
18162 +       ret = get_leftmost_child_of_unit(&pos->coord, &child);
18163 +       if (ret)
18164 +               return ret;
18165 +       if (child == NULL) { /* no child jnode was returned: stop the flush position */
18166 +               pos_stop(pos);
18167 +               return 0;
18168 +       }
18169 +
18170 +       if (jnode_check_flushprepped(child)) { /* child is already flushprepped: invalidate pos, ret is still 0 here */
18171 +               pos->state = POS_INVALID;
18172 +               goto out;
18173 +       }
18174 +
18175 +       ret = longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
18176 +       if (ret)
18177 +               goto out;
18178 +
18179 +       ret = incr_load_count_znode(&child_load, JZNODE(child));
18180 +       if (ret)
18181 +               goto out;
18182 +
18183 +       ret = allocate_znode(JZNODE(child), &pos->coord, pos);
18184 +       check_pos(pos);
18185 +       if (ret)
18186 +               goto out;
18187 +
18188 +       /* move flush position to leaf level */
18189 +       pos->state = POS_ON_LEAF;
18190 +       move_flush_pos(pos, &child_lock, &child_load, NULL);
18191 +
18192 +       if (node_is_empty(JZNODE(child))) { /* an empty leaf is deleted and the position invalidated */
18193 +               ret = delete_empty_node(JZNODE(child));
18194 +               check_pos(pos);
18195 +               pos->state = POS_INVALID;
18196 +       }
18197 + out:
18198 +       check_pos(pos);
18199 +       done_load_count(&child_load);
18200 +       done_lh(&child_lock);
18201 +       jput(child);
18202 +
18203 +       return ret;
18204 +}
18205 +/* Handler for POS_TO_TWIG: move the flush position and its lock from the leaf pos->lock.node to the parent (twig) node. */
18206 +/* The next state is chosen by the item right after the pointer to the leaf: POS_END_OF_TWIG past the last item, POS_ON_EPOINT on an extent; otherwise pos is stopped and not moved. */
18207 +static int handle_pos_to_twig (flush_pos_t * pos)
18208 +{
18209 +       int ret;
18210 +
18211 +       lock_handle parent_lock;
18212 +       load_count parent_load;
18213 +       coord_t pcoord;
18214 +
18215 +       assert ("zam-852", pos->state == POS_TO_TWIG);
18216 +
18217 +       init_lh(&parent_lock);
18218 +       init_load_count(&parent_load);
18219 +
18220 +       check_pos(pos);
18221 +       ret = reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK, 0);
18222 +       if (ret)
18223 +               goto out;
18224 +
18225 +       ret = incr_load_count_znode(&parent_load, parent_lock.node);
18226 +       if (ret)
18227 +               goto out;
18228 +
18229 +       ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
18230 +       if (ret)
18231 +               goto out;
18232 +
18233 +       assert ("zam-870", item_is_internal(&pcoord));
18234 +       coord_next_item(&pcoord); /* step to the item that follows the pointer to the current leaf */
18235 +
18236 +       if (coord_is_after_rightmost(&pcoord))
18237 +               pos->state = POS_END_OF_TWIG;
18238 +       else if (item_is_extent(&pcoord))
18239 +               pos->state = POS_ON_EPOINT;
18240 +       else {
18241 +               /* Here we understand that getting -E_NO_NEIGHBOR in
18242 +                * handle_pos_on_leaf() was just because the edge of the
18243 +                * slum was reached */
18244 +               pos_stop(pos);
18245 +               goto out;
18246 +       }
18247 +
18248 +       move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
18249 +
18250 + out:
18251 +       check_pos(pos);
18252 +       done_load_count(&parent_load);
18253 +       done_lh(&parent_lock);
18254 +
18255 +       return ret;
18256 +}
18257 +
18258 +typedef int (*pos_state_handle_t)(flush_pos_t*); /* signature shared by all flush position state handlers */
18259 +static pos_state_handle_t flush_pos_handlers[] = { /* indexed by pos->state; squalloc() dispatches through this table */
18260 +       /* process formatted nodes on leaf level, keep lock on a leaf node */
18261 +       [POS_ON_LEAF]     = handle_pos_on_leaf,
18262 +       /* process unformatted nodes, keep lock on twig node, pos->coord points to the extent currently
18263 +        * being processed */
18264 +       [POS_ON_EPOINT]     = handle_pos_on_twig,
18265 +       /* move a lock from leaf node to its parent for further processing of unformatted nodes */
18266 +       [POS_TO_TWIG]     = handle_pos_to_twig,
18267 +       /* move a lock from twig to leaf level when processing of unformatted nodes finishes,
18268 +        * pos->coord points to the leaf node we jump to */
18269 +       [POS_TO_LEAF]     = handle_pos_to_leaf,
18270 +       /* after processing the last extent in the twig node, attempt to shift items from the twig's
18271 +        * right neighbor and process them while shifting */
18272 +       [POS_END_OF_TWIG] = handle_pos_end_of_twig,
18273 +       /* process formatted nodes on internal level, keep lock on an internal node */
18274 +       [POS_ON_INTERNAL] = handle_pos_on_internal
18275 +};
18276 +
18277 +/* Advance the flush position horizontally, preparing nodes and their ancestors for flushing
18278 + * ((re)allocate, squeeze, encrypt) in "parent-first" order. Returns 0, or a negative code other than -E_NO_NEIGHBOR. */
18279 +static int squalloc (flush_pos_t * pos)
18280 +{
18281 +       int ret = 0;
18282 +
18283 +       /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for
18284 +        * greater CPU efficiency? Measure and see.... -Hans */
18285 +       while (pos_valid(pos)) {
18286 +               check_pos(pos);
18287 +               ret = flush_pos_handlers[pos->state](pos); /* dispatch to the handler for the current state */
18288 +               check_pos(pos);
18289 +               if (ret < 0)
18290 +                       break;
18291 +
18292 +               ret = rapid_flush(pos);
18293 +               check_pos(pos);
18294 +               if (ret)
18295 +                       break;
18296 +       }
18297 +
18298 +       /* any positive value or -E_NO_NEIGHBOR are legal return codes for handle_pos*
18299 +          routines, -E_NO_NEIGHBOR means that the slum edge was reached; both are mapped to 0 */
18300 +       if (ret > 0 || ret == -E_NO_NEIGHBOR)
18301 +               ret = 0;
18302 +
18303 +       return ret;
18304 +}
18305 +
18306 +static void update_ldkey(znode *node) /* set the left delimiting key of @node to its leftmost key; no-op for an empty node */
18307 +{
18308 +       reiser4_key ldkey;
18309 +
18310 +       assert("vs-1630", rw_dk_is_write_locked(znode_get_tree(node))); /* caller must hold the tree's dk lock for writing */
18311 +       if (node_is_empty(node))
18312 +               return;
18313 +
18314 +       znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
18315 +}
18316 +
18317 +/* This is to be called after the node's shift method has shifted data from @right to
18318 +   @left. It sets the left delimiting keys of @left and @right to the keys of their first items (empty nodes are
18319 +   left untouched) and sets the right delimiting key of @left to the first key of @right, or to @right's right delimiting key if @right is empty */
18320 +static void
18321 +update_znode_dkeys(znode *left, znode *right)
18322 +{
18323 +       assert("nikita-1470", rw_dk_is_write_locked(znode_get_tree(right)));
18324 +       assert("vs-1629", znode_is_write_locked(left) && znode_is_write_locked(right));
18325 +
18326 +       /* the left delimiting key of @left needs updating if @left was empty before the shift */
18327 +       update_ldkey(left);
18328 +       update_ldkey(right);
18329 +       if (node_is_empty(right))
18330 +               znode_set_rd_key(left, znode_get_rd_key(right));
18331 +       else
18332 +               znode_set_rd_key(left, znode_get_ld_key(right));
18333 +}
18334 +
18335 +/* Try to shift everything from @right to @left through the node plugin's shift method, queueing carry work on @todo. If everything
18336 +   was shifted, @right is removed from the tree.  The result is the number of bytes shifted. */
18337 +static int
18338 +shift_everything_left(znode * right, znode * left, carry_level * todo)
18339 +{
18340 +       coord_t from;
18341 +       node_plugin *nplug;
18342 +       carry_plugin_info info;
18343 +
18344 +       coord_init_after_last_item(&from, right); /* start after the last item of @right */
18345 +
18346 +       IF_TRACE(TRACE_COORDS, print_coord("shift_everything_left:", &from, 0));
18347 +
18348 +       nplug = node_plugin_by_node(right);
18349 +       info.doing = NULL;
18350 +       info.todo = todo;
18351 +       return nplug->shift(&from, left, SHIFT_LEFT,
18352 +                           1 /* delete @right if it becomes empty */,
18353 +                           1 /* move coord @from to node @left if everything will be shifted */,
18354 +                           &info);
18355 +}
18356 +
18357 +/* Shift as much as possible from @right to @left using the memcpy-optimized
18358 +   shift_everything_left.  @left and @right are formatted neighboring nodes on
18359 +   the same level (asserted). Returns SQUEEZE_TARGET_FULL or SQUEEZE_SOURCE_EMPTY when nothing was shifted, otherwise the result of carry(). */
18360 +static int
18361 +squeeze_right_non_twig(znode * left, znode * right)
18362 +{
18363 +       int ret;
18364 +       carry_pool pool;
18365 +       carry_level todo;
18366 +       ON_STATS(int old_items; int old_free_space);
18367 +
18368 +       assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
18369 +
18370 +       if (!znode_is_dirty(left) || !znode_is_dirty(right)) /* both nodes must be dirty, otherwise nothing is shifted */
18371 +               return SQUEEZE_TARGET_FULL;
18372 +
18373 +       init_carry_pool(&pool);
18374 +       init_carry_level(&todo, &pool);
18375 +
18376 +       ON_STATS(old_items = node_num_items(left); old_free_space = znode_free_space(left));
18377 +
18378 +       ret = shift_everything_left(right, left, &todo);
18379 +       if (ret > 0) {
18380 +               /* something was shifted */
18381 +               reiser4_tree *tree;
18382 +               __u64 grabbed;
18383 +
18384 +               znode_make_dirty(left);
18385 +               znode_make_dirty(right);
18386 +
18387 +               /* Update delimiting keys of the nodes which participated in the
18388 +                  shift. FIXME: it would be better to have this in the node's shift
18389 +                  operation. But it can not be done there. Nobody
18390 +                  remembers why, though */
18391 +               tree = znode_get_tree(left);
18392 +               UNDER_RW_VOID(dk, tree, write, update_znode_dkeys(left, right));
18393 +
18394 +               /* Carry is called to update the delimiting key and, maybe, to remove the empty
18395 +                  node. */
18396 +               grabbed = get_current_context()->grabbed_blocks; /* remember the current mark so the extra grab can be released below */
18397 +               ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
18398 +               assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
18399 +
18400 +               ON_STATS(todo.level_no = znode_get_level(left) + 1);
18401 +
18402 +               ret = carry(&todo, NULL /* previous level */ );
18403 +               grabbed2free_mark(grabbed);
18404 +       } else {
18405 +               /* Shifting impossible, we return the appropriate result code. NOTE(review): a negative result from shift_everything_left() also lands here and is replaced by a SQUEEZE_* code -- confirm this is intended */
18406 +               ret = node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY : SQUEEZE_TARGET_FULL;
18407 +       }
18408 +
18409 +       done_carry_pool(&pool);
18410 +
18411 +#if REISER4_STATS
18412 +       if (znode_get_level(left) == LEAF_LEVEL) {
18413 +               reiser4_stat_inc(flush.squeezed_leaves);
18414 +               reiser4_stat_add(flush.squeezed_leaf_items, node_num_items(left) - old_items);
18415 +               reiser4_stat_add(flush.squeezed_leaf_bytes, old_free_space - znode_free_space(left));
18416 +       }
18417 +#endif
18418 +
18419 +       return ret;
18420 +}
18421 +
18422 +/* Shift first unit of first item if it is an internal one.  Return
18423 +   SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
18424 +   SUBTREE_MOVED. */
18425 +static int
18426 +shift_one_internal_unit(znode * left, znode * right)
18427 +{
18428 +       int ret;
18429 +       carry_pool pool;
18430 +       carry_level todo;
18431 +       coord_t coord;
18432 +       int size, moved;
18433 +       carry_plugin_info info;
18434 +
18435 +       assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
18436 +       assert("nikita-2435", znode_is_write_locked(left));
18437 +       assert("nikita-2436", znode_is_write_locked(right));
18438 +       assert("nikita-2434", UNDER_RW(tree, znode_get_tree(left), read, left->right == right));
18439 +
18440 +       coord_init_first_unit(&coord, right);
18441 +
18442 +#if REISER4_DEBUG
18443 +       if (!node_is_empty(left)) {
18444 +               coord_t last;
18445 +               reiser4_key right_key;
18446 +               reiser4_key left_key;
18447 +
18448 +               coord_init_last_unit(&last, left);
18449 +
18450 +               assert("nikita-2463",
18451 +                      keyle(item_key_by_coord(&last, &left_key), item_key_by_coord(&coord, &right_key)));
18452 +       }
18453 +#endif
18454 +
18455 +       assert("jmacd-2007", item_is_internal(&coord));
18456 +
18457 +       init_carry_pool(&pool);
18458 +       init_carry_level(&todo, &pool);
18459 +
18460 +       size = item_length_by_coord(&coord);
18461 +       info.todo = &todo;
18462 +       info.doing = NULL;
18463 +
18464 +       ret = node_plugin_by_node(left)->shift(&coord, left, SHIFT_LEFT,
18465 +                                              1 /* delete @right if it becomes empty */,
18466 +                                              0 /* do not move coord @coord to node @left */,
18467 +                                              &info);
18468 +
18469 +       /* If shift returns positive, then we shifted the item. */
18470 +       assert("vs-423", ret <= 0 || size == ret);
18471 +       moved = (ret > 0);
18472 +
18473 +       if (moved) {
18474 +               /* something was moved */
18475 +               reiser4_tree *tree;
18476 +               int grabbed;
18477 +
18478 +               znode_make_dirty(left);
18479 +               znode_make_dirty(right);
18480 +               tree = znode_get_tree(left);
18481 +               UNDER_RW_VOID(dk, tree, write, update_znode_dkeys(left, right));
18482 +
18483 +               /* reserve space for delimiting keys after shifting */
18484 +               grabbed = get_current_context()->grabbed_blocks;
18485 +               ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
18486 +               assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
18487 +
18488 +               ON_STATS(todo.level_no = znode_get_level(left) + 1);
18489 +
18490 +               ret = carry(&todo, NULL /* previous level */ );
18491 +               grabbed2free_mark(grabbed);
18492 +       }
18493 +
18494 +       ON_TRACE(TRACE_FLUSH_VERB,
18495 +                "shift_one %s an item: left has %u items, right has %u items\n",
18496 +                moved > 0 ? "moved" : "did not move", node_num_items(left), node_num_items(right));
18497 +
18498 +       done_carry_pool(&pool);
18499 +
18500 +       if (ret != 0) {
18501 +               /* Shift or carry operation failed. */
18502 +               assert("jmacd-7325", ret < 0);
18503 +               return ret;
18504 +       }
18505 +
18506 +       return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
18507 +}
18508 +
18509 +/* ALLOCATE INTERFACE: store *@blocknr as the block number of @node. Both pointers must be non-NULL (asserted). */
18510 +/* Audited by: umka (2002.06.11) */
18511 +reiser4_internal void
18512 +jnode_set_block(jnode * node /* jnode to update */ ,
18513 +               const reiser4_block_nr * blocknr /* new block nr */ )
18514 +{
18515 +       assert("nikita-2020", node != NULL);
18516 +       assert("umka-055", blocknr != NULL);
18517 +       assert("zam-819", ergo(JF_ISSET(node, JNODE_EFLUSH), node->blocknr == 0)); /* a node with JNODE_EFLUSH set must still have block number 0 */
18518 +       assert("vs-1453", ergo(JF_ISSET(node, JNODE_EFLUSH), jnode_is_unformatted(node))); /* JNODE_EFLUSH is only expected on unformatted nodes */
18519 +       node->blocknr = *blocknr;
18520 +}
18521 +
18522 +/* Make the final relocate/wander decision during forward parent-first squalloc for a
18523 +   znode.  For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation().  Returns 0 on success, otherwise the error returned by allocate_znode_update(). */
18524 +static int
18525 +allocate_znode_loaded(znode * node,
18526 +                     const coord_t * parent_coord, flush_pos_t * pos)
18527 +{
18528 +       int ret;
18529 +       reiser4_super_info_data * sbinfo = get_current_super_private();
18530 +       /* FIXME(D): We have the node write-locked and should have checked for !
18531 +          allocated() somewhere before reaching this point, but there can be a race, so
18532 +          this assertion is bogus. */
18533 +       assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
18534 +       assert("jmacd-7988", znode_is_write_locked(node));
18535 +       assert("jmacd-7989", coord_is_invalid(parent_coord)
18536 +              || znode_is_write_locked(parent_coord->node));
18537 +
18538 +       if (ZF_ISSET(node, JNODE_REPACK) || znode_created(node) || znode_is_root(node) ||
18539 +           /* We have enough nodes to relocate no matter what. */
18540 +           (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL))
18541 +       {
18542 +               /* No need to decide with new nodes, they are treated the same as
18543 +                  relocate. If the root node is dirty, relocate. */
18544 +               if (pos->preceder.blk == 0) {
18545 +                       /* preceder is unknown and we have decided to relocate node --
18546 +                          using of default value for search start is better than search
18547 +                          from block #0. */
18548 +                       get_blocknr_hint_default(&pos->preceder.blk);
18549 +                       reiser4_stat_inc(block_alloc.nohint);
18550 +                       check_preceder(pos->preceder.blk);
18551 +               }
18552 +
18553 +               goto best_reloc;
18554 +
18555 +       } else if (pos->preceder.blk == 0) {
18556 +               /* If we don't know the preceder, leave it where it is. */
18557 +               jnode_make_wander(ZJNODE(node));
18558 +       } else {
18559 +               /* Make a decision based on block distance. */
18560 +               reiser4_block_nr dist;
18561 +               reiser4_block_nr nblk = *znode_get_block(node);
18562 +
18563 +               assert("jmacd-6172", !blocknr_is_fake(&nblk));
18564 +               assert("jmacd-6173", !blocknr_is_fake(&pos->preceder.blk));
18565 +               assert("jmacd-6174", pos->preceder.blk != 0);
18566 +
18567 +               if (pos->preceder.blk == nblk - 1) {
18568 +                       /* Ideal. */
18569 +                       jnode_make_wander(ZJNODE(node));
18570 +               } else {
18571 +
18572 +                       dist = (nblk < pos->preceder.blk) ? (pos->preceder.blk - nblk) : (nblk - pos->preceder.blk);
18573 +
18574 +                       /* See if we can find a closer block (forward direction only). */
18575 +                       pos->preceder.max_dist = min((reiser4_block_nr)sbinfo->flush.relocate_distance, dist);
18576 +                       pos->preceder.level = znode_get_level(node);
18577 +
18578 +                       ret = allocate_znode_update(node, parent_coord, pos);
18579 +
18580 +                       pos->preceder.max_dist = 0; /* reset the distance limit set just above */
18581 +
18582 +                       if (ret && (ret != -ENOSPC)) /* -ENOSPC (no close position found) is not fatal; it is handled below */
18583 +                               return ret;
18584 +
18585 +                       if (ret == 0) {
18586 +                               /* Got a better allocation. */
18587 +                               znode_make_reloc(node, pos->fq);
18588 +                       } else if (dist < sbinfo->flush.relocate_distance) {
18589 +                               /* The present allocation is good enough. */
18590 +                               jnode_make_wander(ZJNODE(node));
18591 +                       } else {
18592 +                               /* Otherwise, try to relocate to the best position. */
18593 +                             best_reloc: /* also entered by goto from the unconditional-relocate branch above */
18594 +                               ret = allocate_znode_update(node, parent_coord, pos);
18595 +                               if (ret != 0)
18596 +                                       return ret;
18597 +
18598 +                               /* set JNODE_RELOC bit _after_ node gets allocated */
18599 +                               znode_make_reloc(node, pos->fq);
18600 +                       }
18601 +               }
18602 +       }
18603 +
18604 +       /* This is the new preceder. */
18605 +       pos->preceder.blk = *znode_get_block(node);
18606 +       check_preceder(pos->preceder.blk);
18607 +       pos->alloc_cnt += 1;
18608 +
18609 +       assert ("jmacd-4277", !blocknr_is_fake(&pos->preceder.blk));
18610 +
18611 +       return 0;
18612 +}
18613 +
18614 +static int
18615 +allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
18616 +{
18617 +       /*
18618 +        * Wrapper around allocate_znode_loaded(): perform znode allocation with
18619 +        * znode pinned in memory (via WITH_DATA) to avoid races with asynchronous
18620 +        * emergency flush (which plays with JNODE_FLUSH_RESERVED bit).
18621 +        */
18622 +       return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
18623 +}
18624 +
18625 +
18626 +/* A subroutine of allocate_znode, this is called first to see if there is a close
18627 +   position to relocate to.  It may return ENOSPC if there is no close position.  If there
18628 +   is no close position it may not relocate.  This takes care of updating the parent node
18629 +   with the relocated block address.  Returns 0 on success, an error code otherwise. */
18630 +static int
18631 +allocate_znode_update(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
18632 +{
18633 +       int ret;
18634 +       reiser4_block_nr blk;
18635 +       lock_handle uber_lock;
18636 +       int flush_reserved_used = 0;
18637 +       int grabbed;
18638 +
18639 +       init_lh(&uber_lock);
18640 +
18641 +       grabbed = get_current_context()->grabbed_blocks;
18642 +
18643 +       /* discard e-flush allocation */
18644 +       ret = zload(node);
18645 +       if (ret)
18646 +               return ret;
18647 +
18648 +       if (ZF_ISSET(node, JNODE_CREATED)) {
18649 +               assert ("zam-816", blocknr_is_fake(znode_get_block(node)));
18650 +               pos->preceder.block_stage = BLOCK_UNALLOCATED;
18651 +       } else {
18652 +               pos->preceder.block_stage = BLOCK_GRABBED;
18653 +
18654 +               /* The disk space for relocating the @node is already reserved in "flush reserved"
18655 +                * counter if @node is leaf, otherwise we grab space using BA_RESERVED (means grab
18656 +                * space from whole disk not from only 95%). */
18657 +               if (znode_get_level(node) == LEAF_LEVEL) {
18658 +                       /*
18659 +                        * earlier (during do_jnode_make_dirty()) we decided
18660 +                        * that @node can possibly go into overwrite set and
18661 +                        * reserved block for its wandering location.
18662 +                        */
18663 +                       txn_atom * atom = get_current_atom_locked();
18664 +                       assert("nikita-3449",
18665 +                              ZF_ISSET(node, JNODE_FLUSH_RESERVED));
18666 +                       flush_reserved2grabbed(atom, (__u64)1);
18667 +                       spin_unlock_atom(atom);
18668 +                       /*
18669 +                        * we are trying to move node into relocate
18670 +                        * set. Allocation of relocated position "uses"
18671 +                        * reserved block.
18672 +                        */
18673 +                       ZF_CLR(node, JNODE_FLUSH_RESERVED);
18674 +                       flush_reserved_used = 1;
18675 +               } else {
18676 +                       ret = reiser4_grab_space_force((__u64)1, BA_RESERVED);
18677 +                       if (ret != 0)
18678 +                               goto exit;
18679 +               }
18680 +       }
18681 +
18682 +        /* We might not use the 5% of reserved disk space here, and flush will then not pack tightly. */
18683 +        ret = reiser4_alloc_block(&pos->preceder, &blk, BA_FORMATTED | BA_PERMANENT);
18684 +       if(ret)
18685 +               goto exit;
18686 +
18687 +
18688 +       if (!ZF_ISSET(node, JNODE_CREATED) &&
18689 +           (ret = reiser4_dealloc_block(znode_get_block(node), 0, BA_DEFER | BA_FORMATTED)))
18690 +               goto exit;
18691 +
18692 +       if (likely(!znode_is_root(node))) {
18693 +               item_plugin *iplug;
18694 +
18695 +               iplug = item_plugin_by_coord(parent_coord);
18696 +               assert("nikita-2954", iplug->f.update != NULL);
18697 +               iplug->f.update(parent_coord, &blk);
18698 +
18699 +               znode_make_dirty(parent_coord->node);
18700 +
18701 +       } else {
18702 +               reiser4_tree *tree = znode_get_tree(node);
18703 +               znode *uber;
18704 +
18705 +               /* We take a longterm lock on the fake node in order to change
18706 +                  the root block number.  This may cause atom fusion. */
18707 +               ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
18708 +                                    &uber_lock);
18709 +               /* The fake node cannot be deleted, and we must have priority
18710 +                  here, and may not be confused with ENOSPC. */
18711 +               assert("jmacd-74412",
18712 +                      ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
18713 +
18714 +               if (ret)
18715 +                       goto exit;
18716 +
18717 +               uber = uber_lock.node;
18718 +
18719 +               UNDER_RW_VOID(tree, tree, write, tree->root_block = blk);
18720 +
18721 +               znode_make_dirty(uber);
18722 +       }
18723 +
18724 +       ret = znode_rehash(node, &blk);
18725 +exit: /* common exit path, reached on success as well as on failure */
18726 +       if(ret) {
18727 +               /* Get flush reserved block back if something fails, because
18728 +                * callers assume that on error block wasn't relocated and its
18729 +                * flush reserved block wasn't used. */
18730 +               if (flush_reserved_used) {
18731 +                       /*
18732 +                        * ok, we failed to move node into relocate
18733 +                        * set. Restore status quo.
18734 +                        */
18735 +                       grabbed2flush_reserved((__u64)1);
18736 +                       ZF_SET(node, JNODE_FLUSH_RESERVED);
18737 +               }
18738 +       }
18739 +       zrelse(node); /* pairs with the zload() at the top of the function */
18740 +       done_lh(&uber_lock);
18741 +       grabbed2free_mark(grabbed); /* @grabbed is the context's grabbed_blocks value saved on entry */
18742 +       return ret;
18743 +}
18744 +
18745 +/* JNODE INTERFACE */
18746 +
18747 +/* Lock a node (if formatted) and then get its parent locked, set the child's
18748 +   coordinate in the parent.  If the child is the root node, the above_root
18749 +   znode is returned but the coord is not set.  This function may cause atom
18750 +   fusion, but it is only used for read locks (at this point) and therefore
18751 +   fusion only occurs when the parent is already dirty.  Returns 0 on success; @try adds GN_TRY_LOCK for formatted nodes.  NOTE(review): scan_unformatted() passes ZNODE_WRITE_LOCK, so the "read locks only" remark looks stale -- confirm. */
18752 +/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent
18753 +   pointer in jnodes. */
18754 +static int
18755 +jnode_lock_parent_coord(jnode         * node,
18756 +                       coord_t       * coord,
18757 +                       lock_handle   * parent_lh,
18758 +                       load_count    * parent_zh,
18759 +                       znode_lock_mode parent_mode,
18760 +                       int             try)
18761 +{
18762 +       int ret;
18763 +
18764 +       assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
18765 +       assert("edward-54", jnode_is_unformatted(node) || znode_is_any_locked(JZNODE(node)));
18766 +
18767 +       if (!jnode_is_znode(node)) {
18768 +               reiser4_key key;
18769 +               tree_level stop_level = TWIG_LEVEL ;
18770 +               lookup_bias bias = FIND_EXACT;
18771 +
18772 +               assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
18773 +
18774 +               /* The case when node is not znode, but can have parent coord
18775 +                  (unformatted node, node which represents cluster page,
18776 +                  etc..).  Generate a key for the appropriate entry, search
18777 +                  in the tree using coord_by_key, which handles locking for
18778 +                  us. */
18779 +
18780 +               /*
18781 +                * nothing is locked at this moment, so, nothing prevents
18782 +                * concurrent truncate from removing jnode from inode. To
18783 +                * prevent this spin-lock jnode. jnode can be truncated just
18784 +                * after call to the jnode_build_key(), but this is ok,
18785 +                * because coord_by_key() will just fail to find appropriate
18786 +                * extent.
18787 +                */
18788 +               LOCK_JNODE(node);
18789 +               if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
18790 +                       jnode_build_key(node, &key);
18791 +                       ret = 0;
18792 +               } else
18793 +                       ret = RETERR(-ENOENT);
18794 +               UNLOCK_JNODE(node);
18795 +
18796 +               if (ret != 0)
18797 +                       return ret;
18798 +
18799 +               if (jnode_is_cluster_page(node))
18800 +                       stop_level = LEAF_LEVEL;
18801 +
18802 +               assert("jmacd-1812", coord != NULL);
18803 +
18804 +               ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
18805 +                                  parent_mode, bias, stop_level, stop_level, CBK_UNIQUE, 0/*ra_info*/);
18806 +               switch (ret) {
18807 +               case CBK_COORD_NOTFOUND:
18808 +                       if (jnode_is_cluster_page(node)) {
18809 +                               int result;
18810 +                               assert("edward-164", jnode_page(node) != NULL);
18811 +                               assert("edward-165", jnode_page(node)->mapping != NULL);
18812 +                               assert("edward-166", jnode_page(node)->mapping->host != NULL);
18813 +                               assert("edward-167", inode_get_flag(jnode_page(node)->mapping->host, REISER4_CLUSTER_KNOWN));
18814 +                                /* jnode of a new cluster which is not represented by any items in the tree. */
18815 +                               result = incr_load_count_znode(parent_zh, parent_lh->node);
18816 +                               if (result != 0)
18817 +                                       return result;
18818 +                               coord->between = AFTER_ITEM;
18819 +                       } else if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
18820 +                               warning("nikita-3177", "Parent not found");
18821 +                               print_jnode("node", node);
18822 +                       }
18823 +                       return ret;
18824 +               case CBK_COORD_FOUND:
18825 +                       if (coord->between != AT_UNIT) {
18826 +                               /* FIXME: comment needed.  The lookup succeeded but the coord is not at a unit: drop the parent lock, warn unless the jnode is being removed, and fail with -ENOENT. */
18827 +                               done_lh(parent_lh);
18828 +                               if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
18829 +                                       warning("nikita-3178",
18830 +                                               "Found but not happy: %i",
18831 +                                               coord->between);
18832 +                                       print_jnode("node", node);
18833 +                               }
18834 +                               return RETERR(-ENOENT);
18835 +                       }
18836 +                       ret = incr_load_count_znode(parent_zh, parent_lh->node);
18837 +                       if (ret != 0)
18838 +                               return ret;
18839 +                       /* if (jnode_is_cluster_page(node)) {
18840 +                          races with write() are possible
18841 +                          check_child_cluster (parent_lh->node);
18842 +                          }
18843 +                       */
18844 +                       break;
18845 +               default:
18846 +                       return ret;
18847 +               }
18848 +
18849 +       } else {
18850 +               int flags;
18851 +               znode *z;
18852 +
18853 +               z = JZNODE(node);
18854 +               /* Formatted node case: */
18855 +               assert("jmacd-2061", !znode_is_root(z));
18856 +
18857 +               flags = GN_ALLOW_NOT_CONNECTED;
18858 +               if (try)
18859 +                       flags |= GN_TRY_LOCK;
18860 +
18861 +               ret = reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
18862 +               if (ret != 0)
18863 +                       /* -E_REPEAT is ok here, it is handled by the caller. */
18864 +                       return ret;
18865 +
18866 +               /* Make the child's position "hint" up-to-date.  (Unless above
18867 +                  root, which caller must check.) */
18868 +               if (coord != NULL) {
18869 +
18870 +                       ret = incr_load_count_znode(parent_zh, parent_lh->node);
18871 +                       if (ret != 0) {
18872 +                               warning("jmacd-976812386", "incr_load_count_znode failed: %d", ret);
18873 +                               return ret;
18874 +                       }
18875 +
18876 +                       ret = find_child_ptr(parent_lh->node, z, coord);
18877 +                       if (ret != 0) {
18878 +                               warning("jmacd-976812", "find_child_ptr failed: %d", ret);
18879 +                               return ret;
18880 +                       }
18881 +               }
18882 +       }
18883 +
18884 +       return 0;
18885 +}
18886 +
18887 +/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom.
18888 +   If there is no next neighbor or the neighbor is not in memory or if there is a
18889 +   neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned. */
18890 +static int
18891 +neighbor_in_slum(
18892 +
18893 +       znode * node,           /* starting point */
18894 +
18895 +       lock_handle * lock,             /* lock on starting point */
18896 +
18897 +       sideof side,                    /* left or right direction we seek the next node in */
18898 +
18899 +       znode_lock_mode mode            /* kind of lock we want */
18900 +
18901 +       )
18902 +{
18903 +       int ret;
18904 +
18905 +       assert("jmacd-6334", znode_is_connected(node));
18906 +
18907 +       ret = reiser4_get_neighbor(lock, node, mode, GN_SAME_ATOM | (side == LEFT_SIDE ? GN_GO_LEFT : 0));
18908 +
18909 +       if (ret) {
18910 +               /* May return -ENOENT or -E_NO_NEIGHBOR. */
18911 +               /* FIXME(C): check EINVAL, E_DEADLOCK */
18912 +               if (ret == -ENOENT) {
18913 +                       ret = RETERR(-E_NO_NEIGHBOR);
18914 +               }
18915 +
18916 +               return ret;
18917 +       }
18918 +
18919 +       /* Check dirty bit of locked znode, no races here */
18920 +       if (znode_check_dirty(lock->node))
18921 +               return 0; /* dirty neighbor found: return with @lock still held */
18922 +
18923 +       done_lh(lock); /* neighbor is not dirty: release it before reporting failure */
18924 +       return RETERR(-E_NO_NEIGHBOR);
18925 +}
18926 +
18927 +/* Return true if two znodes have the same parent.  This is called with both nodes
18928 +   write-locked (for squeezing); the parent pointers are still compared under the tree read lock. */
18929 +static int
18930 +znode_same_parents(znode * a, znode * b)
18931 +{
18932 +       assert("jmacd-7011", znode_is_write_locked(a));
18933 +       assert("jmacd-7012", znode_is_write_locked(b));
18934 +
18935 +       /* We lock the whole tree for this check.... I really don't like whole tree
18936 +        * locks... -Hans */
18937 +       return UNDER_RW(tree, znode_get_tree(a), read,
18938 +                       (znode_parent(a) == znode_parent(b)));
18939 +}
18940 +
18941 +/* FLUSH SCAN */
18942 +
18943 +/* Initialize the flush_scan data structure: zero it, then initialize both lock handles, both load counts, and set the parent coord to invalid. */
18944 +static void
18945 +scan_init(flush_scan * scan)
18946 +{
18947 +       memset(scan, 0, sizeof (*scan));
18948 +       init_lh(&scan->node_lock);
18949 +       init_lh(&scan->parent_lock);
18950 +       init_load_count(&scan->parent_load);
18951 +       init_load_count(&scan->node_load);
18952 +       coord_init_invalid(&scan->parent_coord, NULL);
18953 +}
18954 +
18955 +/* Release any resources held by the flush scan: both load counts, the reference on scan->node (if any), and both lock handles. */
18956 +static void
18957 +scan_done(flush_scan * scan)
18958 +{
18959 +       done_load_count(&scan->node_load);
18960 +       if (scan->node != NULL) {
18961 +               jput(scan->node); /* drop the reference held on the current node */
18962 +               scan->node = NULL;
18963 +       }
18964 +       done_load_count(&scan->parent_load);
18965 +       done_lh(&scan->parent_lock);
18966 +       done_lh(&scan->node_lock);
18967 +}
18968 +
18969 +/* Returns true if flush scanning is finished: scan->stop is set, or a rightward scan has counted at least scan->max_count nodes.  max_count is not checked here for a leftward scan. */
18970 +reiser4_internal int
18971 +scan_finished(flush_scan * scan)
18972 +{
18973 +       return scan->stop || (scan->direction == RIGHT_SIDE &&
18974 +                             scan->count >= scan->max_count);
18975 +}
18976 +
18977 +/* Return true if the scan should continue to the @tonode.  True if the node meets the
18978 +   same_slum_check condition.  If not, set scan->stop and drop the reference on @tonode. */
18979 +reiser4_internal int
18980 +scan_goto(flush_scan * scan, jnode * tonode)
18981 +{
18982 +       int go = same_slum_check(scan->node, tonode, 1, 0);
18983 +
18984 +       if (!go) {
18985 +               scan->stop = 1;
18986 +               ON_TRACE(TRACE_FLUSH_VERB,
18987 +                        "flush %s scan stop: stop at node %s\n",
18988 +                        scanning_left(scan) ? "left" : "right", jnode_tostring(scan->node));
18989 +               ON_TRACE(TRACE_FLUSH_VERB,
18990 +                        "flush %s scan stop: do not cont at %s\n",
18991 +                        scanning_left(scan) ? "left" : "right", jnode_tostring(tonode));
18992 +               jput(tonode); /* the scan will not move to @tonode, so release it */
18993 +       }
18994 +
18995 +       return go;
18996 +}
18997 +
18998 +/* Set the current scan->node, refcount it, increment count by the @add_count (number to
18999 +   count, e.g., skipped unallocated nodes), deref previous current, and copy the current
19000 +   parent coordinate (only when @parent is not NULL).  Returns the result of incr_load_count_jnode(). */
19001 +reiser4_internal int
19002 +scan_set_current(flush_scan * scan, jnode * node, unsigned add_count, const coord_t * parent)
19003 +{
19004 +       /* Release the old references, take the new reference. */
19005 +       done_load_count(&scan->node_load);
19006 +
19007 +       if (scan->node != NULL) {
19008 +               jput(scan->node);
19009 +       }
19010 +       scan->node = node;
19011 +       scan->count += add_count;
19012 +
19013 +       /* This next stmt is somewhat inefficient.  The scan_extent_coord code could
19014 +          delay this update step until it finishes and update the parent_coord only once.
19015 +          It did that before, but there was a bug and this was the easiest way to make it
19016 +          correct. */
19017 +       if (parent != NULL) {
19018 +               coord_dup(&scan->parent_coord, parent);
19019 +       }
19020 +
19021 +       /* Failure may happen at the incr_load_count call, but the caller can assume the reference
19022 +          is safely taken. */
19023 +       return incr_load_count_jnode(&scan->node_load, node);
19024 +}
19025 +
19026 +/* Return true if scanning in the leftward direction, i.e. scan->direction == LEFT_SIDE. */
19027 +reiser4_internal int
19028 +scanning_left(flush_scan * scan)
19029 +{
19030 +       return scan->direction == LEFT_SIDE;
19031 +}
19032 +
19033 +/* Performs leftward scanning starting from either kind of node.  Counts the starting
19034 +   node.  The right-scan object is passed in for the left-scan in order to copy the parent
19035 +   of an unformatted starting position.  This way we avoid searching for the unformatted
19036 +   node's parent when scanning in each direction.  If we search for the parent once it is
19037 +   set in both scan objects.  The limit parameter tells flush-scan when to stop.
19038 +
19039 +   Rapid scanning is used only during scan_left, where we are interested in finding the
19040 +   'leftpoint' where we begin flushing.  We are interested in stopping at the left child
19041 +   of a twig that does not have a dirty left neighbor.  THIS IS A SPECIAL CASE.  The
19042 +   problem is finding a way to flush only those nodes without unallocated children, and it
19043 +   is difficult to solve in the bottom-up flushing algorithm we are currently using.  The
19044 +   problem can be solved by scanning left at every level as we go upward, but this would
19045 +   basically bring us back to using a top-down allocation strategy, which we already tried
19046 +   (see BK history from May 2002), and has a different set of problems.  The top-down
19047 +   strategy makes avoiding unallocated children easier, but makes it difficult to
19048 +   properly flush dirty children with clean parents that would otherwise stop the
19049 +   top-down flush, only later to dirty the parent once the children are flushed.  So we
19050 +   solve the problem in the bottom-up algorithm with a special case for twigs and leaves
19051 +   only.
19052 +
19053 +   The first step in solving the problem is this rapid leftward scan.  After we determine
19054 +   that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD we
19055 +   are no longer interested in the exact count, we are only interested in finding the
19056 +   best place to start the flush.  We could choose one of two possibilities:
19057 +
19058 +   1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor.
19059 +   This requires checking one leaf per rapid-scan twig
19060 +
19061 +   2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig
19062 +   to the left.  This requires checking possibly all of the in-memory children of each
19063 +   twig during the rapid scan.
19064 +
19065 +   For now we implement the first policy.
19066 +*/
19067 +static int
19068 +scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
19069 +{
19070 +       int ret = 0;
19071 +
19072 +       scan->max_count = limit;
19073 +       scan->direction = LEFT_SIDE;
19074 +
19075 +       ret = scan_set_current(scan, jref(node), 1, NULL); /* add_count == 1: the starting node is counted */
19076 +       if (ret != 0) {
19077 +               return ret;
19078 +       }
19079 +
19080 +       ret = scan_common(scan, right);
19081 +       if (ret != 0) {
19082 +               return ret;
19083 +       }
19084 +
19085 +       /* Before rapid scanning, we need a lock on scan->node so that we can get its
19086 +          parent, only if formatted. */
19087 +       if (jnode_is_znode(scan->node)) {
19088 +               ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
19089 +                                         ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
19090 +       }
19091 +
19092 +       /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */
19093 +       return ret; /* 0, or the error from longterm_lock_znode() */
19094 +}
19095 +
19096 +/* Performs rightward scanning... Does not count the starting node.  The limit parameter
19097 +   is described in scan_left.  If the starting node is unformatted then the
19098 +   parent_coord was already set during scan_left.  (An earlier version of this comment
19099 +   mentioned a rapid_after parameter; this function has no such parameter.)
19100 +
19101 +   scan_right is only called if the scan_left operation does not count at least
19102 +   FLUSH_RELOCATE_THRESHOLD nodes for flushing.  Otherwise, the limit parameter is set to
19103 +   the difference between scan-left's count and FLUSH_RELOCATE_THRESHOLD, meaning
19104 +   scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops. */
19105 +static int
19106 +scan_right(flush_scan * scan, jnode * node, unsigned limit)
19107 +{
19108 +       int ret;
19109 +
19110 +       scan->max_count = limit;
19111 +       scan->direction = RIGHT_SIDE;
19112 +
19113 +       ret = scan_set_current(scan, jref(node), 0, NULL); /* add_count == 0: the starting node is not counted */
19114 +       if (ret != 0) {
19115 +               return ret;
19116 +       }
19117 +
19118 +       return scan_common(scan, NULL);
19119 +}
19120 +
19121 +/* Common code to perform left or right scanning.  @other is the scan object of the opposite direction (scan_left passes the right scan, scan_right passes NULL).  Returns 0 or the first error from scan_unformatted()/scan_formatted(). */
19122 +static int
19123 +scan_common(flush_scan * scan, flush_scan * other)
19124 +{
19125 +       int ret;
19126 +
19127 +       assert("nikita-2376", scan->node != NULL);
19128 +       assert("edward-54", jnode_is_unformatted(scan->node) || jnode_is_znode(scan->node));
19129 +
19130 +       /* Special case for starting at an unformatted node.  Optimization: we only want
19131 +          to search for the parent (which requires a tree traversal) once.  Obviously, we
19132 +          shouldn't have to call it once for the left scan and once for the right scan.
19133 +          For this reason, if we search for the parent during scan-left we then duplicate
19134 +          the coord/lock/load into the scan-right object. */
19135 +       if (jnode_is_unformatted(scan->node)) {
19136 +               ret = scan_unformatted(scan, other);
19137 +               if (ret != 0)
19138 +                       return ret;
19139 +       }
19140 +       /* This loop expects to start at a formatted position and performs chaining of
19141 +          formatted regions */
19142 +       while (!scan_finished(scan)) {
19143 +
19144 +               ret = scan_formatted(scan);
19145 +               if (ret != 0) {
19146 +                       return ret;
19147 +               }
19148 +       }
19149 +
19150 +       return 0;
19151 +}
19152 +
19153 +/* called by scan_unformatted() when jnode_lock_parent_coord
19154 +   returns CBK_COORD_NOTFOUND.  Returns 1 if scan->node is a cluster page, 0 otherwise.
19155 +*/
19156 +static int
19157 +scan_should_link_node(flush_scan * scan)
19158 +{
19159 +       assert("edward-311", scan->node != NULL);
19160 +       if (jnode_is_cluster_page(scan->node)) {
19161 +
19162 +               assert("edward-303", scan->parent_coord.between != EMPTY_NODE);
19163 +               return 1;
19164 +       }
19165 +       return 0;
19166 +}
19167 +
19168 +static int
19169 +scan_unformatted(flush_scan * scan, flush_scan * other)
19170 +{
19171 +       int ret = 0;
19172 +       int try = 0;
19173 +
19174 +       if (!coord_is_invalid(&scan->parent_coord))
19175 +               goto scan;
19176 +
19177 +        /* set parent coord from */
19178 +       if (!jnode_is_unformatted(scan->node)) {
19179 +               /* formatted position*/
19180 +
19181 +               lock_handle lock;
19182 +               assert("edward-301", jnode_is_znode(scan->node));
19183 +               init_lh(&lock);
19184 +
19185 +               /*
19186 +                * when flush starts from unformatted node, first thing it
19187 +                * does is tree traversal to find formatted parent of starting
19188 +                * node. This parent is then kept lock across scans to the
19189 +                * left and to the right. This means that during scan to the
19190 +                * left we cannot take left-ward lock, because this is
19191 +                * dead-lock prone. So, if we are scanning to the left and
19192 +                * there is already lock held by this thread,
19193 +                * jnode_lock_parent_coord() should use try-lock.
19194 +                */
19195 +               try = scanning_left(scan) && !lock_stack_isclean(get_current_lock_stack());
19196 +               /* Need the node locked to get the parent lock, We have to
19197 +                  take write lock since there is at least one call path
19198 +                  where this znode is already write-locked by us. */
19199 +               ret = longterm_lock_znode(&lock, JZNODE(scan->node), ZNODE_WRITE_LOCK,
19200 +                                         scanning_left(scan) ? ZNODE_LOCK_LOPRI : ZNODE_LOCK_HIPRI);
19201 +               if (ret != 0)
19202 +                       /* EINVAL or E_DEADLOCK here mean... try again!  At this point we've
19203 +                          scanned too far and can't back out, just start over. */
19204 +                       return ret;
19205 +
19206 +               ret = jnode_lock_parent_coord(scan->node,
19207 +                                             &scan->parent_coord,
19208 +                                             &scan->parent_lock,
19209 +                                             &scan->parent_load,
19210 +                                             ZNODE_WRITE_LOCK, try);
19211 +
19212 +               /* FIXME(C): check EINVAL, E_DEADLOCK */
19213 +               done_lh(&lock);
19214 +               if (ret == -E_REPEAT) {
19215 +                       scan->stop = 1;
19216 +                       return 0;
19217 +               }
19218 +               if (ret)
19219 +                       return ret;
19220 +
19221 +       } else {
19222 +               /* unformatted position */
19223 +
19224 +               ret = jnode_lock_parent_coord(scan->node, &scan->parent_coord, &scan->parent_lock,
19225 +                                             &scan->parent_load, ZNODE_WRITE_LOCK, try);
19226 +
19227 +               if (IS_CBKERR(ret))
19228 +                       return ret;
19229 +
19230 +               if (ret == CBK_COORD_NOTFOUND) {
19231 +                       /* FIXME(C): check EINVAL, E_DEADLOCK */
19232 +                       ON_TRACE(TRACE_FLUSH,
19233 +                                "flush_scan_common: jnode_lock_parent_coord returned %d\n", ret);
19234 +                       if (!scan_should_link_node(scan))
19235 +                       return ret;
19236 +               }
19237 +               else {
19238 +                       /* parent was found */
19239 +                       set_flush_scan_nstat(scan, LINKED);
19240 +                       ON_TRACE(TRACE_FLUSH,
19241 +                                "flush_scan_common: jnode_lock_parent_coord returned 0\n");
19242 +                       assert("jmacd-8661", other != NULL);
19243 +               }
19244 +
19245 +               /* Duplicate the reference into the other flush_scan. */
19246 +               coord_dup(&other->parent_coord, &scan->parent_coord);
19247 +               copy_lh(&other->parent_lock, &scan->parent_lock);
19248 +               copy_load_count(&other->parent_load, &scan->parent_load);
19249 +               set_flush_scan_nstat(other, scan->nstat);
19250 +       }
19251 + scan:
19252 +       return scan_by_coord(scan);
19253 +}
19254 +
19255 +/* Performs left- or rightward scanning starting from a formatted node.  Follows
19256 +   sibling pointers (read under the tree read lock) as long as:
19257 +
19258 +   - node->left/right is non-NULL
19259 +   - node->left/right is connected, dirty
19260 +   - node->left/right belongs to the same atom
19261 +   - scan has not reached maximum count
19262 +   Returns 0, or the non-zero result of scan_set_current()/scan_unformatted(). */
19263 +static int
19264 +scan_formatted(flush_scan * scan)
19265 +{
19266 +       int ret;
19267 +       znode *neighbor = NULL;
19268 +
19269 +       assert("jmacd-1401", !scan_finished(scan));
19270 +
19271 +       do {
19272 +               znode *node = JZNODE(scan->node);
19273 +
19274 +               /* Node should be connected, but if not stop the scan. */
19275 +               if (!znode_is_connected(node)) {
19276 +                       scan->stop = 1;
19277 +                       break;
19278 +               }
19279 +
19280 +               /* Lock the tree, check-for and reference the next sibling. */
19281 +               RLOCK_TREE(znode_get_tree(node));
19282 +
19283 +               /* It may be that a node is inserted or removed between a node and its
19284 +                  sibling while the tree lock is released, but the flush-scan count
19285 +                  does not need to be precise.  Thus, we release the tree lock as soon as
19286 +                  we get the neighboring node. */
19287 +               neighbor = scanning_left(scan) ? node->left : node->right;
19288 +               if (neighbor != NULL) {
19289 +                       zref(neighbor);
19290 +               }
19291 +
19292 +               RUNLOCK_TREE(znode_get_tree(node));
19293 +
19294 +               /* If neighbor is NULL at the leaf level, need to check for an unformatted
19295 +                  sibling using the parent--break in any case. */
19296 +               if (neighbor == NULL) {
19297 +                       break;
19298 +               }
19299 +
19300 +               ON_TRACE(TRACE_FLUSH_VERB, "format scan %s %s\n",
19301 +                        scanning_left(scan) ? "left" : "right", znode_tostring(neighbor));
19302 +
19303 +               /* Check the condition for moving to the neighbor, break if it is not met.
19304 +                  This also releases (jputs) the neighbor if false. */
19305 +               if (!scan_goto(scan, ZJNODE(neighbor))) {
19306 +                       break;
19307 +               }
19308 +
19309 +               /* Advance the flush_scan state to the neighbor, repeat. */
19310 +               ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
19311 +               if (ret != 0) {
19312 +                       return ret;
19313 +               }
19314 +
19315 +       } while (!scan_finished(scan));
19316 +
19317 +       /* If neighbor is NULL then we reached the end of a formatted region, or else the
19318 +          sibling is not in memory; now check for an extent to the left/right (as long
19319 +          as LEAF_LEVEL). */
19320 +       if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL || scan_finished(scan)) {
19321 +               scan->stop = 1;
19322 +               return 0;
19323 +       }
19324 +       /* Otherwise, calls scan_by_coord for the right(left)most item of the
19325 +          left(right) neighbor on the parent level, then possibly continue. */
19326 +
19327 +       coord_init_invalid(&scan->parent_coord, NULL);
19328 +       return scan_unformatted(scan, NULL);
19329 +}
19330 +
19331 +/* NOTE-EDWARD:
19332 +   Scans adjacent items of the same type and calls the item plugin's scan method for each.
19333 +   Performs left(right)ward scanning starting from a (possibly) unformatted node.  If we start
19334 +   from unformatted node, then we continue only if the next neighbor is also unformatted.
19335 +   When called from scan_formatted, we skip first iteration (to make sure that
19336 +   right(left)most item of the left(right) neighbor on the parent level is of the same
19337 +   type and set appropriate coord).  Returns 0, -E_REPEAT on the race below, or a callee's error. */
19338 +static int
19339 +scan_by_coord(flush_scan * scan)
19340 +{
19341 +       int ret = 0;
19342 +       int scan_this_coord;
19343 +       lock_handle next_lock;
19344 +       load_count next_load;
19345 +       coord_t next_coord;
19346 +       jnode *child;
19347 +       item_plugin *iplug;
19348 +       /* next_lock/next_load are filled only when we step onto a neighboring twig */
19349 +       init_lh(&next_lock);
19350 +       init_load_count(&next_load);
19351 +       scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
19352 +
19353 +        /* initial item plugin: by jnode if UNLINKED, else by parent coord */
19354 +       if (get_flush_scan_nstat(scan) == UNLINKED)
19355 +               iplug = item_plugin_by_jnode(scan->node);
19356 +       else
19357 +               iplug = item_plugin_by_coord(&scan->parent_coord);
19358 +
19359 +       for (; !scan_finished(scan); scan_this_coord = 1) {
19360 +               if (scan_this_coord) {
19361 +                       /* Here we expect the unit to be scannable.  It may not be so due
19362 +                        * to a race with extent->tail conversion.  */
19363 +                       if (iplug->f.scan == NULL) {
19364 +                               scan->stop = 1;
19365 +                               ret = -E_REPEAT;
19366 +                               /* skip the check at the end. */
19367 +                               goto race;
19368 +                       }
19369 +
19370 +                       ret = iplug->f.scan(scan);
19371 +                       if (ret != 0)
19372 +                               goto exit;
19373 +
19374 +                       if (scan_finished(scan)) {
19375 +                               checkchild(scan);
19376 +                               break;
19377 +                       }
19378 +               } else {
19379 +                       /* the same race against truncate as above is possible
19380 +                        * here, it seems */
19381 +
19382 +                       /* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan
19383 +                          the first coordinate. */
19384 +                       assert("jmacd-1231", item_is_internal(&scan->parent_coord));
19385 +               }
19386 +
19387 +               if(iplug->f.utmost_child == NULL || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
19388 +                       /* stop this coord and continue on parent level */
19389 +                       ret = scan_set_current(scan, ZJNODE(zref(scan->parent_coord.node)), 1, NULL);
19390 +                       if (ret != 0)
19391 +                               goto exit;
19392 +                       break;
19393 +               }
19394 +
19395 +               /* Either way, the invariant is that scan->parent_coord is set to the
19396 +                  parent of scan->node. Now get the next unit. */
19397 +               coord_dup(&next_coord, &scan->parent_coord);
19398 +               coord_sideof_unit(&next_coord, scan->direction);
19399 +
19400 +               /* If off-the-end of the twig, try the next twig. */
19401 +               if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
19402 +                       /* We take the write lock because we may start flushing from this
19403 +                        * coordinate. */
19404 +                       ret = neighbor_in_slum(next_coord.node, &next_lock, scan->direction, ZNODE_WRITE_LOCK);
19405 +
19406 +                       if (ret == -E_NO_NEIGHBOR) {
19407 +                               scan->stop = 1;
19408 +                               ret = 0;
19409 +                               break;
19410 +                       }
19411 +
19412 +                       if (ret != 0) {
19413 +                               goto exit;
19414 +                       }
19415 +
19416 +                       ret = incr_load_count_znode(&next_load, next_lock.node);
19417 +                       if (ret != 0) {
19418 +                               goto exit;
19419 +                       }
19420 +
19421 +                       coord_init_sideof_unit(&next_coord, next_lock.node, sideof_reverse(scan->direction));
19422 +               }
19423 +
19424 +               iplug = item_plugin_by_coord(&next_coord);
19425 +               /* NOTE(review): the new iplug's utmost_child is not NULL-checked here (only the previous iplug's was) -- confirm */
19426 +               /* Get the next child. */
19427 +               ret = iplug->f.utmost_child(&next_coord, sideof_reverse(scan->direction), &child);
19428 +               if (ret != 0)
19429 +                       goto exit;
19430 +               /* If the next child is not in memory, or, item_utmost_child
19431 +                  failed (due to race with unlink, most probably), stop
19432 +                  here. */
19433 +               if (child == NULL || IS_ERR(child)) {
19434 +                       scan->stop = 1;
19435 +                       checkchild(scan);
19436 +                       break;
19437 +               }
19438 +
19439 +               assert("nikita-2374", jnode_is_unformatted(child) || jnode_is_znode(child));
19440 +
19441 +               /* See if it is dirty, part of the same atom. */
19442 +               if (!scan_goto(scan, child)) {
19443 +                       checkchild(scan);
19444 +                       break;
19445 +               }
19446 +
19447 +               /* If so, make this child current. */
19448 +               ret = scan_set_current(scan, child, 1, &next_coord);
19449 +               if (ret != 0)
19450 +                       goto exit;
19451 +
19452 +               /* Now continue.  If formatted we release the parent lock and return, then
19453 +                  proceed. */
19454 +               if (jnode_is_znode(child))
19455 +                       break;
19456 +
19457 +               /* Otherwise, repeat the above loop with next_coord. */
19458 +               if (next_load.node != NULL) {
19459 +                       done_lh(&scan->parent_lock);
19460 +                       move_lh(&scan->parent_lock, &next_lock);
19461 +                       move_load_count(&scan->parent_load, &next_load);
19462 +               }
19463 +       }
19464 +
19465 +       assert("jmacd-6233", scan_finished(scan) || jnode_is_znode(scan->node));
19466 + exit:
19467 +       checkchild(scan);
19468 + race: /* skip the above check  */
19469 +       if (jnode_is_znode(scan->node)) { /* current node is formatted: drop parent lock and load */
19470 +               done_lh(&scan->parent_lock);
19471 +               done_load_count(&scan->parent_load);
19472 +       }
19473 +       /* the next_* handles are finished on every path through here */
19474 +       done_load_count(&next_load);
19475 +       done_lh(&next_lock);
19476 +       return ret;
19477 +}
19478 +
19479 +/* FLUSH POS HELPERS */
19480 +
19481 +/* Initialize a flush_position: zero it, then set state, coord, lock handle, load count and preceder hint. */
19482 +static void
19483 +pos_init(flush_pos_t * pos)
19484 +{
19485 +       xmemset(pos, 0, sizeof *pos);
19486 +       /* fields not assigned below keep the zero fill from above */
19487 +       pos->state = POS_INVALID;
19488 +       coord_init_invalid(&pos->coord, NULL);
19489 +       init_lh(&pos->lock);
19490 +       init_load_count(&pos->load);
19491 +
19492 +       blocknr_hint_init(&pos->preceder);
19493 +}
19494 +
19495 +/* The flush loop inside squalloc periodically checks pos_valid to
19496 +   determine when "enough flushing" has been performed.  This will return true until one
19497 +   of the following conditions is met:
19498 +
19499 +   1. the number of flush-queued nodes has reached the kernel-supplied "nr_to_flush"
19500 +   parameter, meaning we have flushed as many blocks as the kernel requested.  When
19501 +   flushing to commit, this parameter is NULL.
19502 +
19503 +   2. pos_stop() is called because squalloc discovers that the "next" node in the
19504 +   flush order is either non-existent, not dirty, or not in the same atom.
19505 +*/
19506 +
19507 +/* The test itself is only "state != POS_INVALID"; nothing else is examined here. */
19508 +static int pos_valid (flush_pos_t * pos)
19509 +{
19510 +       return pos->state != POS_INVALID;
19511 +}
19512 +
19513 +/* Release the resources of a flush_position: stop it, finish the preceder hint, free squeeze data if present.  Called when jnode_flush finishes. */
19514 +static void
19515 +pos_done(flush_pos_t * pos)
19516 +{
19517 +       pos_stop(pos);
19518 +       blocknr_hint_done(&pos->preceder);
19519 +       if (pos->sq) /* squeeze info is optional */
19520 +               free_squeeze_data(pos);
19521 +}
19522 +
19523 +/* Invalidate @pos: finish its lock handle and load count, reset coord, drop the child reference.
19524 +   Called during flush subroutines to terminate the squalloc loop.  Always returns 0. */
19525 +static int
19526 +pos_stop(flush_pos_t * pos)
19527 +{
19528 +       pos->state = POS_INVALID;
19529 +       done_lh(&pos->lock);
19530 +       done_load_count(&pos->load);
19531 +       coord_init_invalid(&pos->coord, NULL);
19532 +       /* release the reference held on the unformatted child, if any */
19533 +       if (pos->child) {
19534 +               jput(pos->child);
19535 +               pos->child = NULL;
19536 +       }
19537 +
19538 +       return 0;
19539 +}
19540 +
19541 +/* Return a pointer to the block allocator hint (preceder) embedded in @pos. */
19542 +reiser4_internal reiser4_blocknr_hint *
19543 +pos_hint(flush_pos_t * pos)
19544 +{
19545 +       return &pos->preceder;
19546 +}
19547 +
19548 +/* Return pos->leaf_relocate: true if we have decided to unconditionally relocate
19549 +   leaf nodes, thus write optimizing. */
19550 +reiser4_internal int
19551 +pos_leaf_relocate(flush_pos_t * pos)
19552 +{
19553 +       return pos->leaf_relocate;
19554 +}
19555 +/* Return the flush queue pointer stored in @pos. */
19556 +reiser4_internal flush_queue_t * pos_fq(flush_pos_t * pos)
19557 +{
19558 +       return pos->fq;
19559 +}
19560 +
19561 +/* Make Linus happy.
19562 +   Local variables:
19563 +   c-indentation-style: "K&R"
19564 +   mode-name: "LC"
19565 +   c-basic-offset: 8
19566 +   tab-width: 8
19567 +   fill-column: 90
19568 +   LocalWords:  preceder
19569 +   End:
19570 +*/
19571 diff -rupN linux-2.6.8-rc3/fs/reiser4/flush.h linux-2.6.8-rc3-a/fs/reiser4/flush.h
19572 --- linux-2.6.8-rc3/fs/reiser4/flush.h  1970-01-01 03:00:00.000000000 +0300
19573 +++ linux-2.6.8-rc3-a/fs/reiser4/flush.h        2004-08-05 21:20:53.164643347 +0400
19574 @@ -0,0 +1,240 @@
19575 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
19576 +
19577 +/* DECLARATIONS: */
19578 +
19579 +#if !defined(__REISER4_FLUSH_H__)
19580 +#define __REISER4_FLUSH_H__
19581 +
19582 +#include "plugin/item/ctail.h" /* for ctail scan/squeeze info */
19583 +/* Linkage status of the node a flush scan started from; stored in flush_scan.nstat. */
19584 +typedef enum {
19585 +       UNLINKED = 0, /* start node is not linked to the tree */
19586 +       LINKED   = 1  /* start node is linked to the tree */
19587 +} flush_scan_node_stat_t;
19588 +
19589 +/* The flush_scan data structure maintains the state of an in-progress flush-scan on a
19590 +   single level of the tree.  A flush-scan is used for counting the number of adjacent
19591 +   nodes to flush, which is used to determine whether we should relocate, and it is also
19592 +   used to find a starting point for flush.  A flush-scan object can scan in both right
19593 +   and left directions via the scan_left() and scan_right() interfaces.  The
19594 +   right- and left-variations are similar but perform different functions.  When scanning
19595 +   left we (optionally perform rapid scanning and then) longterm-lock the endpoint node.
19596 +   When scanning right we are simply counting the number of adjacent, dirty nodes. */
19597 +struct flush_scan {
19598 +
19599 +       /* The current number of nodes scanned on this level. */
19600 +       unsigned count;
19601 +
19602 +       /* There may be a maximum number of nodes for a scan on any single level.  When
19603 +          going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h) */
19604 +       unsigned max_count;
19605 +
19606 +       /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */
19607 +       sideof direction;
19608 +
19609 +       /* Initially @stop is set to false then set true once some condition stops the
19610 +          search (e.g., we found a clean node before reaching max_count or we found a
19611 +          node belonging to another atom). */
19612 +       int stop;
19613 +
19614 +       /* The current scan position.  If @node is non-NULL then its reference count has
19615 +          been incremented to reflect this reference. */
19616 +       jnode *node;
19617 +
19618 +       /* Node-specific linkage status.  This indicates whether the node that flush
19619 +        * started from is linked to the tree (like formatted nodes, extent's jnodes),
19620 +        * or not (like jnodes of a newly created cluster of a cryptcompressed file).
19621 +        * If (nstat == UNLINKED) we don't do right scan. Also we use this status in
19622 +        * scan_by_coord() to assign item plugin */
19623 +       flush_scan_node_stat_t nstat;
19624 +
19625 +       /* A handle for zload/zrelse of current scan position node. */
19626 +       load_count node_load;
19627 +
19628 +       /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the
19629 +          node is locked using this lock handle.  The endpoint needs to be locked for
19630 +          transfer to the flush_position object after scanning finishes. */
19631 +       lock_handle node_lock;
19632 +
19633 +       /* When the position is unformatted: its parent's lock handle, the coordinate
19634 +          within the parent, and the parent's zload/zrelse handle. */
19635 +       lock_handle parent_lock;
19636 +       coord_t parent_coord;
19637 +       load_count parent_load;
19638 +
19639 +       /* The block allocator preceder hint.  Sometimes flush_scan determines what the
19640 +          preceder is and if so it sets it here, after which it is copied into the
19641 +          flush_position.  Otherwise, the preceder is computed later. */
19642 +       reiser4_block_nr preceder_blk;
19643 +};
19644 +/* Return the linkage status (nstat) recorded in @scan. */
19645 +static inline flush_scan_node_stat_t
19646 +get_flush_scan_nstat(flush_scan * scan)
19647 +
19648 +{
19649 +       return scan->nstat;
19650 +}
19651 +/* Store @nstat as the linkage status of @scan. */
19652 +static inline void
19653 +set_flush_scan_nstat(flush_scan * scan, flush_scan_node_stat_t nstat)
19654 +{
19655 +       scan->nstat = nstat;
19656 +}
19657 +/* Per-item squeeze state; the union holds item-type-specific data (only ctail here). */
19658 +typedef struct squeeze_item_info {
19659 +       int mergeable; /* NOTE(review): presumably a mergeability flag; not set or read in this header -- confirm */
19660 +       union {
19661 +               ctail_squeeze_info_t ctail_info;
19662 +       } u;
19663 +} squeeze_item_info_t;
19664 +/* Squeeze state hung off flush_position.sq; read through the item_squeeze_*() and tfm_squeeze_*() accessors. */
19665 +typedef struct squeeze_info {
19666 +       int count;                    /* for squalloc terminating (compared with SQUALLOC_THRESHOLD) */
19667 +       tfm_info_t  * tfm;           /* transform info, indexed by reiser4_compression_id */
19668 +       item_plugin * iplug;         /* current item plugin */
19669 +       squeeze_item_info_t * itm;   /* current item info */
19670 +} squeeze_info_t;
19671 +
19672 +typedef enum flush_position_state {
19673 +       POS_INVALID,            /* Invalid or stopped pos, do not continue slum
19674 +                                * processing */
19675 +       POS_ON_LEAF,            /* pos points to already prepped, locked formatted node at
19676 +                                * leaf level */
19677 +       POS_ON_EPOINT,          /* pos keeps a lock on twig level, "coord" field is used
19678 +                                * to traverse unformatted nodes */
19679 +       POS_TO_LEAF,            /* pos is being moved to leaf level */
19680 +       POS_TO_TWIG,            /* pos is being moved to twig level */
19681 +       POS_END_OF_TWIG,        /* special case of POS_ON_TWIG (not defined in this enum; presumably POS_ON_EPOINT
19682 +                                * -- confirm), when coord is after rightmost unit of the current twig */
19683 +       POS_ON_INTERNAL         /* same as POS_ON_LEAF, but points to internal node */
19684 +
19685 +} flushpos_state_t;
19686 +
19687 +
19688 +
19689 +/* An encapsulation of the current flush point and all the parameters that are passed
19690 +   through the entire squeeze-and-allocate stage of the flush routine.  A single
19691 +   flush_position object is constructed after left- and right-scanning finishes. */
19692 +struct flush_position {
19693 +       flushpos_state_t state; /* POS_INVALID once the position is stopped */
19694 +
19695 +       coord_t coord;          /* coord to traverse unformatted nodes */
19696 +       lock_handle lock;       /* current lock we hold */
19697 +       load_count load;        /* load status for current locked formatted node  */
19698 +
19699 +       jnode * child;          /* for passing a reference to unformatted child
19700 +                                * across pos state changes */
19701 +
19702 +       reiser4_blocknr_hint preceder;  /* The flush 'hint' state. */
19703 +       int leaf_relocate;      /* True if enough leaf-level nodes were
19704 +                                * found to suggest a relocate policy. */
19705 +       long *nr_to_flush;      /* If called under memory pressure,
19706 +                                * indicates how many nodes the VM asked to flush. */
19707 +       int alloc_cnt;          /* The number of nodes allocated during squeeze and allocate. */
19708 +       int prep_or_free_cnt;   /* The number of nodes prepared for write (allocate) or squeezed and freed. */
19709 +       flush_queue_t *fq;      /* flush queue in use; returned by pos_fq() */
19710 +       long *nr_written;       /* number of nodes submitted to disk */
19711 +       int flags;              /* a copy of jnode_flush flags argument */
19712 +
19713 +       znode * prev_twig;      /* previous parent pointer value, used to catch
19714 +                                * processing of new twig node */
19715 +       squeeze_info_t * sq;    /* squeeze info; may be NULL */
19716 +
19717 +       unsigned long pos_in_unit; /* for extents only. Position
19718 +                                     within an extent unit of first
19719 +                                     jnode of slum */
19720 +};
19721 +/* Return the squeeze counter.  pos->sq is dereferenced without a NULL check. */
19722 +static inline int
19723 +item_squeeze_count (flush_pos_t * pos)
19724 +{
19725 +       return pos->sq->count;
19726 +}
19727 +static inline void /* increment the squeeze counter; pos->sq must be non-NULL */
19728 +inc_item_squeeze_count (flush_pos_t * pos)
19729 +{
19730 +       pos->sq->count++;
19731 +}
19732 +static inline void /* set the squeeze counter to @count; pos->sq must be non-NULL */
19733 +set_item_squeeze_count (flush_pos_t * pos, int count)
19734 +{
19735 +       pos->sq->count = count;
19736 +}
19737 +static inline item_plugin * /* current item plugin of the squeeze info; pos->sq must be non-NULL */
19738 +item_squeeze_plug (flush_pos_t * pos)
19739 +{
19740 +       return pos->sq->iplug;
19741 +}
19742 +/* Return the current item info (may be NULL).  pos->sq is dereferenced without a NULL check. */
19743 +static inline squeeze_item_info_t *
19744 +item_squeeze_data (flush_pos_t * pos)
19745 +{
19746 +       return pos->sq->itm;
19747 +}
19748 +/* Return the transform info array pointer.  pos->sq is dereferenced without a NULL check. */
19749 +static inline tfm_info_t *
19750 +tfm_squeeze_data (flush_pos_t * pos)
19751 +{
19752 +       return pos->sq->tfm;
19753 +}
19754 +/* Return the address of transform info slot @idx; neither pos->sq nor pos->sq->tfm is checked. */
19755 +static inline tfm_info_t *
19756 +tfm_squeeze_idx (flush_pos_t * pos, reiser4_compression_id idx)
19757 +{
19758 +       return &pos->sq->tfm[idx];
19759 +}
19760 +/* Return the transform info stored in slot @idx, or 0 when no transform array is attached. */
19761 +static inline tfm_info_t
19762 +tfm_squeeze_pos (flush_pos_t * pos, reiser4_compression_id idx)
19763 +{
19764 +       return (tfm_squeeze_data(pos) ? *tfm_squeeze_idx(pos, idx) : 0);
19765 +}
19766 +
19767 +#define SQUALLOC_THRESHOLD 256  /* meaningful for ctails */
19768 +/* Non-zero when squeeze info exists, no item info is current, and count has reached SQUALLOC_THRESHOLD. */
19769 +static inline int
19770 +should_terminate_squalloc(flush_pos_t * pos)
19771 +{
19772 +       return pos->sq && !item_squeeze_data(pos) && pos->sq->count >= SQUALLOC_THRESHOLD;
19773 +}
19774 +
19775 +void free_squeeze_data(flush_pos_t * pos);
19776 +/* used in extent.c */
19777 +int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size, const coord_t * parent);
19778 +int scan_finished(flush_scan * scan);
19779 +int scanning_left(flush_scan * scan);
19780 +int scan_goto(flush_scan * scan, jnode * tonode);
19781 +txn_atom *atom_locked_by_fq(flush_queue_t * fq);
19782 +
19783 +int init_fqs(void);
19784 +void done_fqs(void);
19785 +
19786 +#if REISER4_TRACE
19787 +const char *jnode_tostring(jnode * node);
19788 +#else
19789 +#define jnode_tostring(n) ""
19790 +#endif
19791 +/* check_preceder(): a single expression with a parenthesized argument and no trailing ';', like the noop variant. */
19792 +#if REISER4_DEBUG
19793 +#define check_preceder(blk) \
19794 +assert("nikita-2588", (blk) < reiser4_block_count(reiser4_get_current_sb()))
19795 +
19796 +extern void check_pos(flush_pos_t *pos);
19797 +#else
19798 +#define check_preceder(b) noop
19799 +#define check_pos(pos) noop
19800 +#endif
19801 +
19802 +/* __REISER4_FLUSH_H__ */
19803 +#endif
19804 +
19805 +/* Make Linus happy.
19806 +   Local variables:
19807 +   c-indentation-style: "K&R"
19808 +   mode-name: "LC"
19809 +   c-basic-offset: 8
19810 +   tab-width: 8
19811 +   fill-column: 90
19812 +   LocalWords:  preceder
19813 +   End:
19814 +*/
19815 diff -rupN linux-2.6.8-rc3/fs/reiser4/flush_queue.c linux-2.6.8-rc3-a/fs/reiser4/flush_queue.c
19816 --- linux-2.6.8-rc3/fs/reiser4/flush_queue.c    1970-01-01 03:00:00.000000000 +0300
19817 +++ linux-2.6.8-rc3-a/fs/reiser4/flush_queue.c  2004-08-05 21:20:53.087659585 +0400
19818 @@ -0,0 +1,758 @@
19819 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
19820 +
19821 +#include "debug.h"
19822 +#include "type_safe_list.h"
19823 +#include "super.h"
19824 +#include "txnmgr.h"
19825 +#include "jnode.h"
19826 +#include "znode.h"
19827 +#include "page_cache.h"
19828 +#include "wander.h"
19829 +#include "vfs_ops.h"
19830 +#include "writeout.h"
19831 +
19832 +#include <linux/bio.h>
19833 +#include <linux/mm.h>
19834 +#include <linux/pagemap.h>
19835 +#include <linux/blkdev.h>
19836 +#include <linux/writeback.h>
19837 +
19838 +/* A flush queue object is an accumulator for keeping jnodes prepared
19839 +   by the jnode_flush() function for writing to disk. Those "queued" jnodes are
19840 +   kept on the flush queue until memory pressure or atom commit asks
19841 +   flush queues to write some or all of their jnodes. */
19842 +
19843 +TYPE_SAFE_LIST_DEFINE(fq, flush_queue_t, alink);
19844 +
19845 +#if REISER4_DEBUG
19846 +#   define spin_ordering_pred_fq(fq)  (1)
19847 +#endif
19848 +
19849 +SPIN_LOCK_FUNCTIONS(fq, flush_queue_t, guard);
19850 +
19851 +/*
19852 +   LOCKING:
19853 +
19854 +   fq->guard spin lock protects fq->atom pointer and nothing else.  fq->prepped
19855 +   list protected by atom spin lock.  fq->prepped list uses the following
19856 +   locking:
19857 +
19858 +   two ways to protect fq->prepped list for read-only list traversal:
19859 +
19860 +   1. atom is spin-locked.
19861 +   2. fq is IN_USE, atom->nr_running_queues increased.
19862 +
19863 +   and one for list modification:
19864 +
19865 +   1. atom is spin-locked and one condition is true: fq is IN_USE or
19866 +      atom->nr_running_queues == 0.
19867 +
19868 +   The deadlock-safe order for flush queues and atoms is: first lock atom, then
19869 +   lock flush queue, then lock jnode.
19870 +*/
19871 +/* Helpers to test, set and clear the FQ_IN_USE bit in fq->state. */
19872 +#define fq_in_use(fq)          ((fq)->state & FQ_IN_USE)
19873 +#define fq_ready(fq)           (!fq_in_use(fq))
19874 +
19875 +#define mark_fq_in_use(fq)     do { (fq)->state |= FQ_IN_USE;    } while (0)
19876 +#define mark_fq_ready(fq)      do { (fq)->state &= ~FQ_IN_USE;   } while (0)
19877 +
19878 +/* Return fq->atom spin-locked, or NULL if the queue has no atom.  @fq must be spin-locked by the caller. */
19879 +reiser4_internal txn_atom *
19880 +atom_get_locked_by_fq(flush_queue_t * fq)
19881 +{
19882 +       /* This code is similar to jnode_get_atom(), look at it for the
19883 +        * explanation. */
19884 +       txn_atom *atom;
19885 +
19886 +       assert("zam-729", spin_fq_is_locked(fq));
19887 +
19888 +       while(1) {
19889 +               atom = fq->atom;
19890 +               if (atom == NULL)
19891 +                       break;
19892 +               /* fast path: try to take the atom lock while still holding the fq lock */
19893 +               if (spin_trylock_atom(atom))
19894 +                       break;
19895 +               /* slow path: pin the atom, drop the fq lock, lock the atom, then re-lock the fq */
19896 +               atomic_inc(&atom->refcount);
19897 +               spin_unlock_fq(fq);
19898 +               LOCK_ATOM(atom);
19899 +               spin_lock_fq(fq);
19900 +               /* fq->atom may have changed while the fq lock was dropped */
19901 +               if (fq->atom == atom) {
19902 +                       atomic_dec(&atom->refcount);
19903 +                       break;
19904 +               }
19905 +               /* it changed: give up this atom and retry with the current fq->atom */
19906 +               spin_unlock_fq(fq);
19907 +               atom_dec_and_unlock(atom);
19908 +               spin_lock_fq(fq);
19909 +       }
19910 +
19911 +       return atom;
19912 +}
19913 +/* Wrapper that runs atom_get_locked_by_fq() via UNDER_SPIN (presumably with fq's spin lock held around the call -- confirm). */
19914 +reiser4_internal txn_atom *
19915 +atom_locked_by_fq(flush_queue_t * fq)
19916 +{
19917 +       return UNDER_SPIN(fq, fq, atom_get_locked_by_fq(fq));
19918 +}
19919 +/* Initialize a newly allocated flush queue: zero it, then set up its counter, list, semaphore and spin lock. */
19920 +static void
19921 +init_fq(flush_queue_t * fq)
19922 +{
19923 +       xmemset(fq, 0, sizeof *fq);
19924 +
19925 +       atomic_set(&fq->nr_submitted, 0);
19926 +       /* the list that queue_jnode() appends to starts out empty */
19927 +       capture_list_init(ATOM_FQ_LIST(fq));
19928 +       /* io_sem starts with a count of 0 */
19929 +       sema_init(&fq->io_sem, 0);
19930 +       spin_fq_init(fq);
19931 +}
19932 +
19933 +/* slab cache from which flush queue objects are allocated (see create_fq/done_fq) */
19934 +static kmem_cache_t *fq_slab;
19935 +/* Create the flush queue slab cache.  Returns 0, or RETERR(-ENOMEM) if creation failed. */
19936 +reiser4_internal int init_fqs(void)
19937 +{
19938 +       fq_slab = kmem_cache_create("fq",
19939 +                                   sizeof (flush_queue_t),
19940 +                                   0,
19941 +                                   SLAB_HWCACHE_ALIGN,
19942 +                                   NULL,
19943 +                                   NULL);
19944 +       return (fq_slab == NULL) ? RETERR(-ENOMEM) : 0;
19945 +}
19946 +/* Destroy the flush queue slab cache; the result of kmem_cache_destroy() is not checked. */
19947 +reiser4_internal void done_fqs(void)
19948 +{
19949 +       kmem_cache_destroy(fq_slab);
19950 +}
19951 +
19952 +/* Allocate a flush queue from fq_slab with allocation flags @gfp and initialize it; returns NULL if allocation fails. */
19953 +static flush_queue_t *
19954 +create_fq(int gfp)
19955 +{
19956 +       flush_queue_t *fq;
19957 +
19958 +       fq = kmem_cache_alloc(fq_slab, gfp);
19959 +       if (fq)
19960 +               init_fq(fq);
19961 +
19962 +       return fq;
19963 +}
19964 +
19965 +/* Increment the num_queued counter of @fq's atom (inside ON_DEBUG, so presumably debug builds only). */
19966 +static void
19967 +count_enqueued_node(flush_queue_t * fq)
19968 +{
19969 +       ON_DEBUG(fq->atom->num_queued++);
19970 +}
19971 +/* Decrement the num_queued counter of @fq's atom (inside ON_DEBUG); asserts it was positive. */
19972 +static void
19973 +count_dequeued_node(flush_queue_t * fq)
19974 +{
19975 +       assert("zam-993", fq->atom->num_queued > 0);
19976 +       ON_DEBUG(fq->atom->num_queued--);
19977 +}
19978 +
19979 +/* Attach @fq to @atom: put it at the front of atom->flush_queues and set fq->atom.  @atom must be spin-locked. */
19980 +static void
19981 +attach_fq(txn_atom * atom, flush_queue_t * fq)
19982 +{
19983 +       assert("zam-718", spin_atom_is_locked(atom));
19984 +       fq_list_push_front(&atom->flush_queues, fq);
19985 +       fq->atom = atom;
19986 +       ON_DEBUG(atom->nr_flush_queues++);
19987 +}
19988 +/* Detach @fq from its atom: unlink it from the list and clear fq->atom under the fq spin lock.  The atom must be spin-locked. */
19989 +static void
19990 +detach_fq(flush_queue_t * fq)
19991 +{
19992 +       assert("zam-731", spin_atom_is_locked(fq->atom));
19993 +       /* fq->atom is changed only with the fq spin lock held */
19994 +       spin_lock_fq(fq);
19995 +       fq_list_remove_clean(fq);
19996 +       assert("vs-1456", fq->atom->nr_flush_queues > 0);
19997 +       ON_DEBUG(fq->atom->nr_flush_queues--);
19998 +       fq->atom = NULL;
19999 +       spin_unlock_fq(fq);
20000 +}
20001 +
20002 +/* destroy flush queue object; it must hold no nodes and have no i/o pending */
20003 +reiser4_internal void
20004 +done_fq(flush_queue_t * fq)
20005 +{
20006 +       assert("zam-763", capture_list_empty(ATOM_FQ_LIST(fq)));
20007 +       assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
20008 +
20009 +       kmem_cache_free(fq_slab, fq);
20010 +}
20011 +
20012 +/* set JNODE_FLUSH_QUEUED on @node and count it as queued for @fq's atom */
20013 +reiser4_internal void
20014 +mark_jnode_queued(flush_queue_t *fq, jnode *node)
20015 +{
20016 +       JF_SET(node, JNODE_FLUSH_QUEUED);
20017 +       count_enqueued_node(fq);
20018 +}
20019 +
20020 +/* Putting jnode into the flush queue. Both atom and jnode should be
20021 +   spin-locked. */
20022 +reiser4_internal void
20023 +queue_jnode(flush_queue_t * fq, jnode * node)
20024 +{
20025 +       assert("zam-711", spin_jnode_is_locked(node));
20026 +       assert("zam-713", node->atom != NULL);
20027 +       assert("zam-712", spin_atom_is_locked(node->atom));
20028 +       assert("zam-714", jnode_is_dirty(node));
20029 +       assert("zam-716", fq->atom != NULL);
20030 +       assert("zam-717", fq->atom == node->atom);
20031 +       assert("zam-907", fq_in_use(fq));
20032 +
20033 +       assert("zam-826", JF_ISSET(node, JNODE_RELOC));
20034 +       assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
20035 +       assert("vs-1481", NODE_LIST(node) != FQ_LIST);
20036 +       /* flag the node, then move it from its current list to the fq's list */
20037 +       mark_jnode_queued(fq, node);
20038 +       capture_list_remove_clean(node);
20039 +       capture_list_push_back(ATOM_FQ_LIST(fq), node);
20040 +       /*XXXX*/ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node), FQ_LIST, 1));
20041 +}
20042 +
20043 +/* one step of waiting for i/o on @fq: 0 if none is pending, else -E_REPEAT */
20044 +static int
20045 +wait_io(flush_queue_t * fq, int *nr_io_errors)
20046 +{
20047 +       assert("zam-738", fq->atom != NULL);
20048 +       assert("zam-739", spin_atom_is_locked(fq->atom));
20049 +       assert("zam-736", fq_in_use(fq));
20050 +       assert("zam-911", capture_list_empty(ATOM_FQ_LIST(fq)));
20051 +
20052 +       if (atomic_read(&fq->nr_submitted) != 0) {
20053 +               struct super_block *super;
20054 +
20055 +               UNLOCK_ATOM(fq->atom);
20056 +               /* the atom lock is dropped before the possible sleep below */
20057 +               assert("nikita-3013", schedulable());
20058 +
20059 +               super = reiser4_get_current_sb();
20060 +
20061 +               /* FIXME: this is instead of blk_run_queues() */
20062 +               blk_run_address_space(get_super_fake(super)->i_mapping);
20063 +               /* NOTE(review): read-only mounts skip the sleep and just retry */
20064 +               if ( !(super->s_flags & MS_RDONLY) )
20065 +                       down(&fq->io_sem);
20066 +
20067 +               /* Ask the caller to re-acquire the locks and call this
20068 +                  function again. Note: this technique is commonly used in
20069 +                  the txnmgr code. */
20070 +               return -E_REPEAT;
20071 +       }
20072 +       /* nothing pending: report counted write errors; atom stays locked */
20073 +       *nr_io_errors += atomic_read(&fq->nr_errors);
20074 +       return 0;
20075 +}
20076 +
20077 +/* if @fq has no i/o pending, detach and free it; else wait_io()'s -E_REPEAT */
20078 +static int
20079 +finish_fq(flush_queue_t * fq, int *nr_io_errors)
20080 +{
20081 +       int ret;
20082 +       txn_atom * atom = fq->atom;
20083 +
20084 +       assert("zam-801", atom != NULL);
20085 +       assert("zam-744", spin_atom_is_locked(atom));
20086 +       assert("zam-762", fq_in_use(fq));
20087 +
20088 +       ret = wait_io(fq, nr_io_errors);
20089 +       if (ret)
20090 +               return ret;
20091 +       /* i/o is complete and the atom is still locked: dispose of the queue */
20092 +       detach_fq(fq);
20093 +       done_fq(fq);
20094 +
20095 +       atom_send_event(atom);
20096 +
20097 +       return 0;
20098 +}
20099 +
20100 +/* wait for all i/o for given atom to be completed; actually does one iteration
20101 +   of that and returns -E_REPEAT (atom unlocked) if more iterations are needed */
20102 +static int
20103 +finish_all_fq(txn_atom * atom, int *nr_io_errors)
20104 +{
20105 +       flush_queue_t *fq;
20106 +
20107 +       assert("zam-730", spin_atom_is_locked(atom));
20108 +       /* no flush queues left: finished, atom remains locked */
20109 +       if (fq_list_empty(&atom->flush_queues))
20110 +               return 0;
20111 +
20112 +       for_all_type_safe_list(fq, &atom->flush_queues, fq) {
20113 +               if (fq_ready(fq)) {
20114 +                       int ret;
20115 +
20116 +                       mark_fq_in_use(fq);
20117 +                       assert("vs-1247", fq->owner == NULL);
20118 +                       ON_DEBUG(fq->owner = current);
20119 +                       ret = finish_fq(fq, nr_io_errors);
20120 +
20121 +                       if ( *nr_io_errors )
20122 +                               reiser4_handle_error();
20123 +
20124 +                       if (ret) {
20125 +                               fq_put(fq);
20126 +                               return ret;
20127 +                       }
20128 +
20129 +                       UNLOCK_ATOM(atom);
20130 +                       /* that queue was freed; the caller re-locks and rescans */
20131 +                       return -E_REPEAT;
20132 +               }
20133 +       }
20134 +
20135 +       /* All flush queues are in use; atom remains locked */
20136 +       return -EBUSY;
20137 +}
20138 +
20139 +/* wait for all i/o of the current atom; -EIO if any write error was counted */
20140 +reiser4_internal int
20141 +current_atom_finish_all_fq(void)
20142 +{
20143 +       txn_atom *atom;
20144 +       int nr_io_errors = 0;
20145 +       int ret = 0;
20146 +
20147 +       do {
20148 +               while (1) {
20149 +                       atom = get_current_atom_locked();
20150 +                       ret = finish_all_fq(atom, &nr_io_errors);
20151 +                       if (ret != -EBUSY)
20152 +                               break;
20153 +                       atom_wait_event(atom);  /* all queues busy: wait, retry */
20154 +               }
20155 +       } while (ret == -E_REPEAT);
20156 +
20157 +       /* we do not need locked atom after this function finishes, SUCCESS or
20158 +          -EBUSY are two return codes when atom remains locked after
20159 +          finish_all_fq */
20160 +       if (!ret)
20161 +               UNLOCK_ATOM(atom);
20162 +
20163 +       assert("nikita-2696", spin_atom_is_not_locked(atom));
20164 +
20165 +       if (ret)
20166 +               return ret;
20167 +
20168 +       if (nr_io_errors)
20169 +               return RETERR(-EIO);
20170 +
20171 +       return 0;
20172 +}
20173 +
20174 +/* set node->atom to @atom for every jnode on @list, each under its jnode lock */
20175 +static void
20176 +scan_fq_and_update_atom_ref(capture_list_head * list, txn_atom * atom)
20177 +{
20178 +       jnode *cur;
20179 +
20180 +       for_all_type_safe_list(capture, list, cur) {
20181 +               LOCK_JNODE(cur);
20182 +               cur->atom = atom;
20183 +               UNLOCK_JNODE(cur);
20184 +       }
20185 +}
20186 +
20187 +/* support for atom fusion: hand all flush queues of @from over to @to */
20188 +reiser4_internal void
20189 +fuse_fq(txn_atom * to, txn_atom * from)
20190 +{
20191 +       flush_queue_t *fq;
20192 +
20193 +       assert("zam-720", spin_atom_is_locked(to));
20194 +       assert("zam-721", spin_atom_is_locked(from));
20195 +
20196 +       /* re-point every queued jnode and every queue at the new atom */
20197 +       for_all_type_safe_list(fq, &from->flush_queues, fq) {
20198 +               scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
20199 +               spin_lock_fq(fq);
20200 +               fq->atom = to;
20201 +               spin_unlock_fq(fq);
20202 +       }
20203 +       /* then move the queues themselves onto @to's list */
20204 +       fq_list_splice(&to->flush_queues, &from->flush_queues);
20205 +
20206 +#if REISER4_DEBUG
20207 +       to->num_queued += from->num_queued;
20208 +       to->nr_flush_queues += from->nr_flush_queues;
20209 +       from->nr_flush_queues = 0;
20210 +#endif
20211 +}
20212 +
20213 +#if REISER4_DEBUG
20214 +int atom_fq_parts_are_clean (txn_atom * atom)
20215 +{
20216 +       assert("zam-915", atom != NULL);
20217 +       return fq_list_empty(&atom->flush_queues);      /* no queues attached */
20218 +}
20219 +#endif
20220 +/* Bio write completion for reiser4: 1 if the bio is not fully done, else 0. */
20221 +static int
20222 +end_io_handler(struct bio *bio, unsigned int bytes_done UNUSED_ARG, int err UNUSED_ARG)
20223 +{
20224 +       int i;
20225 +       int nr_errors = 0;
20226 +       flush_queue_t *fq;
20227 +
20228 +       assert ("zam-958", bio->bi_rw & WRITE);
20229 +
20230 +       /* i/o op. is not fully completed */
20231 +       if (bio->bi_size != 0)
20232 +               return 1;
20233 +
20234 +       /* we expect that bio->bi_private is set to NULL or fq object which is
20235 +        * used for synchronization and error counting. */
20236 +       fq = bio->bi_private;
20237 +       /* Check all elements of io_vec for correct write completion. */
20238 +       for (i = 0; i < bio->bi_vcnt; i += 1) {
20239 +               struct page *pg = bio->bi_io_vec[i].bv_page;
20240 +               /* BIO_UPTODATE is per bio: on failure every page is flagged */
20241 +               if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
20242 +                       SetPageError(pg);
20243 +                       nr_errors++;
20244 +               }
20245 +
20246 +               {
20247 +                       /* jnode WRITEBACK ("write is in progress bit") is
20248 +                        * atomically cleared here. */
20249 +                       jnode *node;
20250 +
20251 +                       assert("zam-736", pg != NULL);
20252 +                       assert("zam-736", PagePrivate(pg));
20253 +                       node = (jnode *) (pg->private);
20254 +
20255 +                       JF_CLR(node, JNODE_WRITEBACK);
20256 +               }
20257 +
20258 +               end_page_writeback(pg);
20259 +               page_cache_release(pg);
20260 +       }
20261 +
20262 +       if (fq) {
20263 +               /* count i/o error in fq object */
20264 +               atomic_add(nr_errors, &fq->nr_errors);
20265 +
20266 +               /* If all write requests registered in this "fq" are done we up
20267 +                * the semaphore. */
20268 +               if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
20269 +                       up(&fq->io_sem);
20270 +       }
20271 +
20272 +       bio_put(bio);
20273 +       return 0;
20274 +}
20275 +
20276 +/* Route @bio completion to end_io_handler() and, when @fq is not NULL, count
20277 +   the bio's bi_vcnt pages as submitted in @fq */
20278 +reiser4_internal void
20279 +add_fq_to_bio(flush_queue_t * fq, struct bio *bio)
20280 +{
20281 +       bio->bi_private = fq;
20282 +       bio->bi_end_io = end_io_handler;
20283 +
20284 +       if (fq)
20285 +               atomic_add(bio->bi_vcnt, &fq->nr_submitted);
20286 +}
20287 +
20288 +/* Move all nodes queued on @fq back to the atom's dirty or clean lists. */
20289 +static void release_prepped_list(flush_queue_t * fq)
20290 +{
20291 +       txn_atom * atom;
20292 +
20293 +       assert ("zam-904", fq_in_use(fq));
20294 +       atom = UNDER_SPIN(fq, fq, atom_get_locked_by_fq(fq));
20295 +
20296 +       while(!capture_list_empty(ATOM_FQ_LIST(fq))) {
20297 +               jnode * cur;
20298 +
20299 +               cur = capture_list_front(ATOM_FQ_LIST(fq));
20300 +               capture_list_remove_clean(cur);
20301 +
20302 +               count_dequeued_node(fq);
20303 +               LOCK_JNODE(cur);
20304 +               assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
20305 +               assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
20306 +               assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
20307 +               JF_CLR(cur, JNODE_FLUSH_QUEUED);
20308 +               /* still-dirty nodes go to their level's dirty list, others to clean */
20309 +               if (JF_ISSET(cur, JNODE_DIRTY)) {
20310 +                       capture_list_push_back(ATOM_DIRTY_LIST(atom, jnode_get_level(cur)), cur);
20311 +                       ON_DEBUG(count_jnode(atom, cur, FQ_LIST, DIRTY_LIST, 1));
20312 +               } else {
20313 +                       capture_list_push_back(ATOM_CLEAN_LIST(atom), cur);
20314 +                       ON_DEBUG(count_jnode(atom, cur, FQ_LIST, CLEAN_LIST, 1));
20315 +               }
20316 +
20317 +               UNLOCK_JNODE(cur);
20318 +       }
20319 +       /* one running queue fewer; send an atom event when it was the last */
20320 +       if (-- atom->nr_running_queues == 0)
20321 +               atom_send_event(atom);
20322 +
20323 +       UNLOCK_ATOM(atom);
20324 +}
20325 +
20326 +/* Submit write requests for nodes on the already filled flush queue @fq.
20327 +
20328 +   @fq: flush queue object which contains jnodes we can (and will) write.
20329 +   @return: whatever write_jnode_list() returns. NOTE(review): the block count
20330 +            looks to be reported via @nr_submitted, not the return - confirm. */
20331 +reiser4_internal int
20332 +write_fq(flush_queue_t * fq, long * nr_submitted, int flags)
20333 +{
20334 +       int ret;
20335 +       txn_atom * atom;
20336 +
20337 +       while (1) {
20338 +               atom = UNDER_SPIN(fq, fq, atom_get_locked_by_fq(fq));
20339 +               assert ("zam-924", atom);
20340 +               /* do not write fq in parallel. */
20341 +               if (atom->nr_running_queues == 0 || !(flags & WRITEOUT_SINGLE_STREAM))
20342 +                       break;
20343 +               atom_wait_event(atom);
20344 +       }
20345 +       /* register this queue as running; release_prepped_list() undoes it */
20346 +       atom->nr_running_queues ++;
20347 +       UNLOCK_ATOM(atom);
20348 +
20349 +       ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
20350 +       release_prepped_list(fq);
20351 +
20352 +       return ret;
20353 +}
20354 +
20355 +/* Getting flush queue object for exclusive use by one thread. May require
20356 +   several iterations which is indicated by -E_REPEAT return code.
20357 +
20358 +   This function does not contain code for obtaining an atom lock because an
20359 +   atom lock is obtained by different ways in different parts of reiser4,
20360 +   usually it is current atom, but we need a possibility for getting fq for the
20361 +   atom of given jnode. */
20362 +reiser4_internal int
20363 +fq_by_atom_gfp(txn_atom * atom, flush_queue_t ** new_fq, int gfp)
20364 +{
20365 +       flush_queue_t *fq;
20366 +
20367 +       assert("zam-745", spin_atom_is_locked(atom));
20368 +       /* first choice: an already attached queue that nobody is using */
20369 +       fq = fq_list_front(&atom->flush_queues);
20370 +       while (!fq_list_end(&atom->flush_queues, fq)) {
20371 +               spin_lock_fq(fq);
20372 +
20373 +               if (fq_ready(fq)) {
20374 +                       mark_fq_in_use(fq);
20375 +                       assert("vs-1246", fq->owner == NULL);
20376 +                       ON_DEBUG(fq->owner = current);
20377 +                       spin_unlock_fq(fq);
20378 +                       /* a queue preallocated on an earlier pass is not needed */
20379 +                       if (*new_fq)
20380 +                               done_fq(*new_fq);
20381 +
20382 +                       *new_fq = fq;
20383 +
20384 +                       return 0;
20385 +               }
20386 +
20387 +               spin_unlock_fq(fq);
20388 +
20389 +               fq = fq_list_next(fq);
20390 +       }
20391 +
20392 +       /* Use previously allocated fq object */
20393 +       if (*new_fq) {
20394 +               mark_fq_in_use(*new_fq);
20395 +               assert("vs-1248", (*new_fq)->owner == 0);
20396 +               ON_DEBUG((*new_fq)->owner = current);
20397 +               attach_fq(atom, *new_fq);
20398 +
20399 +               return 0;
20400 +       }
20401 +       /* nothing usable: drop the atom lock to allocate; the caller retries */
20402 +       UNLOCK_ATOM(atom);
20403 +
20404 +       *new_fq = create_fq(gfp);
20405 +
20406 +       if (*new_fq == NULL)
20407 +               return RETERR(-ENOMEM);
20408 +
20409 +       return RETERR(-E_REPEAT);
20410 +}
20411 +
20412 +reiser4_internal int
20413 +fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq)
20414 +{
20415 +       return fq_by_atom_gfp(atom, new_fq, GFP_KERNEL);        /* default gfp */
20416 +}
20417 +
20418 +/* Take a flush queue of the current atom through fq_by_atom(), retrying while
20419 + * it returns -E_REPEAT. On success the atom stays locked; else ERR_PTR(). */
20420 +reiser4_internal flush_queue_t *
20421 +get_fq_for_current_atom(void)
20422 +{
20423 +       flush_queue_t *queue = NULL;
20424 +       int err;
20425 +
20426 +       for (;;) {
20427 +               err = fq_by_atom(get_current_atom_locked(), &queue);
20428 +               if (err != -E_REPEAT)
20429 +                       break;
20430 +       }
20431 +
20432 +       if (err)
20433 +               return ERR_PTR(err);
20434 +       return queue;
20435 +}
20436 +
20437 +/* Releasing flush queue object after exclusive use; takes no locks itself */
20438 +reiser4_internal void
20439 +fq_put_nolock(flush_queue_t * fq)
20440 +{
20441 +       assert("zam-747", fq->atom != NULL);
20442 +       assert("zam-902", capture_list_empty(ATOM_FQ_LIST(fq)));
20443 +       mark_fq_ready(fq);
20444 +       assert("vs-1245", fq->owner == current);
20445 +       ON_DEBUG(fq->owner = NULL);
20446 +}
20447 +
20448 +reiser4_internal void
20449 +fq_put(flush_queue_t * fq)
20450 +{
20451 +       txn_atom *atom;
20452 +       /* locking variant of fq_put_nolock(): takes the fq and atom locks here */
20453 +       spin_lock_fq(fq);
20454 +       atom = atom_get_locked_by_fq(fq);
20455 +
20456 +       assert("zam-746", atom != NULL);
20457 +
20458 +       fq_put_nolock(fq);
20459 +       atom_send_event(atom);
20460 +       /* both locks taken above are released before returning */
20461 +       spin_unlock_fq(fq);
20462 +       UNLOCK_ATOM(atom);
20463 +}
20464 +
20465 +/* A part of atom object initialization related to the embedded flush queue
20466 +   list head: the atom starts with an empty list of flush queues */
20467 +
20468 +reiser4_internal void
20469 +init_atom_fq_parts(txn_atom * atom)
20470 +{
20471 +       fq_list_init(&atom->flush_queues);
20472 +}
20473 +
20474 +/* Get exclusive use of a flush queue of the atom that spin-locked @node is in.
20475 + * 0 with *@fq set: atom and @node are locked. 0 with *@fq == NULL: @node has
20476 + * no atom and is left unlocked. Otherwise an error from fq_by_atom_gfp(). */
20477 +reiser4_internal int fq_by_jnode_gfp(jnode * node, flush_queue_t ** fq, int gfp)
20478 +{
20479 +       txn_atom * atom;
20480 +       int ret;
20481 +
20482 +       assert("zam-835", spin_jnode_is_locked(node));
20483 +
20484 +       *fq = NULL;
20485 +
20486 +       while (1) {
20487 +               /* begin with taking lock on atom */
20488 +               atom = jnode_get_atom(node);
20489 +               UNLOCK_JNODE(node);
20490 +
20491 +               if (atom == NULL) {
20492 +                       /* jnode does not point to the atom anymore, it is
20493 +                        * possible because jnode lock could be removed for a
20494 +                        * time in atom_get_locked_by_jnode() */
20495 +                       if (*fq) {
20496 +                               done_fq(*fq);
20497 +                               *fq = NULL;
20498 +                       }
20499 +                       return 0;
20500 +               }
20501 +
20502 +               /* atom lock is required for taking flush queue */
20503 +               ret = fq_by_atom_gfp(atom, fq, gfp);
20504 +
20505 +               if (ret) {
20506 +                       if (ret == -E_REPEAT)
20507 +                               /* atom lock was released for doing memory
20508 +                                * allocation, start with locked jnode one more
20509 +                                * time */
20510 +                               goto lock_again;
20511 +                       return ret;
20512 +               }
20513 +
20514 +               /* It is correct to lock atom first, then lock a jnode */
20515 +               LOCK_JNODE(node);
20516 +
20517 +               if (node->atom == atom)
20518 +                       break;  /* Yes! it is our jnode. We got all of them:
20519 +                                * flush queue, and both locked atom and
20520 +                                * jnode */
20521 +
20522 +               /* wrong atom: drop both locks first, because fq_put() takes
20523 +                * and releases the atom lock itself, then restart. */
20524 +               UNLOCK_JNODE(node);
20525 +               UNLOCK_ATOM(atom);
20526 +
20527 +               /* clear the caller's pointer, not the local parameter */
20528 +               fq_put(*fq);
20529 +               *fq = NULL;
20530 +
20531 +       lock_again:
20532 +               LOCK_JNODE(node);
20533 +       }
20534 +
20535 +       return 0;
20536 +}
20537 +
20538 +reiser4_internal int fq_by_jnode(jnode * node, flush_queue_t ** fq)
20539 +{
20540 +        return fq_by_jnode_gfp(node, fq, GFP_KERNEL);  /* default gfp flags */
20541 +}
20542 +
20543 +
20544 +#if REISER4_DEBUG
20545 +/* debug check: warn when atom->fq differs from the real number of queued nodes */
20546 +void check_fq(const txn_atom *atom)
20547 +{
20548 +       /* check number of nodes on all atom's flush queues */
20549 +       flush_queue_t *fq;
20550 +       int count;
20551 +       jnode *node;
20552 +
20553 +       count = 0;
20554 +       for_all_type_safe_list(fq, &atom->flush_queues, fq) {
20555 +               spin_lock_fq(fq);
20556 +               for_all_type_safe_list(capture, ATOM_FQ_LIST(fq), node)
20557 +                       count ++;
20558 +               spin_unlock_fq(fq);
20559 +       }
20560 +       if (count != atom->fq)
20561 +               warning("", "fq counter %d, real %d\n", atom->fq, count);
20562 +
20563 +}
20564 +
20565 +#endif
20566 +
20567 +/* Make Linus happy.
20568 +   Local variables:
20569 +   c-indentation-style: "K&R"
20570 +   mode-name: "LC"
20571 +   c-basic-offset: 8
20572 +   tab-width: 8
20573 +   fill-column: 80
20574 +   scroll-step: 1
20575 +   End:
20576 +*/
20577 diff -rupN linux-2.6.8-rc3/fs/reiser4/forward.h linux-2.6.8-rc3-a/fs/reiser4/forward.h
20578 --- linux-2.6.8-rc3/fs/reiser4/forward.h        1970-01-01 03:00:00.000000000 +0300
20579 +++ linux-2.6.8-rc3-a/fs/reiser4/forward.h      2004-08-05 21:20:53.428587675 +0400
20580 @@ -0,0 +1,258 @@
20581 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
20582 +
20583 +/* Forward declarations. Thank you Kernighan. */
20584 +
20585 +#if !defined( __REISER4_FORWARD_H__ )
20586 +#define __REISER4_FORWARD_H__
20587 +
20588 +#include <asm/errno.h>
20589 +
20590 +typedef struct zlock zlock;
20591 +typedef struct lock_stack lock_stack;
20592 +typedef struct lock_handle lock_handle;
20593 +typedef struct znode znode;
20594 +typedef struct flow flow_t;
20595 +typedef struct coord coord_t;
20596 +typedef struct tree_access_pointer tap_t;
20597 +typedef struct item_coord item_coord;
20598 +typedef struct shift_params shift_params;
20599 +typedef struct reiser4_object_create_data reiser4_object_create_data;
20600 +typedef union reiser4_plugin reiser4_plugin;
20601 +typedef int reiser4_plugin_id;
20602 +typedef struct item_plugin item_plugin;
20603 +typedef struct jnode_plugin jnode_plugin;
20604 +typedef struct reiser4_item_data reiser4_item_data;
20605 +typedef union reiser4_key reiser4_key;
20606 +typedef union reiser4_dblock_nr reiser4_dblock_nr;
20607 +typedef struct reiser4_tree reiser4_tree;
20608 +typedef struct carry_cut_data carry_cut_data;
20609 +typedef struct carry_kill_data carry_kill_data;
20610 +typedef struct carry_tree_op carry_tree_op;
20611 +typedef struct carry_tree_node carry_tree_node;
20612 +typedef struct carry_plugin_info carry_plugin_info;
20613 +typedef struct reiser4_journal reiser4_journal;
20614 +typedef struct txn_atom txn_atom;
20615 +typedef struct txn_handle txn_handle;
20616 +typedef struct txn_mgr txn_mgr;
20617 +typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
20618 +typedef struct reiser4_context reiser4_context;
20619 +typedef struct carry_level carry_level;
20620 +typedef struct blocknr_set blocknr_set;
20621 +typedef struct blocknr_set_entry blocknr_set_entry;
20622 +/* super_block->s_fs_info points to this */
20623 +typedef struct reiser4_super_info_data reiser4_super_info_data;
20624 +/* next two objects are fields of reiser4_super_info_data */
20625 +typedef struct reiser4_oid_allocator reiser4_oid_allocator;
20626 +typedef struct reiser4_space_allocator reiser4_space_allocator;
20627 +typedef struct reiser4_file_fsdata reiser4_file_fsdata;
20628 +
20629 +typedef struct flush_scan flush_scan;
20630 +typedef struct flush_position flush_pos_t;
20631 +
20632 +typedef unsigned short pos_in_node_t;
20633 +#define MAX_POS_IN_NODE 65535
20634 +
20635 +typedef struct jnode jnode;
20636 +typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
20637 +
20638 +typedef struct uf_coord uf_coord_t;
20639 +typedef struct hint hint_t;
20640 +
20641 +typedef struct ktxnmgrd_context ktxnmgrd_context;
20642 +
20643 +typedef struct reiser4_xattr_plugin reiser4_xattr_plugin;
20644 +
20645 +struct inode;
20646 +struct page;
20647 +struct file;
20648 +struct dentry;
20649 +struct super_block;
20650 +
20651 +/* return values of coord_by_key() (cbk == coord_by_key): 0 or -ENOENT */
20652 +typedef enum {
20653 +       CBK_COORD_FOUND = 0,
20654 +       CBK_COORD_NOTFOUND = -ENOENT,
20655 +} lookup_result;
20656 +
20657 +/* results of a lookup in a directory file; each one aliases an errno value */
20658 +typedef enum {
20659 +       FILE_NAME_FOUND = 0,
20660 +       FILE_NAME_NOTFOUND = -ENOENT,
20661 +       FILE_IO_ERROR = -EIO,   /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
20662 +       FILE_OOM = -ENOMEM      /* FIXME: same remark as for FILE_IO_ERROR above. */
20663 +} file_lookup_result;
20664 +
20665 +/* behaviors of lookup. If the coord we are looking for is actually in the
20666 +    tree, both behaviors give the same result. */
20667 +typedef enum {
20668 +       /* search exactly for the coord with key given */
20669 +       FIND_EXACT,
20670 +       /* search for coord with the maximal key not greater than one
20671 +           given */
20672 +       FIND_MAX_NOT_MORE_THAN  /*LEFT_SLANT_BIAS */
20673 +} lookup_bias;
20674 +
20675 +typedef enum {
20676 +       /* number of the leaf level of the tree.
20677 +          The fake root has (tree_level=0). */
20678 +       LEAF_LEVEL = 1,
20679 +
20680 +       /* number of the level one above the leaf level of the tree.
20681 +
20682 +          It is assumed that the internal tree used by reiser4 to store file
20683 +          system data and meta data will have height 2 initially (when
20684 +          created by mkfs).
20685 +       */
20686 +       TWIG_LEVEL = 2,
20687 +} tree_level;
20688 +
20689 +/* The "real" maximum ztree height is the 0-origin size of any per-level
20690 +   array, since the zero'th level is not used. */
20691 +#define REAL_MAX_ZTREE_HEIGHT     (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
20692 +
20693 +/* enumeration of possible mutual positions of item and coord.  This enum is
20694 +    the return type of the ->is_in_item() item plugin method (see there). */
20695 +typedef enum {
20696 +       /* coord is on the left of an item */
20697 +       IP_ON_THE_LEFT,
20698 +       /* coord is inside item */
20699 +       IP_INSIDE,
20700 +       /* coord is inside item, but to the right of the rightmost unit of
20701 +           this item */
20702 +       IP_RIGHT_EDGE,
20703 +       /* coord is on the right of an item */
20704 +       IP_ON_THE_RIGHT
20705 +} interposition;
20706 +
20707 +/* type of lock to acquire on znode before returning it to caller */
20708 +typedef enum {
20709 +       ZNODE_NO_LOCK = 0,      /* return the znode without locking it */
20710 +       ZNODE_READ_LOCK = 1,
20711 +       ZNODE_WRITE_LOCK = 2,
20712 +} znode_lock_mode;
20713 +
20714 +/* type of lock request; apart from LOPRI (0) the values are distinct bits */
20715 +typedef enum {
20716 +       ZNODE_LOCK_LOPRI = 0,
20717 +       ZNODE_LOCK_HIPRI = (1 << 0),
20718 +
20719 +       /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep
20720 +          waiting for the lock to become available.  If the lock is unavailable, that call will immediately
20721 +          return the value -E_REPEAT. */
20722 +       ZNODE_LOCK_NONBLOCK = (1 << 1),
20723 +       /* An option for longterm_lock_znode which prevents atom fusion */
20724 +       ZNODE_LOCK_DONT_FUSE = (1 << 2)
20725 +} znode_lock_request;
20726 +
20727 +typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
20728 +
20729 +/* used to specify direction of shift. The values must be 1 (left) and -1 (right) */
20730 +typedef enum {
20731 +       SHIFT_LEFT = 1,
20732 +       SHIFT_RIGHT = -1
20733 +} shift_direction;
20734 +
20735 +typedef enum {
20736 +       LEFT_SIDE,
20737 +       RIGHT_SIDE
20738 +} sideof;
20739 +/* round @value up to a multiple of @order (a power of two; evaluated twice) */
20740 +#define round_up( value, order )                                               \
20741 +       ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) &        \
20742 +                            ~( ( order ) - 1 ) ) )
20743 +
20744 +/* values returned by squalloc_right_neighbor and its auxiliary functions */
20745 +typedef enum {
20746 +       /* unit of internal item is moved */
20747 +       SUBTREE_MOVED = 0,
20748 +       /* nothing else can be squeezed into left neighbor */
20749 +       SQUEEZE_TARGET_FULL = 1,
20750 +       /* all content of node is squeezed into its left neighbor */
20751 +       SQUEEZE_SOURCE_EMPTY = 2,
20752 +       /* one more item is copied (this is only returned by
20753 +          allocate_and_copy_extent to squalloc_twig) */
20754 +       SQUEEZE_CONTINUE = 3
20755 +} squeeze_result;
20756 +
20757 +/* Do not change item ids: doing so changes the format. Note 0x4 is not used. */
20758 +typedef enum {
20759 +       STATIC_STAT_DATA_ID = 0x0,
20760 +       SIMPLE_DIR_ENTRY_ID = 0x1,
20761 +       COMPOUND_DIR_ID     = 0x2,
20762 +       NODE_POINTER_ID     = 0x3,
20763 +       EXTENT_POINTER_ID   = 0x5,
20764 +       FORMATTING_ID       = 0x6,
20765 +       CTAIL_ID            = 0x7,
20766 +       BLACK_BOX_ID        = 0x8,
20767 +       LAST_ITEM_ID        = 0x9
20768 +} item_id;
20769 +
20770 +/* Flags passed to jnode_flush() to allow it to distinguish default settings based on
20771 +   whether commit() was called or VM memory pressure was applied. Each value is a separate bit. */
20772 +typedef enum {
20773 +       /* submit flush queue to disk at jnode_flush completion */
20774 +       JNODE_FLUSH_WRITE_BLOCKS = 1,
20775 +
20776 +       /* flush is called for commit */
20777 +       JNODE_FLUSH_COMMIT = 2,
20778 +       /* not implemented */
20779 +       JNODE_FLUSH_MEMORY_FORMATTED = 4,
20780 +
20781 +       /* not implemented */
20782 +       JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
20783 +} jnode_flush_flags;
20784 +
20785 +/* Flags to insert/paste carry operations. Currently they are only used in
20786 +   flushing code, but in future, they can be used to optimize for repetitive
20787 +   accesses.  */
20788 +typedef enum {
20789 +       /* carry is not allowed to shift data to the left when trying to find
20790 +          free space  */
20791 +       COPI_DONT_SHIFT_LEFT = (1 << 0),
20792 +       /* carry is not allowed to shift data to the right when trying to find
20793 +          free space  */
20794 +       COPI_DONT_SHIFT_RIGHT = (1 << 1),
20795 +       /* carry is not allowed to allocate new node(s) when trying to find
20796 +          free space */
20797 +       COPI_DONT_ALLOCATE = (1 << 2),
20798 +       /* try to load left neighbor if it's not in a cache */
20799 +       COPI_LOAD_LEFT = (1 << 3),
20800 +       /* try to load right neighbor if it's not in a cache */
20801 +       COPI_LOAD_RIGHT = (1 << 4),
20802 +       /* shift insertion point to the left neighbor */
20803 +       COPI_GO_LEFT = (1 << 5),
20804 +       /* shift insertion point to the right neighbor */
20805 +       COPI_GO_RIGHT = (1 << 6),
20806 +       /* try to step back into original node if insertion into new node
20807 +          fails after shifting data there. */
20808 +       COPI_STEP_BACK = (1 << 7)
20809 +} cop_insert_flag;
20810 +
20811 +typedef enum {
20812 +       SAFE_UNLINK,   /* safe-link for unlink */
20813 +       SAFE_TRUNCATE  /* safe-link for truncate */
20814 +} reiser4_safe_link_t;
20815 +
20816 +/* identifies which of an atom's lists a jnode is on */
20817 +typedef enum {
20818 +       NOT_CAPTURED,
20819 +       DIRTY_LIST,
20820 +       CLEAN_LIST,
20821 +       FQ_LIST,
20822 +       WB_LIST,
20823 +       OVRWR_LIST,
20824 +       PROTECT_LIST
20825 +} atom_list;
20826 +
20827 +/* __REISER4_FORWARD_H__ */
20828 +#endif
20829 +
20830 +/* Make Linus happy.
20831 +   Local variables:
20832 +   c-indentation-style: "K&R"
20833 +   mode-name: "LC"
20834 +   c-basic-offset: 8
20835 +   tab-width: 8
20836 +   fill-column: 120
20837 +   End:
20838 +*/
20839 diff -rupN linux-2.6.8-rc3/fs/reiser4/init_super.c linux-2.6.8-rc3-a/fs/reiser4/init_super.c
20840 --- linux-2.6.8-rc3/fs/reiser4/init_super.c     1970-01-01 03:00:00.000000000 +0300
20841 +++ linux-2.6.8-rc3-a/fs/reiser4/init_super.c   2004-08-05 21:20:52.965685312 +0400
20842 @@ -0,0 +1,562 @@
20843 +/* Copyright by Hans Reiser, 2003 */
20844 +
20845 +#include "forward.h"
20846 +#include "debug.h"
20847 +#include "dformat.h"
20848 +#include "txnmgr.h"
20849 +#include "jnode.h"
20850 +#include "znode.h"
20851 +#include "tree.h"
20852 +#include "vfs_ops.h"
20853 +#include "inode.h"
20854 +#include "page_cache.h"
20855 +#include "ktxnmgrd.h"
20856 +#include "super.h"
20857 +#include "reiser4.h"
20858 +#include "kattr.h"
20859 +#include "entd.h"
20860 +#include "emergency_flush.h"
20861 +#include "prof.h"
20862 +#include "repacker.h"
20863 +#include "safe_link.h"
20864 +#include "plugin/dir/dir.h"
20865 +
20866 +#include <linux/errno.h>
20867 +#include <linux/types.h>
20868 +#include <linux/mount.h>
20869 +#include <linux/vfs.h>
20870 +#include <linux/mm.h>
20871 +#include <linux/buffer_head.h>
20872 +#include <linux/rcupdate.h>
20873 +
20874 +#define _INIT_PARAM_LIST (struct super_block * s, reiser4_context * ctx, void * data, int silent)  /* parameter list shared by every _init_<subsys>() */
20875 +#define _DONE_PARAM_LIST (struct super_block * s)  /* parameter list shared by every _done_<subsys>() */
20876 +
20877 +#define _INIT_(subsys) static int _init_##subsys _INIT_PARAM_LIST  /* header of: static int _init_<subsys>(s, ctx, data, silent) */
20878 +#define _DONE_(subsys) static void _done_##subsys _DONE_PARAM_LIST  /* header of: static void _done_<subsys>(s) */
20879 +
20880 +#define _DONE_EMPTY(subsys) _DONE_(subsys) {}  /* defines a _done_<subsys>() whose body is empty */
20881 +
20882 +_INIT_(mount_flags_check)  /* first entry of subsys_array[]; currently accepts every mount */
20883 +{
20884 +/*     if (bdev_read_only(s->s_bdev) || (s->s_flags & MS_RDONLY)) {
20885 +               warning("nikita-3322", "Readonly reiser4 is not yet supported");
20886 +               return RETERR(-EROFS);
20887 +       }*/
20888 +       return 0;  /* the read-only rejection above is commented out, so this never fails */
20889 +}
20890 +
20891 +_DONE_EMPTY(mount_flags_check)  /* nothing to undo */
20892 +
20893 +_INIT_(sinfo)  /* allocate and zero the reiser4 super-block private data and attach it to s->s_fs_info; -ENOMEM on failure */
20894 +{
20895 +       reiser4_super_info_data * sbinfo;
20896 +
20897 +       sbinfo = kmalloc(sizeof (reiser4_super_info_data), GFP_KERNEL);
20898 +       if (!sbinfo)
20899 +               return RETERR(-ENOMEM);
20900 +
20901 +       s->s_fs_info = sbinfo;
20902 +       s->s_op = NULL;  /* no super operations yet; _done_context() tests s->s_op before using it */
20903 +       xmemset(sbinfo, 0, sizeof (*sbinfo));
20904 +
20905 +       ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));  /* list walked by _done_context() in debug builds */
20906 +       ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
20907 +
20908 +       sema_init(&sbinfo->delete_sema, 1);  /* both semaphores start with a count of 1 */
20909 +       sema_init(&sbinfo->flush_sema, 1);
20910 +       spin_super_init(sbinfo);
20911 +       spin_super_eflush_init(sbinfo);
20912 +
20913 +       return 0;
20914 +}
20915 +
20916 +_DONE_(sinfo)  /* free the private data attached by _init_sinfo() and clear s->s_fs_info */
20917 +{
20918 +       assert("zam-990", s->s_fs_info != NULL);
20919 +       rcu_barrier();  /* NOTE(review): presumably lets pending RCU callbacks finish before the kfree() -- confirm */
20920 +       kfree(s->s_fs_info);
20921 +       s->s_fs_info = NULL;  /* do not leave a dangling pointer in the super block */
20922 +}
20923 +
20924 +_INIT_(stat)  /* run reiser4_stat_init() on the ->stats member of the super-block private data */
20925 +{
20926 +       return reiser4_stat_init(&get_super_private(s)->stats);  /* its result is the result of this step */
20927 +}
20928 +
20929 +_DONE_(stat)  /* counterpart of _init_stat(): run reiser4_stat_done() on the same ->stats member */
20930 +{
20931 +       reiser4_stat_done(&get_super_private(s)->stats);
20932 +}
20933 +
20934 +_INIT_(context)  /* initialise the caller-supplied @ctx for @s; reiser4_fill_super() passes a context that lives on its stack */
20935 +{
20936 +       return init_context(ctx, s);
20937 +}
20938 +
20939 +_DONE_(context)  /* undo _init_context(): close the log, optionally report leftovers, then drop the current context */
20940 +{
20941 +       reiser4_super_info_data * sbinfo;
20942 +
20943 +       sbinfo = get_super_private(s);
20944 +
20945 +       close_log_file(&sbinfo->log_file);  /* _done_parse_options() issues the same call */
20946 +
20947 +       if (reiser4_is_debugged(s, REISER4_STATS_ON_UMOUNT))  /* statistics are printed only when this debug flag is set */
20948 +               reiser4_print_stats();
20949 +
20950 +       /* we don't want ->write_super to be called any more. */
20951 +       if (s->s_op)  /* s_op is NULL until some later step installs super operations */
20952 +               s->s_op->write_super = NULL;
20953 +#if REISER4_DEBUG  /* debug builds only: report jnodes and kmalloc()ed bytes that outlived the teardown */
20954 +       {
20955 +               struct list_head *scan;
20956 +
20957 +               /* print jnodes that survived umount. */
20958 +               list_for_each(scan, &sbinfo->all_jnodes) {
20959 +                       jnode *busy;
20960 +
20961 +                       busy = list_entry(scan, jnode, jnodes);
20962 +                       info_jnode("\nafter umount", busy);
20963 +               }
20964 +       }
20965 +       if (sbinfo->kmalloc_allocated > 0)
20966 +               warning("nikita-2622",
20967 +                       "%i bytes still allocated", sbinfo->kmalloc_allocated);
20968 +#endif
20969 +
20970 +       get_current_context()->trans = NULL;  /* clear the context's ->trans before done_context() is called on it */
20971 +       done_context(get_current_context());
20972 +}
20973 +
20974 +_INIT_(parse_options)  /* hand the mount data @data to reiser4_parse_options() */
20975 +{
20976 +       return reiser4_parse_options(s, data);  /* its result is the result of this step */
20977 +}
20978 +
20979 +_DONE_(parse_options)  /* close the log file kept in the super-block private data */
20980 +{
20981 +       close_log_file(&get_super_private(s)->log_file);  /* NOTE(review): _done_context() closes the same log file -- confirm a second close is harmless */
20982 +}
20983 +
20984 +_INIT_(object_ops)  /* fill the ->ops member of the super-block private data via build_object_ops(); cannot fail */
20985 +{
20986 +       build_object_ops(s, &get_super_private(s)->ops);
20987 +       return 0;
20988 +}
20989 +
20990 +_DONE_EMPTY(object_ops)  /* nothing to undo */
20991 +
20992 +_INIT_(read_super)  /* read and validate the master super block, then select the disk format plugin; 0 or -EIO/-EINVAL */
20993 +{
20994 +       struct buffer_head *super_bh;
20995 +       struct reiser4_master_sb *master_sb;
20996 +       int plugin_id;
20997 +       reiser4_super_info_data * sbinfo = get_super_private(s);
20998 +       unsigned long blocksize;
20999 +
21000 + read_super_block:
21001 +#ifdef CONFIG_REISER4_BADBLOCKS
21002 +       if ( sbinfo->altsuper )
21003 +               super_bh = sb_bread(s, (sector_t) (sbinfo->altsuper >> s->s_blocksize_bits));
21004 +       else
21005 +#endif
21006 +               /* look for reiser4 magic at hardcoded place */
21007 +               super_bh = sb_bread(s, (sector_t) (REISER4_MAGIC_OFFSET / s->s_blocksize));
21008 +
21009 +       if (!super_bh)
21010 +               return RETERR(-EIO);
21011 +
21012 +       master_sb = (struct reiser4_master_sb *) super_bh->b_data;
21013 +       /* check reiser4 magic string */
21014 +       if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING, sizeof(REISER4_SUPER_MAGIC_STRING))) {
21015 +               /* only PAGE_CACHE_SIZE blocks are accepted; if @s uses another block size, switch it and re-read */
21016 +               blocksize = d16tocpu(&master_sb->blocksize);
21017 +               if (blocksize != PAGE_CACHE_SIZE) {
21018 +                       if (!silent)
21019 +                               warning("nikita-2609", "%s: wrong block size %lu\n", s->s_id, blocksize);
21020 +                       brelse(super_bh);
21021 +                       return RETERR(-EINVAL);
21022 +               }
21023 +               if (blocksize != s->s_blocksize) {
21024 +                       brelse(super_bh);
21025 +                       if (!sb_set_blocksize(s, (int) blocksize))
21026 +                               return RETERR(-EINVAL);
21027 +                       goto read_super_block;
21028 +               }
21029 +
21030 +               plugin_id = d16tocpu(&master_sb->disk_plugin_id);
21031 +               /* the id is read from disk: refuse unsupported ids at run time, an assertion vanishes in non-debug builds */
21032 +               if (plugin_id != FORMAT40_ID) {
21033 +                       if (!silent)
21034 +                               warning("vs-476", "%s: unsupported disk format plugin %i", s->s_id, plugin_id);
21035 +                       brelse(super_bh);
21036 +                       return RETERR(-EINVAL);
21037 +               }
21038 +               sbinfo->df_plug = disk_format_plugin_by_id(plugin_id);
21039 +               sbinfo->diskmap_block = d64tocpu(&master_sb->diskmap);
21040 +               brelse(super_bh);
21041 +       } else {
21042 +               if (!silent)
21043 +                       warning("nikita-2608", "Wrong master super block magic.");
21044 +               /* no standard reiser4 super block found */
21045 +               brelse(super_bh);
21046 +               /* FIXME-VS: call guess method for all available layout plugins */
21047 +               /* umka (2002.06.12) Is it possible when format-specific super
21048 +                  block exists but there no master super block? */
21049 +               return RETERR(-EINVAL);
21050 +       }
21051 +       return 0;
21052 +}
21053 +
21054 +_DONE_EMPTY(read_super)  /* nothing to undo: _init_read_super() releases its buffer head on every path */
21055 +
21056 +_INIT_(tree0)  /* run init_tree_0() on the tree embedded in the super-block private data; cannot fail */
21057 +{
21058 +       reiser4_super_info_data * sbinfo = get_super_private(s);
21059 +
21060 +       init_tree_0(&sbinfo->tree);
21061 +       sbinfo->tree.super = s;  /* back pointer from the tree to its super block */
21062 +       return 0;
21063 +}
21064 +
21065 +_DONE_EMPTY(tree0)  /* nothing to undo here; done_tree() is called from _done_formatted_fake() */
21066 +
21067 +_INIT_(txnmgr)  /* run txnmgr_init() on the ->tmgr member of the super-block private data; cannot fail */
21068 +{
21069 +       txnmgr_init(&get_super_private(s)->tmgr);
21070 +       return 0;
21071 +}
21072 +
21073 +_DONE_(txnmgr)  /* counterpart of _init_txnmgr(): run txnmgr_done() on the same ->tmgr */
21074 +{
21075 +       txnmgr_done(&get_super_private(s)->tmgr);
21076 +}
21077 +
21078 +_INIT_(ktxnmgrd_context)  /* run init_ktxnmgrd_context() on ->tmgr and return its result */
21079 +{
21080 +       return init_ktxnmgrd_context(&get_super_private(s)->tmgr);
21081 +}
21082 +
21083 +_DONE_(ktxnmgrd_context)  /* counterpart of _init_ktxnmgrd_context() */
21084 +{
21085 +       done_ktxnmgrd_context(&get_super_private(s)->tmgr);
21086 +}
21087 +
21088 +_INIT_(ktxnmgrd)  /* run start_ktxnmgrd() on ->tmgr and return its result */
21089 +{
21090 +       return start_ktxnmgrd(&get_super_private(s)->tmgr);
21091 +}
21092 +
21093 +_DONE_(ktxnmgrd)  /* counterpart of _init_ktxnmgrd(): run stop_ktxnmgrd() on the same ->tmgr */
21094 +{
21095 +       stop_ktxnmgrd(&get_super_private(s)->tmgr);
21096 +}
21097 +
21098 +_INIT_(formatted_fake)  /* run init_formatted_fake() for @s and return its result */
21099 +{
21100 +       return init_formatted_fake(s);
21101 +}
21102 +
21103 +_DONE_(formatted_fake)  /* tear down the tree (done_tree()) and then what init_formatted_fake() set up */
21104 +{
21105 +       reiser4_super_info_data * sbinfo;
21106 +
21107 +       sbinfo = get_super_private(s);
21108 +
21109 +       rcu_barrier();
21110 +
21111 +       /* done_formatted_fake just has finished with last jnodes (bitmap
21112 +        * ones) -- NOTE(review): done_formatted_fake() is only called at the end of this function; this remark looks stale */
21113 +       done_tree(&sbinfo->tree);
21114 +       /* call finish_rcu(), because some znode were "released" in
21115 +        * done_tree(). NOTE(review): the call that follows is rcu_barrier(), not finish_rcu() */
21116 +       rcu_barrier();
21117 +       done_formatted_fake(s);
21118 +}
21119 +
21120 +_INIT_(entd)  /* run init_entd_context() for @s; its outcome is not checked and this step always reports success */
21121 +{
21122 +       init_entd_context(s);
21123 +       return 0;
21124 +}
21125 +
21126 +_DONE_(entd)  /* counterpart of _init_entd() */
21127 +{
21128 +       done_entd_context(s);
21129 +}
21130 +
21131 +_DONE_(disk_format);  /* forward declaration of _done_disk_format(), which is defined right after _init_disk_format() */
21132 +
21133 +_INIT_(disk_format)  /* call ->get_ready(s, data) of the disk format plugin chosen by _init_read_super() */
21134 +{
21135 +       return get_super_private(s)->df_plug->get_ready(s, data);  /* ->df_plug is dereferenced unchecked */
21136 +}
21137 +
21138 +_DONE_(disk_format)  /* counterpart of _init_disk_format(): call ->release(s) of the same plugin */
21139 +{
21140 +       reiser4_super_info_data *sbinfo = get_super_private(s);
21141 +
21142 +       sbinfo->df_plug->release(s);
21143 +}
21144 +
21145 +_INIT_(sb_counters)  /* seed the "committed" counters from the current in-memory values; cannot fail */
21146 +{
21147 +       /* There are some 'committed' versions of reiser4 super block
21148 +          counters, which correspond to reiser4 on-disk state. These counters
21149 +          are initialized here */
21150 +       reiser4_super_info_data *sbinfo = get_super_private(s);
21151 +
21152 +       sbinfo->blocks_free_committed = sbinfo->blocks_free;
21153 +       sbinfo->nr_files_committed = oids_used(s);  /* committed file count starts at oids_used(s) */
21154 +
21155 +       return 0;
21156 +}
21157 +
21158 +_DONE_EMPTY(sb_counters)  /* nothing to undo */
21159 +
21160 +_INIT_(d_cursor)  /* run d_cursor_init_at() for @s and return its result */
21161 +{
21162 +       /* this must be done before the inode of the root directory is read
21163 +        * (_init_fs_root() follows in subsys_array[]), because reiser4_iget() uses load_cursors(). */
21164 +       return d_cursor_init_at(s);
21165 +}
21166 +
21167 +_DONE_(d_cursor)  /* counterpart of _init_d_cursor() */
21168 +{
21169 +       d_cursor_done_at(s);
21170 +}
21171 +
21172 +static struct {  /* default (plugin type, plugin id) pair for each plugin-set member; read by get_default_plugin() */
21173 +       reiser4_plugin_type type;
21174 +       reiser4_plugin_id   id;
21175 +} default_plugins[PSET_LAST] = {  /* indexed by pset_member; members not listed below stay zero-initialised */
21176 +       [PSET_FILE] = {
21177 +               .type = REISER4_FILE_PLUGIN_TYPE,
21178 +               .id   = UNIX_FILE_PLUGIN_ID
21179 +       },
21180 +       [PSET_DIR] = {
21181 +               .type = REISER4_DIR_PLUGIN_TYPE,
21182 +               .id   = HASHED_DIR_PLUGIN_ID
21183 +       },
21184 +       [PSET_HASH] = {
21185 +               .type = REISER4_HASH_PLUGIN_TYPE,
21186 +               .id   = R5_HASH_ID
21187 +       },
21188 +       [PSET_FIBRATION] = {
21189 +               .type = REISER4_FIBRATION_PLUGIN_TYPE,
21190 +               .id   = FIBRATION_DOT_O
21191 +       },
21192 +       [PSET_PERM] = {
21193 +               .type = REISER4_PERM_PLUGIN_TYPE,
21194 +               .id   = RWX_PERM_ID
21195 +       },
21196 +       [PSET_FORMATTING] = {
21197 +               .type = REISER4_FORMATTING_PLUGIN_TYPE,
21198 +               .id   = SMALL_FILE_FORMATTING_ID
21199 +       },
21200 +       [PSET_SD] = {  /* stat-data: an item plugin */
21201 +               .type = REISER4_ITEM_PLUGIN_TYPE,
21202 +               .id   = STATIC_STAT_DATA_ID
21203 +       },
21204 +       [PSET_DIR_ITEM] = {  /* directory item: also an item plugin */
21205 +               .type = REISER4_ITEM_PLUGIN_TYPE,
21206 +               .id   = COMPOUND_DIR_ID
21207 +       },
21208 +       [PSET_CRYPTO] = {
21209 +               .type = REISER4_CRYPTO_PLUGIN_TYPE,
21210 +               .id   = NONE_CRYPTO_ID
21211 +       },
21212 +       [PSET_DIGEST] = {
21213 +               .type = REISER4_DIGEST_PLUGIN_TYPE,
21214 +               .id   = NONE_DIGEST_ID
21215 +       },
21216 +       [PSET_COMPRESSION] = {
21217 +               .type = REISER4_COMPRESSION_PLUGIN_TYPE,
21218 +               .id   = NONE_COMPRESSION_ID
21219 +       }
21220 +};
21221 +
21222 +/* look up the default plugin for plugin-set member @memb: plugin_by_id() on the (type, id) pair stored in default_plugins[memb]; @memb is not range-checked */
21223 +reiser4_internal reiser4_plugin *
21224 +get_default_plugin(pset_member memb)
21225 +{
21226 +       return plugin_by_id(default_plugins[memb].type, default_plugins[memb].id);
21227 +}
21228 +
21229 +_INIT_(fs_root)  /* read the root directory inode, make it s->s_root and, if it is still not loaded, give it the default plugins */
21230 +{
21231 +       reiser4_super_info_data *sbinfo = get_super_private(s);
21232 +       struct inode * inode;
21233 +       int result = 0;
21234 +
21235 +       inode = reiser4_iget(s, sbinfo->df_plug->root_dir_key(s), 0);  /* the key of the root directory comes from the disk format plugin */
21236 +       if (IS_ERR(inode))
21237 +               return RETERR(PTR_ERR(inode));
21238 +
21239 +       s->s_root = d_alloc_root(inode);
21240 +       if (!s->s_root) {
21241 +               iput(inode);  /* no root dentry took the inode over: drop our reference */
21242 +               return RETERR(-ENOMEM);
21243 +       }
21244 +
21245 +       s->s_root->d_op = &sbinfo->ops.dentry;  /* dentry operations built by _init_object_ops() */
21246 +
21247 +       if (!is_inode_loaded(inode)) {
21248 +               pset_member    memb;
21249 +
21250 +               for (memb = 0; memb < PSET_LAST; ++ memb) {  /* stop at the first member that cannot be set */
21251 +                       reiser4_plugin *plug;
21252 +
21253 +                       plug = get_default_plugin(memb);
21254 +                       result = grab_plugin_from(inode, memb, plug);
21255 +                       if (result != 0)
21256 +                               break;
21257 +               }
21258 +
21259 +               if (result == 0) {
21260 +                       if (REISER4_DEBUG) {  /* debug builds: every plugin-set member must be set by now */
21261 +                               plugin_set *pset;
21262 +
21263 +                               pset = reiser4_inode_data(inode)->pset;
21264 +                               for (memb = 0; memb < PSET_LAST; ++ memb)
21265 +                                       assert("nikita-3500",
21266 +                                              pset_get(pset, memb) != NULL);
21267 +                       }
21268 +               } else
21269 +                       warning("nikita-3448", "Cannot set plugins of root: %i",
21270 +                               result);
21271 +               reiser4_iget_complete(inode);  /* called whether or not the plugins could be set */
21272 +       }
21273 +       s->s_maxbytes = MAX_LFS_FILESIZE;
21274 +       return result;  /* NOTE(review): when non-zero, s->s_root stays set and reiser4_fill_super() does not run _done_fs_root() for this step -- confirm who releases it */
21275 +}
21276 +
21277 +_DONE_(fs_root)  /* only calls shrink_dcache_parent() on the root dentry; s->s_root itself is not released here */
21278 +{
21279 +       shrink_dcache_parent(s->s_root);
21280 +}
21281 +
21282 +_INIT_(sysfs)  /* run reiser4_sysfs_init() for @s and return its result */
21283 +{
21284 +       return reiser4_sysfs_init(s);
21285 +}
21286 +
21287 +_DONE_(sysfs)  /* counterpart of _init_sysfs() */
21288 +{
21289 +       reiser4_sysfs_done(s);
21290 +}
21291 +
21292 +_INIT_(repacker)  /* run init_reiser4_repacker() for @s and return its result */
21293 +{
21294 +       return init_reiser4_repacker(s);
21295 +}
21296 +
21297 +_DONE_(repacker)  /* counterpart of _init_repacker() */
21298 +{
21299 +       done_reiser4_repacker(s);
21300 +}
21301 +
21302 +_INIT_(safelink)  /* run process_safelinks() for @s; this step never fails the mount */
21303 +{
21304 +       process_safelinks(s);  /* result deliberately ignored, see below */
21305 +       /* failure to process safe-links is not critical. Continue with
21306 +        * mount. */
21307 +       return 0;
21308 +}
21309 +
21310 +_DONE_(safelink)  /* nothing to undo; written out by hand rather than with _DONE_EMPTY() */
21311 +{
21312 +}
21313 +
21314 +_INIT_(exit_context)  /* last entry of subsys_array[]: leave the context that _init_context() initialised */
21315 +{
21316 +       reiser4_exit_context(ctx);
21317 +       return 0;
21318 +}
21319 +
21320 +_DONE_EMPTY(exit_context)  /* nothing to undo */
21321 +
21322 +struct reiser4_subsys {  /* one mount-time step: a set-up hook and the hook that undoes it */
21323 +       int  (*init) _INIT_PARAM_LIST;  /* called as init(s, ctx, data, silent); non-zero means failure */
21324 +       void (*done) _DONE_PARAM_LIST;  /* called as done(s) */
21325 +};
21326 +
21327 +#define _SUBSYS(subsys) {.init = &_init_##subsys, .done = &_done_##subsys}  /* table entry pairing _init_<subsys>() with _done_<subsys>() */
21328 +static struct reiser4_subsys subsys_array[] = {  /* order matters: reiser4_fill_super() runs ->init top to bottom, done_super() runs ->done bottom to top */
21329 +       _SUBSYS(mount_flags_check),
21330 +       _SUBSYS(sinfo),
21331 +       _SUBSYS(stat),
21332 +       _SUBSYS(context),
21333 +       _SUBSYS(parse_options),
21334 +       _SUBSYS(object_ops),
21335 +       _SUBSYS(read_super),
21336 +       _SUBSYS(tree0),
21337 +       _SUBSYS(txnmgr),
21338 +       _SUBSYS(ktxnmgrd_context),
21339 +       _SUBSYS(ktxnmgrd),
21340 +       _SUBSYS(entd),
21341 +       _SUBSYS(formatted_fake),
21342 +       _SUBSYS(disk_format),
21343 +       _SUBSYS(sb_counters),
21344 +       _SUBSYS(d_cursor),
21345 +       _SUBSYS(fs_root),
21346 +       _SUBSYS(sysfs),
21347 +       _SUBSYS(repacker),
21348 +       _SUBSYS(safelink),
21349 +       _SUBSYS(exit_context)
21350 +};
21351 +
21352 +#define REISER4_NR_SUBSYS (sizeof(subsys_array) / sizeof(struct reiser4_subsys))  /* number of entries in subsys_array[] */
21353 +
21354 +static void done_super (struct super_block * s, int last_done)  /* run ->done of subsys_array[last_done] down to subsys_array[0] */
21355 +{
21356 +       int idx = last_done;  /* a negative @last_done means there is nothing to undo */
21357 +       while (idx >= 0)
21358 +               subsys_array[idx--].done(s);
21359 +}
21360 +
21361 +/* Mount-time entry point: run the ->init hook of every subsys_array[] entry
21362 +   in order. If one of them fails, undo the entries that already succeeded
21363 +   (in reverse order) and return its error code; return 0 if all succeed.  */
21364 +reiser4_internal int
21365 +reiser4_fill_super (struct super_block * s, void * data, int silent)
21366 +{
21367 +       reiser4_context ctx;
21368 +       int nr_done = 0;  /* number of subsystems initialised so far */
21369 +       int ret = 0;
21370 +
21371 +       assert ("zam-989", s != NULL);
21372 +
21373 +       while (ret == 0 && nr_done < REISER4_NR_SUBSYS) {
21374 +               ret = subsys_array[nr_done].init(s, &ctx, data, silent);
21375 +               if (ret == 0)
21376 +                       nr_done++;
21377 +       }
21378 +       if (ret != 0)
21379 +               done_super(s, nr_done - 1);
21380 +       return ret;
21381 +}
21382 +
21383 +#if 0  /* compiled out; init_super.h nevertheless declares reiser4_done_super() */
21384 +
21385 +int reiser4_done_super (struct super_block * s)  /* would undo every subsys_array[] entry, last to first */
21386 +{
21387 +       reiser4_context ctx;
21388 +
21389 +       init_context(&ctx, s);
21390 +       done_super(s, REISER4_NR_SUBSYS - 1);
21391 +       return 0;
21392 +}
21393 +
21394 +#endif
21395 +
21396 +/* Make Linus happy.
21397 +   Local variables:
21398 +   c-indentation-style: "K&R"
21399 +   mode-name: "LC"
21400 +   c-basic-offset: 8
21401 +   tab-width: 8
21402 +   fill-column: 80
21403 +   End:
21404 +*/
21405 diff -rupN linux-2.6.8-rc3/fs/reiser4/init_super.h linux-2.6.8-rc3-a/fs/reiser4/init_super.h
21406 --- linux-2.6.8-rc3/fs/reiser4/init_super.h     1970-01-01 03:00:00.000000000 +0300
21407 +++ linux-2.6.8-rc3-a/fs/reiser4/init_super.h   2004-08-05 21:20:53.097657476 +0400
21408 @@ -0,0 +1,4 @@
21409 +/* Copyright by Hans Reiser, 2003 */
21410 +
21411 +extern int reiser4_fill_super (struct super_block * s, void * data, int silent);  /* defined in init_super.c; returns 0 or the error of the first failing step */
21412 +extern int reiser4_done_super (struct super_block * s);  /* NOTE(review): the only definition in init_super.c sits under #if 0 */
21413 diff -rupN linux-2.6.8-rc3/fs/reiser4/inode.c linux-2.6.8-rc3-a/fs/reiser4/inode.c
21414 --- linux-2.6.8-rc3/fs/reiser4/inode.c  1970-01-01 03:00:00.000000000 +0300
21415 +++ linux-2.6.8-rc3-a/fs/reiser4/inode.c        2004-08-05 21:20:53.046668231 +0400
21416 @@ -0,0 +1,805 @@
21417 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
21418 +
21419 +/* Inode specific operations. */
21420 +
21421 +#include "forward.h"
21422 +#include "debug.h"
21423 +#include "key.h"
21424 +#include "kassign.h"
21425 +#include "coord.h"
21426 +#include "seal.h"
21427 +#include "dscale.h"
21428 +#include "plugin/item/item.h"
21429 +#include "plugin/security/perm.h"
21430 +#include "plugin/plugin.h"
21431 +#include "plugin/object.h"
21432 +#include "plugin/dir/dir.h"
21433 +#include "znode.h"
21434 +#include "vfs_ops.h"
21435 +#include "inode.h"
21436 +#include "super.h"
21437 +#include "reiser4.h"
21438 +
21439 +#include <linux/fs.h>          /* for struct super_block,  address_space */
21440 +
21441 +/* return the reiser4 tree of the super block @inode belongs to, i.e. get_tree(inode->i_sb); @inode and inode->i_sb must not be NULL */
21442 +/* Audited by: green(2002.06.17) */
21443 +reiser4_internal reiser4_tree *
21444 +tree_by_inode(const struct inode * inode /* inode queried */ )
21445 +{
21446 +       assert("nikita-256", inode != NULL);
21447 +       assert("nikita-257", inode->i_sb != NULL);
21448 +       return get_tree(inode->i_sb);
21449 +}
21450 +
21451 +/* return the address of the reiser4-specific flag word of @inode (the ->flags member of its reiser4 inode data); used by the bit helpers below */
21452 +static inline unsigned long *
21453 +inode_flags(const struct inode * const inode)
21454 +{
21455 +       assert("nikita-2842", inode != NULL);
21456 +       return &reiser4_inode_data(inode)->flags;
21457 +}
21458 +
21459 +/* set reiser4-specific flag @f in @inode; @f is used as a bit number for set_bit() */
21460 +reiser4_internal void
21461 +inode_set_flag(struct inode * inode, reiser4_file_plugin_flags f)
21462 +{
21463 +       assert("nikita-2248", inode != NULL);
21464 +       set_bit((int) f, inode_flags(inode));
21465 +}
21466 +
21467 +/* clear reiser4-specific flag @f in @inode; @f is used as a bit number for clear_bit() */
21468 +reiser4_internal void
21469 +inode_clr_flag(struct inode * inode, reiser4_file_plugin_flags f)
21470 +{
21471 +       assert("nikita-2250", inode != NULL);
21472 +       clear_bit((int) f, inode_flags(inode));
21473 +}
21474 +
21475 +/* true if reiser4-specific flag @f is set in @inode; returns whatever test_bit() returns for bit number @f */
21476 +reiser4_internal int
21477 +inode_get_flag(const struct inode * inode, reiser4_file_plugin_flags f)
21478 +{
21479 +       assert("nikita-2251", inode != NULL);
21480 +       return test_bit((int) f, inode_flags(inode));
21481 +}
21482 +
21483 +/* convert oid to inode number: a plain cast, so high bits are dropped wherever ino_t is narrower than oid_t */
21484 +reiser4_internal ino_t oid_to_ino(oid_t oid)
21485 +{
21486 +       return (ino_t) oid;
21487 +}
21488 +
21489 +/* map a 64 bit oid onto the inode number shown to user space */
21490 +reiser4_internal ino_t oid_to_uino(oid_t oid)
21491 +{
21492 +       /* An oid is a 64 bit quantity that uniquely identifies a reiser4
21493 +          object. The in-memory inode is hashed by the 32 bit i_ino field,
21494 +          which is harmless, because inodes with equal inode numbers are
21495 +          told apart by the find_actor supplied to iget().
21496 +
21497 +          User space, however, expects a unique 32 bit inode number, which
21498 +          cannot be provided. As a work-around, oids that do not fit are
21499 +          hashed into the range of user visible inode numbers.
21500 +       */
21501 +       const oid_t ino_limit = (ino_t) ~ 0;  /* all-ones ino_t value */
21502 +
21503 +       if (!REISER4_INO_IS_OID && (oid > ino_limit))
21504 +               /* Loosely modelled on how the next pid is chosen: after
21505 +                  wrap-around, continue from some offset rather than from
21506 +                  0, so as to stay clear of long living objects we do not
21507 +                  want to collide with.
21508 +               */
21509 +               return REISER4_UINO_SHIFT + ((oid - ino_limit) & (ino_limit >> 1));
21510 +
21511 +       /* oid fits into ino_t (or inode numbers are oids): use it as is */
21512 +       return oid;
21513 +}
21514 +
21515 +/* true when @inode is not NULL and either sits on a reiser4 super block or uses reiser4_inode_operations */
21516 +reiser4_internal int
21517 +is_reiser4_inode(const struct inode *inode /* inode queried */ )
21518 +{
21519 +       if (inode == NULL)
21520 +               return 0;
21521 +       if (is_reiser4_super(inode->i_sb))
21522 +               return 1;
21523 +       return inode->i_op == &reiser4_inode_operations;
21524 +}
21525 +
21526 +/* Maximal length of a name that can be stored in directory @inode.
21527 +
21528 +   This is used in check during file creation and lookup. The value comes from the ->s.dir.max_name_len() method of the directory item plugin of @inode; 255 is used when the plugin has no such method. */
21529 +reiser4_internal int
21530 +reiser4_max_filename_len(const struct inode *inode /* inode queried */ )
21531 +{
21532 +       assert("nikita-287", is_reiser4_inode(inode));
21533 +       assert("nikita-1710", inode_dir_item_plugin(inode));
21534 +       if (inode_dir_item_plugin(inode)->s.dir.max_name_len)  /* the method is optional */
21535 +               return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
21536 +       else
21537 +               return 255;
21538 +}
21539 +
21540 +/* Maximal number of hash collisions for this directory: the per-inode ->plugin.max_collisions value when REISER4_USE_COLLISION_LIMIT is on, ~0 otherwise. */
21541 +reiser4_internal int
21542 +max_hash_collisions(const struct inode *dir /* inode queried */ )
21543 +{
21544 +       assert("nikita-1711", dir != NULL);
21545 +#if REISER4_USE_COLLISION_LIMIT
21546 +       return reiser4_inode_data(dir)->plugin.max_collisions;
21547 +#else
21548 +       (void) dir;  /* silence the unused-parameter warning in this configuration */
21549 +       return ~0;  /* -1 as int; NOTE(review): presumably meant as "no limit" -- confirm how callers compare it */
21550 +#endif
21551 +}
21552 +
21553 +/* Install file, inode, and address_space operation on @inode, depending on
21554 +   its mode. Returns 0, or -EINVAL (after marking @inode bad) for an unknown file type. */
21555 +reiser4_internal int
21556 +setup_inode_ops(struct inode *inode /* inode to initialise */ ,
21557 +               reiser4_object_create_data * data       /* parameters to create
21558 +                                                        * object */ )
21559 +{
21560 +       reiser4_super_info_data *sinfo;
21561 +
21562 +       sinfo = get_super_private(inode->i_sb);
21563 +
21564 +       switch (inode->i_mode & S_IFMT) {
21565 +       case S_IFSOCK:
21566 +       case S_IFBLK:
21567 +       case S_IFCHR:
21568 +       case S_IFIFO:{
21569 +                       dev_t rdev;     /* to keep gcc happy */
21570 +
21571 +                       /* ugly hack with rdev: without @data the device number is taken from the inode itself */
21572 +                       if (data == NULL) {
21573 +                               rdev = inode->i_rdev;
21574 +                               inode->i_rdev = 0;
21575 +                       } else
21576 +                               rdev = data->rdev;
21577 +                       inode->i_blocks = 0;
21578 +                       inode->i_op = &sinfo->ops.special;
21579 +                       /* other fields are already initialised. */
21580 +                       init_special_inode(inode, inode->i_mode, rdev);
21581 +                       break;
21582 +               }
21583 +       case S_IFLNK:
21584 +               inode->i_op = &sinfo->ops.symlink;
21585 +               inode->i_fop = NULL;  /* symlinks get no file operations */
21586 +               inode->i_mapping->a_ops = &sinfo->ops.as;
21587 +               break;
21588 +       case S_IFDIR:
21589 +               inode->i_op = &sinfo->ops.dir;
21590 +               inode->i_fop = &sinfo->ops.file;
21591 +               inode->i_mapping->a_ops = &sinfo->ops.as;
21592 +               break;
21593 +       case S_IFREG:
21594 +               inode->i_op = &sinfo->ops.regular;
21595 +               inode->i_fop = &sinfo->ops.file;
21596 +               inode->i_mapping->a_ops = &sinfo->ops.as;
21597 +               break;
21598 +       default:
21599 +               warning("nikita-291", "wrong file mode: %o for %llu", inode->i_mode, (unsigned long long) get_inode_oid(inode));  /* cast keeps the argument in step with %llu whatever oid_t is */
21600 +               reiser4_make_bad_inode(inode);
21601 +               return RETERR(-EINVAL);
21602 +       }
21603 +       return 0;
21604 +}
21605 +
21606 +/* initialise inode from disk data. Called with inode locked.
21607 +    Return inode locked. @coord->node is loaded (zload) for the duration of the call; returns 0 or an error code. */
21608 +static int
21609 +init_inode(struct inode *inode /* inode to initialise */ ,
21610 +          coord_t * coord /* coord of stat data */ )
21611 +{
21612 +       int result;
21613 +       item_plugin *iplug;
21614 +       void *body;
21615 +       int length;
21616 +       reiser4_inode *state;
21617 +
21618 +       assert("nikita-292", coord != NULL);
21619 +       assert("nikita-293", inode != NULL);
21620 +
21621 +       coord_clear_iplug(coord);
21622 +       result = zload(coord->node);
21623 +       if (result)
21624 +               return result;
21625 +       iplug = item_plugin_by_coord(coord);
21626 +       body = item_body_by_coord(coord);
21627 +       length = item_length_by_coord(coord);
21628 +
21629 +       assert("nikita-295", iplug != NULL);
21630 +       assert("nikita-296", body != NULL);
21631 +       assert("nikita-297", length > 0);
21632 +
21633 +       /* inode is under I_LOCK now */
21634 +
21635 +       state = reiser4_inode_data(inode);
21636 +       /* call stat-data plugin method to load sd content into inode */
21637 +       result = iplug->s.sd.init_inode(inode, body, length);
21638 +       plugin_set_sd(&state->pset, iplug);  /* done even when ->init_inode() failed; its own result is not checked */
21639 +       if (result == 0) {
21640 +               result = setup_inode_ops(inode, NULL);
21641 +               if (result == 0 &&
21642 +                   inode->i_sb->s_root && inode->i_sb->s_root->d_inode) {  /* skipped while the root inode itself is being read */
21643 +                       struct inode *root;
21644 +                       pset_member    ind;
21645 +
21646 +                       /* take missing plugins from file-system defaults */
21647 +                       root = inode->i_sb->s_root->d_inode;
21648 +                       /* file and directory plugins are already initialised. */
21649 +                       for (ind = PSET_DIR + 1; ind < PSET_LAST; ++ind) {
21650 +                               result = grab_plugin(inode, root, ind);
21651 +                               if (result != 0)
21652 +                                       break;
21653 +                       }
21654 +                       if (result != 0) {
21655 +                               warning("nikita-3447",
21656 +                                       "Cannot set up plugins for %llu",
21657 +                                       (unsigned long long) get_inode_oid(inode));  /* oid is unsigned: %llu plus cast, as in the nikita-291 warning */
21658 +                       }
21659 +               }
21660 +       }
21661 +       zrelse(coord->node);  /* pairs with the zload() above */
21662 +       return result;
21663 +}
21664 +
21665 +/* read `inode' from the disk. This is what was previously in
21666 +   reiserfs_read_inode2().
21667 +
21668 +   Must be called with inode locked. Return inode still locked. On any failure the inode is marked bad via reiser4_make_bad_inode().
21669 +*/
21670 +static int
21671 +read_inode(struct inode *inode /* inode to read from disk */ ,
21672 +          const reiser4_key * key /* key of stat data */,
21673 +          int silent)
21674 +{
21675 +       int result;
21676 +       lock_handle lh;
21677 +       reiser4_inode *info;
21678 +       coord_t coord;
21679 +
21680 +       assert("nikita-298", inode != NULL);
21681 +       assert("nikita-1945", !is_inode_loaded(inode));
21682 +
21683 +       info = reiser4_inode_data(inode);
21684 +       assert("nikita-300", info->locality_id != 0);
21685 +
21686 +       coord_init_zero(&coord);
21687 +       init_lh(&lh);
21688 +       /* locate stat-data in a tree and return znode locked */
21689 +       result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
21690 +       assert("nikita-301", !is_inode_loaded(inode));
21691 +       if (result == 0) {
21692 +               /* use stat-data plugin to load sd into inode. */
21693 +               result = init_inode(inode, &coord);
21694 +               if (result == 0) {
21695 +                       /* initialize stat-data seal */
21696 +                       spin_lock_inode(inode);  /* seal and coord are stored under the inode spin lock */
21697 +                       seal_init(&info->sd_seal, &coord, key);
21698 +                       info->sd_coord = coord;
21699 +                       spin_unlock_inode(inode);
21700 +
21701 +                       /* call file plugin's method to initialize plugin
21702 +                        * specific part of inode */
21703 +                       if (inode_file_plugin(inode)->init_inode_data)  /* the method is optional */
21704 +                               inode_file_plugin(inode)->init_inode_data(inode,
21705 +                                                                         NULL,
21706 +                                                                         0);
21707 +                       /* load detached directory cursors for stateless
21708 +                        * directory readers (NFS). */
21709 +                       load_cursors(inode);
21710 +
21711 +                       /* Check the opened inode for consistency. */
21712 +                       result = get_super_private(inode->i_sb)->df_plug->check_open(inode);
21713 +               }
21714 +       }
21715 +       /* lookup_sd() doesn't release coord because we want znode
21716 +          stay read-locked while stat-data fields are accessed in
21717 +          init_inode() */
21718 +       done_lh(&lh);  /* runs on every path, including lookup failure */
21719 +
21720 +       if (result != 0)
21721 +               reiser4_make_bad_inode(inode);
21722 +       return result;
21723 +}
21724 +
21725 +/* initialise new reiser4 inode being inserted into hash table: copy objectid and locality from the key passed in @opaque. Always returns 0. */
21726 +static int
21727 +init_locked_inode(struct inode *inode /* new inode */ ,
21728 +                 void *opaque  /* key of stat data passed to the
21729 +                                * iget5_locked as cookie */ )
21730 +{
21731 +       reiser4_key *key;
21732 +
21733 +       assert("nikita-1995", inode != NULL);
21734 +       assert("nikita-1996", opaque != NULL);
21735 +       key = opaque;
21736 +       set_inode_oid(inode, get_key_objectid(key));  /* oid of the inode is the objectid of the key */
21737 +       reiser4_inode_data(inode)->locality_id = get_key_locality(key);  /* read_inode() asserts this is non-zero */
21738 +       return 0;
21739 +}
21740 +
21741 +/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to iget5_locked().
21742 +
21743 +   Tells whether @inode, taken from the inode hash table, is the object named
21744 +   by the stat-data key passed as @opaque. Inodes with identical inode numbers
21745 +   (objectids) can only exist due to some error condition (one of them should
21746 +   be bad) and are distinguished by their packing locality.
21747 +
21748 +   Returns non-zero when @inode matches the key, 0 otherwise.
21749 +*/
21750 +reiser4_internal int
21751 +reiser4_inode_find_actor(struct inode *inode, void *opaque)
21752 +{
21753 +       const reiser4_key *sought;
21754 +
21755 +       sought = opaque;
21756 +
21757 +       /* oid is unique, so this first test is enough, actually. */
21758 +       if (get_inode_oid(inode) != get_key_objectid(sought))
21759 +               return 0;
21760 +
21761 +       /*
21762 +        * locality should be checked too, but it is stored in the
21763 +        * reiser4-specific part of the inode, and the actor can be called
21764 +        * against an arbitrary inode that happened to be in this hash chain.
21765 +        * Hence first make sure this is at least an inode of a reiser4 super
21766 +        * block. is_reiser4_inode() is probably too early to call, as inode
21767 +        * may have ->i_op not yet initialised.
21768 +        */
21769 +       if (!is_reiser4_super(inode->i_sb))
21770 +               return 0;
21771 +
21772 +       /*
21773 +        * usually objectid is unique, but pseudo files use counter to
21774 +        * generate objectid. All pseudo files are placed into special
21775 +        * (otherwise unused) locality.
21776 +        */
21777 +       return reiser4_inode_data(inode)->locality_id == get_key_locality(sought);
21778 +}
21779 +
21780 +/*
21781 + * this is our helper function a la iget(). This is called by
21782 + * reiser4_lookup() and reiser4_read_super(). Returns inode or ERR_PTR(); an
21783 + * inode loaded here keeps ->loading held until reiser4_iget_complete().
21784 + */
21785 +reiser4_internal struct inode *
21786 +reiser4_iget(struct super_block *super /* super block  */ ,
21787 +            const reiser4_key * key /* key of inode's stat-data */,
21788 +            int silent)
21789 +{
21790 +       struct inode *inode;
21791 +       int result;
21792 +       reiser4_inode * info;
21793 +
21794 +       assert("nikita-302", super != NULL);
21795 +       assert("nikita-303", key != NULL);
21796 +
21797 +       result = 0;
21798 +
21799 +       /* call iget(). Our ->read_inode() is dummy, so this will either
21800 +           find inode in cache or return uninitialised inode */
21801 +       inode = iget5_locked(super,
21802 +                            (unsigned long) get_key_objectid(key),
21803 +                            reiser4_inode_find_actor,
21804 +                            init_locked_inode,
21805 +                            (reiser4_key *) key);
21806 +       if (inode == NULL)
21807 +               return ERR_PTR(RETERR(-ENOMEM));
21808 +       if (is_bad_inode(inode) && !silent) {
21809 +               warning("nikita-304", "Stat data not found");
21810 +               print_key("key", key);
21811 +               iput(inode);
21812 +               return ERR_PTR(RETERR(-EIO));
21813 +       }
21814 +
21815 +       info = reiser4_inode_data(inode);
21816 +
21817 +       /* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully
21818 +          loaded and initialized inode from just allocated inode. If it is not
21819 +          set, reiser4_iget() completes loading under info->loading.  The place
21820 +          in reiser4 which uses not initialized inode is the reiser4 repacker,
21821 +          see repacker-related functions in plugin/item/extent.c */
21822 +       if (!is_inode_loaded(inode)) {
21823 +               down(&info->loading);
21824 +               if (!is_inode_loaded(inode)) {
21825 +                       /* locking: iget5_locked returns locked inode */
21826 +                       assert("nikita-1941", !is_inode_loaded(inode));
21827 +                       assert("nikita-1949",
21828 +                              reiser4_inode_find_actor(inode, (reiser4_key *)key));
21829 +                       /* inode has objectid in ->i_ino and locality in reiser4
21830 +                          part: enough for read_inode() to read stat data */
21831 +                       result = read_inode(inode, key, silent);
21832 +               } else {
21833 +                       /* loaded by somebody else meanwhile: drop ->loading */
21834 +                       up(&info->loading);
21835 +               }
21836 +       }
21837 +
21838 +       if (inode->i_state & I_NEW)
21839 +               unlock_new_inode(inode);
21840 +
21841 +       if (is_bad_inode(inode)) {
21842 +               up(&info->loading);
21843 +               iput(inode);
21844 +               inode = ERR_PTR(result);
21845 +       } else if (REISER4_DEBUG) {
21846 +               reiser4_key found_key;
21847 +
21848 +               build_sd_key(inode, &found_key);
21849 +               if (!keyeq(&found_key, key)) {
21850 +                       warning("nikita-305", "Wrong key in sd");
21851 +                       print_key("sought for", key);
21852 +                       print_key("found", &found_key);
21853 +               }
21854 +               if (inode_file_plugin(inode)->not_linked(inode)) {
21855 +                       warning("nikita-3559", "Unlinked inode found: %llu\n",
21856 +                               get_inode_oid(inode));
21857 +                       print_inode("inode", inode);
21858 +               }
21859 +       }
21860 +       return inode;
21861 +}
21862 +
21863 +/* reiser4_iget() may return an inode that is not fully initialized yet and
21864 + * still holds ->loading; call this once initialization is complete. */
21865 +reiser4_internal void reiser4_iget_complete (struct inode * inode)
21866 +{
21867 +       assert("zam-988", is_reiser4_inode(inode));
21868 +       /* already marked as loaded: nothing to set or release */
21869 +       if (is_inode_loaded(inode))
21870 +               return;
21871 +       inode_set_flag(inode, REISER4_LOADED);
21872 +       up(&reiser4_inode_data(inode)->loading);
21873 +}
21874 +
21875 +/* turn @inode into a bad inode. REISER4_LOADED is dropped first, so that the
21876 +   inode no longer counts as loaded (see is_inode_loaded()). */
21877 +reiser4_internal void
21878 +reiser4_make_bad_inode(struct inode *inode)
21879 +{
21880 +       assert("nikita-1934", inode != NULL);
21881 +
21882 +       inode_clr_flag(inode, REISER4_LOADED);
21883 +       make_bad_inode(inode);
21884 +}
21885 +
21886 +/* file plugin of @inode, as recorded in its plugin set (->pset) */
21887 +reiser4_internal file_plugin *inode_file_plugin(const struct inode *inode)
21888 +{
21889 +       assert("nikita-1997", inode != NULL);
21890 +       return reiser4_inode_data(inode)->pset->file;
21891 +}
21892 +
21893 +/* directory plugin of @inode, as recorded in its plugin set (->pset) */
21894 +reiser4_internal dir_plugin *inode_dir_plugin(const struct inode *inode)
21895 +{
21896 +       assert("nikita-1998", inode != NULL);
21897 +       return reiser4_inode_data(inode)->pset->dir;
21898 +}
21899 +
21900 +/* permission plugin of @inode, as recorded in its plugin set (->pset) */
21901 +reiser4_internal perm_plugin *inode_perm_plugin(const struct inode *inode)
21902 +{
21903 +       assert("nikita-1999", inode != NULL);
21904 +       return reiser4_inode_data(inode)->pset->perm;
21905 +}
21906 +
21907 +/* formatting plugin of @inode, as recorded in its plugin set (->pset) */
21908 +reiser4_internal formatting_plugin *inode_formatting_plugin(const struct inode *inode)
21909 +{
21910 +       assert("nikita-2000", inode != NULL);
21911 +       return reiser4_inode_data(inode)->pset->formatting;
21912 +}
21913 +
21914 +/* hash plugin of @inode, as recorded in its plugin set (->pset) */
21915 +reiser4_internal hash_plugin *inode_hash_plugin(const struct inode *inode)
21916 +{
21917 +       assert("nikita-2001", inode != NULL);
21918 +       return reiser4_inode_data(inode)->pset->hash;
21919 +}
21920 +
21921 +/* fibration plugin of @inode, as recorded in its plugin set (->pset) */
21922 +reiser4_internal fibration_plugin *inode_fibration_plugin(const struct inode *inode)
21923 +{
21924 +       assert("nikita-2001", inode != NULL);
21925 +       return reiser4_inode_data(inode)->pset->fibration;
21926 +}
21927 +
21928 +/* crypto plugin of @inode, as recorded in its plugin set (->pset) */
21929 +reiser4_internal crypto_plugin *inode_crypto_plugin(const struct inode *inode)
21930 +{
21931 +       assert("edward-36", inode != NULL);
21932 +       return reiser4_inode_data(inode)->pset->crypto;
21933 +}
21934 +
21935 +/* compression plugin of @inode, as recorded in its plugin set (->pset) */
21936 +reiser4_internal compression_plugin *inode_compression_plugin(const struct inode *inode)
21937 +{
21938 +       assert("edward-37", inode != NULL);
21939 +       return reiser4_inode_data(inode)->pset->compression;
21940 +}
21941 +
21942 +/* digest plugin of @inode, as recorded in its plugin set (->pset) */
21943 +reiser4_internal digest_plugin *inode_digest_plugin(const struct inode *inode)
21944 +{
21945 +       assert("edward-86", inode != NULL);
21946 +       return reiser4_inode_data(inode)->pset->digest;
21947 +}
21948 +
21949 +/* stat-data item plugin of @inode, as recorded in its plugin set (->pset) */
21950 +reiser4_internal item_plugin *inode_sd_plugin(const struct inode *inode)
21951 +{
21952 +       assert("vs-534", inode != NULL);
21953 +       return reiser4_inode_data(inode)->pset->sd;
21954 +}
21955 +
21956 +/* directory item plugin of @inode, as recorded in its plugin set (->pset) */
21957 +reiser4_internal item_plugin *inode_dir_item_plugin(const struct inode *inode)
21958 +{
21959 +       assert("vs-534", inode != NULL);
21960 +       return reiser4_inode_data(inode)->pset->dir_item;
21961 +}
21962 +
21963 +/* mark stat-data extension @ext as used by @inode; inode spin lock must be held */
21964 +reiser4_internal void inode_set_extension(struct inode *inode, sd_ext_bits ext)
21965 +{
21966 +       reiser4_inode *state;
21967 +
21968 +       assert("nikita-2716", inode != NULL);
21969 +       assert("nikita-2717", ext < LAST_SD_EXTENSION);
21970 +       assert("nikita-3491",
21971 +              spin_inode_object_is_locked(reiser4_inode_data(inode)));
21972 +
21973 +       state = reiser4_inode_data(inode);
21974 +       /* ->extmask is __u64: build the bit in 64 bits rather than in int */
21975 +       state->extmask |= (__u64) 1 << ext;
21976 +       /* force re-calculation of stat-data length on next update_sd() */
21977 +       inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
21978 +}
21979 +
21980 +/* flag pset member @memb in @inode's plugin_mask; @plug is only NULL-checked */
21981 +reiser4_internal void inode_set_plugin(struct inode *inode, reiser4_plugin * plug, pset_member memb)
21982 +{
21983 +       assert("nikita-2718", inode != NULL);
21984 +       assert("nikita-2719", plug != NULL);
21985 +
21986 +       reiser4_inode_data(inode)->plugin_mask |= (1 << memb);
21987 +}
21988 +
21989 +/* clear REISER4_SDLEN_KNOWN on @inode unless dscale_fit(@old, @new) holds */
21990 +reiser4_internal void inode_check_scale(struct inode *inode, __u64 old, __u64 new)
21991 +{
21992 +       assert("nikita-2875", inode != NULL);
21993 +       spin_lock_inode(inode);
21994 +       if (!dscale_fit(old, new))
21995 +               inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
21996 +       spin_unlock_inode(inode);
21997 +}
21998 +
21999 +/*
22000 + * initialize ->ordering field of inode, which defines how file stat-data and
22001 + * body are ordered within a tree with respect to other objects of the same
22002 + * parent directory. When @create is set the ordering is taken from the entry
22003 + * key built by the parent's dir plugin, otherwise from the stat-data item key.
22004 + */
22005 +reiser4_internal void
22006 +init_inode_ordering(struct inode *inode,
22007 +                   reiser4_object_create_data *crd, int create)
22008 +{
22009 +       reiser4_key ordering_key;
22010 +
22011 +       if (!create) {
22012 +               coord_t *sd_coord;
22013 +
22014 +               sd_coord = &reiser4_inode_data(inode)->sd_coord;
22015 +               coord_clear_iplug(sd_coord);
22016 +               /* safe to use ->sd_coord, because node is under long term
22017 +                * lock */
22018 +               WITH_DATA(sd_coord->node, item_key_by_coord(sd_coord, &ordering_key));
22019 +       } else {
22020 +               struct inode *parent;
22021 +
22022 +               parent = crd->parent;
22023 +               assert("nikita-3224", inode_dir_plugin(parent) != NULL);
22024 +               inode_dir_plugin(parent)->build_entry_key(parent, &crd->dentry->d_name,
22025 +                                                         &ordering_key);
22026 +       }
22027 +
22028 +       set_inode_ordering(inode, get_key_ordering(&ordering_key));
22029 +}
22030 +
22031 +/* zlook() result for @inode's virtual root block; NULL if vroot is UBER_TREE_ADDR */
22032 +reiser4_internal znode *inode_get_vroot(struct inode *inode)
22033 +{
22034 +       reiser4_inode *r4_inode;
22035 +       reiser4_block_nr vroot_blk;
22036 +
22037 +       r4_inode = reiser4_inode_data(inode);
22038 +
22039 +       /* copy block number out under the inode lock */
22040 +       LOCK_INODE(r4_inode);
22041 +       vroot_blk = r4_inode->vroot;
22042 +       UNLOCK_INODE(r4_inode);
22043 +
22044 +       if (disk_addr_eq(&UBER_TREE_ADDR, &vroot_blk))
22045 +               return NULL;
22046 +       return zlook(tree_by_inode(inode), &vroot_blk);
22047 +}
22048 +
22049 +/* remember block number of @vroot as virtual root of @inode */
22050 +reiser4_internal void inode_set_vroot(struct inode *inode, znode *vroot)
22051 +{
22052 +       reiser4_inode *r4_inode;
22053 +
22054 +       r4_inode = reiser4_inode_data(inode);
22055 +       LOCK_INODE(r4_inode);
22056 +       r4_inode->vroot = *znode_get_block(vroot);
22057 +       UNLOCK_INODE(r4_inode);
22058 +}
22059 +
22060 +/* reset virtual root of @inode to UBER_TREE_ADDR */
22061 +reiser4_internal void inode_clean_vroot(struct inode *inode)
22062 +{
22063 +       reiser4_inode *r4_inode;
22064 +
22065 +       r4_inode = reiser4_inode_data(inode);
22066 +       LOCK_INODE(r4_inode);
22067 +       r4_inode->vroot = UBER_TREE_ADDR;
22068 +       UNLOCK_INODE(r4_inode);
22069 +}
22070 +
22071 +/* get inode for @key without reading its stat-data: a new inode only gets the fields set below.
22072 +   Returns 0 and stores the inode in @result, or -ENOMEM / -EIO (bad inode found). */
22073 +reiser4_internal int
22074 +get_reiser4_inode_by_key (struct inode ** result, const reiser4_key * key)
22075 +{
22076 +       struct inode * inode;
22077 +       reiser4_inode * r4_inode;
22078 +
22079 +       /* We do not need to read reiser4 inode from disk and initialize all
22080 +        * reiser4 inode fields. */
22081 +       inode = iget_locked(reiser4_get_current_sb(),
22082 +                           (unsigned long)get_key_objectid(key));
22083 +       if (inode == NULL)
22084 +               return -ENOMEM;
22085 +       if (is_bad_inode(inode)) {
22086 +               iput(inode);
22087 +               return -EIO;
22088 +       }
22089 +       if (inode->i_state & I_NEW) {
22090 +               r4_inode = reiser4_inode_data(inode);
22091 +               /* These inode fields are required for tree traversal. */
22092 +               set_inode_oid(inode, get_key_objectid(key));
22093 +               r4_inode->locality_id = get_key_locality(key);
22094 +#if REISER4_LARGE_KEY
22095 +               r4_inode->ordering = get_key_ordering(key);
22096 +#endif
22097 +               inode->i_mapping->a_ops = &reiser4_as_operations;
22098 +               unlock_new_inode(inode);
22099 +       }
22100 +
22101 +       *result = inode;
22102 +       return 0;
22103 +}
22104 +
22105 +
22106 +#if REISER4_DEBUG_OUTPUT
22107 +/* Debugging aid: printk() VFS fields, stat-data key, plugins, seal, coord and flag masks of inode @i. */
22108 +reiser4_internal void
22109 +print_inode(const char *prefix /* prefix to print */ ,
22110 +           const struct inode *i /* inode to print */ )
22111 +{
22112 +       reiser4_key inode_key;
22113 +       reiser4_inode *ref;
22114 +
22115 +       if (i == NULL) {
22116 +               printk("%s: inode: null\n", prefix);
22117 +               return;
22118 +       }
22119 +       printk("%s: ino: %lu, count: %i, link: %i, mode: %o, size: %llu\n",
22120 +              prefix, i->i_ino, atomic_read(&i->i_count), i->i_nlink, i->i_mode, (unsigned long long) i->i_size);
22121 +       printk("\tuid: %i, gid: %i, dev: %i, rdev: %i\n", i->i_uid, i->i_gid, i->i_sb->s_dev, i->i_rdev);
22122 +       printk("\tatime: [%li,%li], mtime: [%li,%li], ctime: [%li,%li]\n",
22123 +              i->i_atime.tv_sec, i->i_atime.tv_nsec,
22124 +              i->i_mtime.tv_sec, i->i_mtime.tv_nsec,
22125 +              i->i_ctime.tv_sec, i->i_ctime.tv_nsec);
22126 +       printk("\tblkbits: %i, blksize: %lu, blocks: %lu, bytes: %u\n",
22127 +              i->i_blkbits, i->i_blksize, i->i_blocks, i->i_bytes);
22128 +       printk("\tversion: %lu, generation: %i, state: %lu, flags: %u\n",
22129 +              i->i_version, i->i_generation, i->i_state, i->i_flags);
22130 +       printk("\tis_reiser4_inode: %i\n", is_reiser4_inode(i));
22131 +       print_key("\tkey", build_sd_key(i, &inode_key));
22132 +       ref = reiser4_inode_data(i);
22133 +       print_plugin("\tfile", file_plugin_to_plugin(ref->pset->file));
22134 +       print_plugin("\tdir", dir_plugin_to_plugin(ref->pset->dir));
22135 +       print_plugin("\tperm", perm_plugin_to_plugin(ref->pset->perm));
22136 +       print_plugin("\tformatting", formatting_plugin_to_plugin(ref->pset->formatting));
22137 +       print_plugin("\thash", hash_plugin_to_plugin(ref->pset->hash));
22138 +       print_plugin("\tsd", item_plugin_to_plugin(ref->pset->sd));
22139 +       /* remaining reiser4-specific state: stat-data seal and coord, then masks */
22140 +       /* FIXME-VS: this segfaults trying to print seal's coord */
22141 +       print_seal("\tsd_seal", &ref->sd_seal);
22142 +       print_coord("\tsd_coord", &ref->sd_coord, 0);
22143 +       printk("\tflags: %#lx, extmask: %#llx, pmask: %i, locality: %llu\n",
22144 +              *inode_flags(i), ref->extmask,
22145 +              ref->plugin_mask, ref->locality_id);
22146 +}
22147 +#endif
22148 +
22149 +#if REISER4_DEBUG
22150 +/* debugging check of eflush counters of @inode; inode spin lock must be held */
22151 +void inode_invariant(const struct inode *inode)
22152 +{
22153 +       reiser4_inode * object;
22154 +
22155 +       object = reiser4_inode_data(inode);
22156 +       assert("nikita-3077", spin_inode_object_is_locked(object));
22157 +
22158 +       /* counters below are examined under the eflush lock of the super block */
22159 +       spin_lock_eflush(inode->i_sb);
22160 +       assert("nikita-3146", object->anonymous_eflushed >= 0 && object->captured_eflushed >= 0);
22161 +       assert("nikita-3441", ergo(object->anonymous_eflushed > 0 || object->captured_eflushed > 0,
22162 +                                  jnode_tree_by_reiser4_inode(object)->rnode != NULL));
22163 +
22164 +       spin_unlock_eflush(inode->i_sb);
22165 +}
22166 +
22167 +/* debugging: record in current context that update of @object is delayed, or (@immediate) forget it */
22168 +void mark_inode_update(struct inode *object, int immediate)
22169 +{
22170 +       int idx;
22171 +       int slot = -1;
22172 +       reiser4_context *ctx = get_current_context();
22173 +
22174 +       for (idx = 0; idx < TRACKED_DELAYED_UPDATE; ++idx) {
22175 +               if (ctx->dirty[idx].ino == object->i_ino) {
22176 +                       slot = idx;
22177 +                       break;
22178 +               }
22179 +               if (ctx->dirty[idx].ino == 0)
22180 +                       slot = idx;
22181 +       }
22182 +       if (slot == -1)
22183 +               return; /* neither a matching nor a free slot: update is not tracked */
22184 +       if (immediate) {
22185 +               ctx->dirty[slot].ino = 0;
22186 +               return;
22187 +       }
22188 +       ctx->dirty[slot].ino = object->i_ino;
22189 +       ctx->dirty[slot].delayed = 1;
22190 +#ifdef CONFIG_FRAME_POINTER
22191 +       ctx->dirty[slot].stack[0] = __builtin_return_address(0);
22192 +       ctx->dirty[slot].stack[1] = __builtin_return_address(1);
22193 +       ctx->dirty[slot].stack[2] = __builtin_return_address(2);
22194 +       ctx->dirty[slot].stack[3] = __builtin_return_address(3);
22195 +#endif
22196 +}
22197 +
22198 +
22199 +/* returns 1 if some slot of @info is in use (ino != 0) and marked delayed, 0 otherwise */
22200 +int delayed_inode_updates(dirty_inode_info info)
22201 +{
22202 +       int slot = 0;
22203 +
22204 +       while (slot < TRACKED_DELAYED_UPDATE) {
22205 +               if (info[slot].ino != 0 && info[slot].delayed)
22206 +                       return 1;
22207 +               ++slot;
22208 +       }
22209 +       return 0; }
22210 +
22211 +#endif
22212 +
22213 +/* Make Linus happy.
22214 +   Local variables:
22215 +   c-indentation-style: "K&R"
22216 +   mode-name: "LC"
22217 +   c-basic-offset: 8
22218 +   tab-width: 8
22219 +   fill-column: 120
22220 +   End:
22221 +*/
22222 diff -rupN linux-2.6.8-rc3/fs/reiser4/inode.h linux-2.6.8-rc3-a/fs/reiser4/inode.h
22223 --- linux-2.6.8-rc3/fs/reiser4/inode.h  1970-01-01 03:00:00.000000000 +0300
22224 +++ linux-2.6.8-rc3-a/fs/reiser4/inode.h        2004-08-05 21:20:52.914696067 +0400
22225 @@ -0,0 +1,428 @@
22226 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22227 +
22228 +/* Inode functions. */
22229 +
22230 +#if !defined( __REISER4_INODE_H__ )
22231 +#define __REISER4_INODE_H__
22232 +
22233 +#include "forward.h"
22234 +#include "debug.h"
22235 +#include "spin_macros.h"
22236 +#include "key.h"
22237 +#include "kcond.h"
22238 +#include "seal.h"
22239 +#include "plugin/plugin.h"
22240 +#include "plugin/cryptcompress.h"
22241 +#include "plugin/plugin_set.h"
22242 +#include "plugin/security/perm.h"
22243 +#include "plugin/pseudo/pseudo.h"
22244 +#include "vfs_ops.h"
22245 +#include "jnode.h"
22246 +
22247 +#include <linux/types.h>       /* for __u?? , ino_t */
22248 +#include <linux/fs.h>          /* for struct super_block, struct
22249 +                                * rw_semaphore, etc  */
22250 +#include <linux/spinlock.h>
22251 +#include <asm/types.h>
22252 +
22253 +/* reiser4-specific inode flags. They are "transient" and are not
22254 +   supposed to be stored on disk. Used to track the "state" of an
22255 +   inode; passed to inode_set_flag(), inode_clr_flag(), inode_get_flag().
22256 +*/
22257 +typedef enum {
22258 +       /* this is light-weight inode, inheriting some state from its
22259 +          parent  */
22260 +       REISER4_LIGHT_WEIGHT = 0,
22261 +       /* stat data wasn't yet created */
22262 +       REISER4_NO_SD = 1,
22263 +       /* internal immutable flag. Currently is only used
22264 +           to avoid race condition during file creation.
22265 +           See comment in create_object(). */
22266 +       REISER4_IMMUTABLE = 2,
22267 +       /* inode was read from storage and is fully initialized */
22268 +       REISER4_LOADED = 3,
22269 +       /* this bit is set for symlinks. inode->u.generic_ip points to target
22270 +          name of symlink. */
22271 +       REISER4_GENERIC_PTR_USED = 4,
22272 +       /* set if size of stat-data item for this inode is known. If this is
22273 +        * set we can avoid recalculating size of stat-data on each update. */
22274 +       REISER4_SDLEN_KNOWN   = 5,
22275 +       /* reiser4_inode->crypt points to the crypto stat */
22276 +       REISER4_CRYPTO_STAT_LOADED = 6,
22277 +       /* reiser4_inode->cluster_shift holds a meaningful value */
22278 +       REISER4_CLUSTER_KNOWN = 7,
22279 +       /* cryptcompress_inode_data points to the secret key */
22280 +       REISER4_SECRET_KEY_INSTALLED = 8,
22281 +       /* File (possibly) has pages corresponding to the tail items, that
22282 +        * were created by ->readpage. It is set by mmap_unix_file() and
22283 +        * sendfile_unix_file(). This bit is inspected by write_unix_file and
22284 +        * kill-hook of tail items. It is never cleared once set. This bit is
22285 +        * modified and inspected under i_sem. */
22286 +       REISER4_HAS_MMAP = 9,
22287 +       /* file was partially converted. Its body consists of a mix of tail
22288 +        * and extent items. */
22289 +       REISER4_PART_CONV = 10,
22290 +} reiser4_file_plugin_flags;
22291 +
22292 +/* state associated with each inode.
22293 +   reiser4 inode.
22294 +
22295 +   NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
22296 +   be of the same size. File-system allocates inodes by itself through
22297 +   s_op->allocate_inode() method. So, it is possible to adjust size of inode
22298 +   at the time of its creation.
22299 +
22300 +
22301 +   Invariants involving parts of this data-type:
22302 +
22303 +      [inode->eflushed]
22304 +
22305 +*/
22306 +
22307 +typedef struct reiser4_inode reiser4_inode;
22308 +/* return pointer to reiser4-specific part of inode */
22309 +static inline reiser4_inode *
22310 +reiser4_inode_data(const struct inode * inode /* inode queried */);
22311 +
22312 +#include "plugin/file/file.h"
22313 +
22314 +#if BITS_PER_LONG == 64
22315 +/* ->i_ino can hold the whole 64bit oid: oid_hi_t is an empty placeholder */
22316 +#define REISER4_INO_IS_OID (1)
22317 +typedef struct {;
22318 +} oid_hi_t;
22319 +
22320 +/* BITS_PER_LONG == 64 */
22321 +#else
22322 +/* ->i_ino is too narrow for an oid: oid_hi_t keeps the high 32 bits */
22323 +#define REISER4_INO_IS_OID (0)
22324 +typedef __u32 oid_hi_t;
22325 +
22326 +/* BITS_PER_LONG == 64 */
22327 +#endif
22328 +
22329 +struct reiser4_inode {
22330 +       /* spin lock protecting fields of this structure. */
22331 +       reiser4_spin_data guard;
22332 +       /* object plugins */
22333 +       plugin_set *pset;
22334 +       /* plugins set for inheritance */
22335 +       plugin_set *hset;
22336 +       /* high 32 bits of object id (empty when ->i_ino holds the whole oid) */
22337 +       oid_hi_t oid_hi;
22338 +       /* seal for stat-data */
22339 +       seal_t sd_seal;
22340 +       /* locality id for this file */
22341 +       oid_t locality_id;
22342 +#if REISER4_LARGE_KEY
22343 +       __u64 ordering; /* ordering component of the key, see init_inode_ordering() */
22344 +#endif
22345 +       /* coord of stat-data in sealed node */
22346 +       coord_t sd_coord;
22347 +       /* bit-mask of stat-data extensions used by this file */
22348 +       __u64 extmask;
22349 +       /* bitmask of non-default plugins for this inode */
22350 +       __u16 plugin_mask;
22351 +       /* cluster parameter for crypto and compression */
22352 +       __u8 cluster_shift;
22353 +       /* secret key parameter for crypto */
22354 +       crypto_stat_t *crypt;
22355 +
22356 +       union {
22357 +               readdir_list_head readdir_list;
22358 +               struct list_head not_used;
22359 +       } lists;
22360 +       /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
22361 +       unsigned long flags;
22362 +       union {
22363 +               /* fields specific to unix_file plugin */
22364 +               unix_file_info_t unix_file_info;
22365 +               /* fields specific to cryptcompress plugin */
22366 +               cryptcompress_info_t cryptcompress_info;
22367 +               /* fields specific to pseudo file plugin */
22368 +               pseudo_info_t pseudo_info;
22369 +       } file_plugin_data;
22370 +       struct rw_semaphore coc_sem; /* filemap_nopage takes it for read, copy_on_capture - for write. Under this it
22371 +                              tries to unmap page for which it is called. This prevents process from using page which
22372 +                              was copied on capture */
22373 +       /* tree of jnodes. Jnodes in this tree are distinguished by radix tree
22374 +          tags */
22375 +       struct radix_tree_root jnodes_tree;
22376 +#if REISER4_DEBUG
22377 +       /* list of jnodes (debugging only). NOTE(review): the count is presumably ->nr_jnodes below - confirm */
22378 +       inode_jnodes_list_head jnodes_list;
22379 +
22380 +       /* numbers of eflushed jnodes of each type in the above tree */
22381 +       int anonymous_eflushed;
22382 +       int captured_eflushed;
22383 +       /* number of unformatted node jnodes of this file in jnode hash table */
22384 +       unsigned long nr_jnodes;
22385 +#endif
22386 +
22387 +       /* block number of virtual root for this object. See comment above
22388 +        * fs/reiser4/search.c:handle_vroot() */
22389 +       reiser4_block_nr vroot;
22390 +       struct semaphore loading; /* taken by reiser4_iget() to load the inode, released by reiser4_iget_complete() */
22391 +};
22392 +
22393 +
22394 +#define I_JNODES (512) /* inode state bit. Set when in hash table there are more than 0 jnodes of unformatted nodes of
22395 +                          an inode */
22396 +
22397 +typedef struct reiser4_inode_object {
22398 +       /* reiser4-private part; reiser4_inode_data() returns a pointer to it */
22399 +       reiser4_inode p;
22400 +       /* generic fields not specific to reiser4, but used by VFS */
22401 +       struct inode vfs_inode;
22402 +} reiser4_inode_object;
22403 +
22404 +/* map VFS @inode to the reiser4 part of the reiser4_inode_object embedding it */
22405 +static inline reiser4_inode *reiser4_inode_data(const struct inode * inode)
22406 +{
22407 +       assert("nikita-254", inode != NULL);
22408 +
22409 +       return &container_of(inode, reiser4_inode_object, vfs_inode)->p;
22410 +}
22411 +
22412 +/* inverse of reiser4_inode_data(): VFS inode of the object embedding @r4_inode */
22413 +static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *r4_inode)
22414 +{
22415 +       return &container_of(r4_inode, reiser4_inode_object, p)->vfs_inode;
22416 +}
22417 +
22418 +/*
22419 + * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
22420 + * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
22421 + * bits.
22422 + *
22423 + * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part
22424 + * of inode, otherwise whole oid is stored in i_ino.
22425 + *
22426 + * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
22427 + */
22428 +
22429 +#define OID_HI_SHIFT (sizeof(ino_t) * 8)
22430 +
22431 +#if REISER4_INO_IS_OID
22432 +
22433 +/* REISER4_INO_IS_OID: ->i_ino is the whole oid */
22434 +static inline oid_t get_inode_oid(const struct inode *inode)
22435 +{
22436 +       return inode->i_ino;
22437 +}
22438 +
22439 +/* REISER4_INO_IS_OID: whole @oid is stored in ->i_ino */
22440 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
22441 +{
22442 +       inode->i_ino = oid;
22443 +}
22444 +
22445 +/* REISER4_INO_IS_OID */
22446 +#else
22447 +
22448 +/* combine high bits kept in reiser4 part of @inode with low bits in ->i_ino */
22449 +static inline oid_t get_inode_oid(const struct inode *inode)
22450 +{
22451 +       __u64 high = reiser4_inode_data(inode)->oid_hi;
22452 +
22453 +       return (high << OID_HI_SHIFT) | inode->i_ino;
22454 +}
22455 +
22456 +/* split @oid: low bits go to ->i_ino, bits above OID_HI_SHIFT to ->oid_hi */
22457 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
22458 +{
22459 +       assert("nikita-2519", inode != NULL);
22460 +       reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
22461 +       inode->i_ino = (ino_t)(oid);
22462 +       assert("nikita-2521", get_inode_oid(inode) == (oid));
22463 +}
22464 +
22465 +/* REISER4_INO_IS_OID */
22466 +#endif
22467 +
22468 +/* locality id stored in the reiser4 part of @inode */
22469 +static inline oid_t get_inode_locality(const struct inode *inode)
22470 +{
22471 +       return reiser4_inode_data(inode)->locality_id;
22472 +}
22473 +
22474 +#if REISER4_LARGE_KEY
22475 +static inline __u64 get_inode_ordering(const struct inode *inode)
22476 +{
22477 +       return reiser4_inode_data(inode)->ordering;
22478 +}
22479 +/* note: @inode is const, the store goes through the non-const reiser4_inode_data() result */
22480 +static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
22481 +{
22482 +       reiser4_inode_data(inode)->ordering = ordering;
22483 +}
22484 +
22485 +#else
22486 +/* no ->ordering field without REISER4_LARGE_KEY: reads yield 0, writes are no-ops */
22487 +#define get_inode_ordering(inode) (0)
22488 +#define set_inode_ordering(inode, val) noop
22489 +
22490 +#endif
22491 +
22492 +/* return inode in which @uf_info is embedded */
22493 +static inline struct inode *unix_file_info_to_inode(const unix_file_info_t *uf_info)
22494 +{
22495 +       reiser4_inode_object *whole = container_of(uf_info, reiser4_inode_object, p.file_plugin_data.unix_file_info);
22496 +
22497 +       return &whole->vfs_inode;
22498 +}
22499 +
22500 +/* ordering predicate for inode spin lock. NOTE(review): this said "only jnode lock can be held", yet spin_locked_jnode == 0 is required */
22501 +#define spin_ordering_pred_inode_object(inode)                 \
22502 +       (( lock_counters() -> rw_locked_dk == 0 ) &&            \
22503 +       ( lock_counters() -> rw_locked_tree == 0 ) &&           \
22504 +       ( lock_counters() -> spin_locked_txnh == 0 ) &&         \
22505 +       ( lock_counters() -> rw_locked_zlock == 0 ) &&  \
22506 +       ( lock_counters() -> spin_locked_jnode == 0 ) &&        \
22507 +       ( lock_counters() -> spin_locked_atom == 0 ) &&         \
22508 +       ( lock_counters() -> spin_locked_ktxnmgrd == 0 ) &&     \
22509 +       ( lock_counters() -> spin_locked_txnmgr == 0 ))
22510 +
22511 +SPIN_LOCK_FUNCTIONS(inode_object, reiser4_inode, guard);
22512 +
22513 +extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
22514 +extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
22515 +
22516 +extern reiser4_tree *tree_by_inode(const struct inode *inode);
22517 +
22518 +#if REISER4_DEBUG
22519 +extern void inode_invariant(const struct inode *inode);
22520 +#else
22521 +#define inode_invariant(inode) noop
22522 +#endif
22523 +/* take inode spin lock, then run inode_invariant() (a noop unless REISER4_DEBUG) */
22524 +#define spin_lock_inode(inode)                 \
22525 +({                                             \
22526 +       LOCK_INODE(reiser4_inode_data(inode));  \
22527 +       inode_invariant(inode);                 \
22528 +})
22529 +/* run inode_invariant() while still locked, then drop inode spin lock */
22530 +#define spin_unlock_inode(inode)                       \
22531 +({                                                     \
22532 +       inode_invariant(inode);                         \
22533 +       UNLOCK_INODE(reiser4_inode_data(inode));        \
22534 +})
22535 +
22536 +extern znode *inode_get_vroot(struct inode *inode);
22537 +extern void   inode_set_vroot(struct inode *inode, znode *vroot);
22538 +extern void   inode_clean_vroot(struct inode *inode);
22539 +
22540 +extern int reiser4_max_filename_len(const struct inode *inode);
22541 +extern int max_hash_collisions(const struct inode *dir);
22542 +extern void reiser4_unlock_inode(struct inode *inode);
22543 +extern int is_reiser4_inode(const struct inode *inode);
22544 +extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
22545 +extern struct inode *reiser4_iget(struct super_block *super, const reiser4_key * key, int silent);
22546 +extern void reiser4_iget_complete (struct inode * inode);
22547 +extern int reiser4_inode_find_actor(struct inode *inode, void *opaque);
22548 +extern int get_reiser4_inode_by_key (struct inode **, const reiser4_key *);
22549 +
22550 +
22551 +extern void inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f);
22552 +extern void inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f);
22553 +extern int inode_get_flag(const struct inode *inode, reiser4_file_plugin_flags f);
22554 +
22555 +/* has inode been initialized? Returns inode_get_flag() for REISER4_LOADED. */
22556 +static inline int
22557 +is_inode_loaded(const struct inode *inode /* inode queried */ )
22558 +{
22559 +       assert("nikita-1120", inode != NULL);
22560 +       return inode_get_flag(inode, REISER4_LOADED);
22561 +}
22562 +
22563 +extern file_plugin *inode_file_plugin(const struct inode *inode);
22564 +extern dir_plugin *inode_dir_plugin(const struct inode *inode);
22565 +extern perm_plugin *inode_perm_plugin(const struct inode *inode);
22566 +extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
22567 +extern hash_plugin *inode_hash_plugin(const struct inode *inode);
22568 +extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
22569 +extern crypto_plugin *inode_crypto_plugin(const struct inode *inode);
22570 +extern digest_plugin *inode_digest_plugin(const struct inode *inode);
22571 +extern compression_plugin *inode_compression_plugin(const struct inode *inode);
22572 +extern item_plugin *inode_sd_plugin(const struct inode *inode);
22573 +extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
22574 +
22575 +extern void inode_set_plugin(struct inode *inode,
22576 +                            reiser4_plugin * plug, pset_member memb);
22577 +extern void reiser4_make_bad_inode(struct inode *inode);
22578 +
22579 +extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
22580 +extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
22581 +
22582 +/*
22583 + * update field @field in inode @i to contain value @value.
22584 + * inode_check_scale() is called with the old and the new value before the
22585 + * store. @i and @value are evaluated once each.
22586 + */
22587 +#define INODE_SET_FIELD(i, field, value)               \
22588 +({                                                     \
22589 +       struct inode *__i = (i);                        \
22590 +       typeof(value) __v = (value);                    \
22591 +                                                       \
22592 +       inode_check_scale(__i, __i->field, __v);        \
22593 +       __i->field = __v;                               \
22594 +})
22595 +
22596 +/* increment @i->@field; inode_check_scale() sees the old and new value first */
22597 +#define INODE_INC_FIELD(i, field)                              \
22598 +({                                                             \
22599 +       struct inode *__i = (i);                                \
22600 +                                                               \
22601 +       inode_check_scale(__i, __i->field, __i->field + 1);     \
22602 +       ++ __i->field;                                          \
22603 +})
22604 +
22605 +/* decrement @i->@field; inode_check_scale() sees the old and new value first */
22606 +#define INODE_DEC_FIELD(i, field)                              \
22607 +({                                                             \
22608 +       struct inode *__i = (i);                                \
22609 +                                                               \
22610 +       inode_check_scale(__i, __i->field, __i->field - 1);     \
22611 +       -- __i->field;                                          \
22612 +})
22613 +
22614 +/* See comment before readdir_common() for description. */
22615 +static inline readdir_list_head *get_readdir_list(const struct inode *inode)
22616 +{
22617 +       reiser4_inode *info = reiser4_inode_data(inode);
22618 +       return &info->lists.readdir_list;
22619 +}
22620 +
22621 +extern void init_inode_ordering(struct inode *inode,
22622 +                               reiser4_object_create_data *crd, int create);
22623 +/* address of ->jnodes_tree in reiser4_inode_data(@inode) */
22624 +static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
22625 +{
22626 +       reiser4_inode *info = reiser4_inode_data(inode);
22627 +       return &info->jnodes_tree;
22628 +}
22629 +/* same as jnode_tree_by_inode(), for callers that already have the reiser4_inode */
22630 +static inline struct radix_tree_root *
22631 +jnode_tree_by_reiser4_inode(reiser4_inode *r4_inode)
22632 +{
22633 +       return &r4_inode->jnodes_tree;
22634 +}
22635 +
22636 +#if REISER4_DEBUG_OUTPUT
22637 +extern void print_inode(const char *prefix, const struct inode *i);
22638 +#else
22639 +#define print_inode(p, i) noop
22640 +#endif
22641 +
22642 +/* __REISER4_INODE_H__ */
22643 +#endif
22644 +
22645 +/* Make Linus happy.
22646 +   Local variables:
22647 +   c-indentation-style: "K&R"
22648 +   mode-name: "LC"
22649 +   c-basic-offset: 8
22650 +   tab-width: 8
22651 +   fill-column: 120
22652 +   End:
22653 +*/
22654 diff -rupN linux-2.6.8-rc3/fs/reiser4/inode_ops.c linux-2.6.8-rc3-a/fs/reiser4/inode_ops.c
22655 --- linux-2.6.8-rc3/fs/reiser4/inode_ops.c      1970-01-01 03:00:00.000000000 +0300
22656 +++ linux-2.6.8-rc3-a/fs/reiser4/inode_ops.c    2004-08-05 21:20:53.006676666 +0400
22657 @@ -0,0 +1,643 @@
22658 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
22659 + * reiser4/README */
22660 +
22661 +/* Interface to VFS. Reiser4 inode_operations are defined here. */
22662 +
22663 +#include "forward.h"
22664 +#include "debug.h"
22665 +#include "dformat.h"
22666 +#include "coord.h"
22667 +#include "plugin/item/item.h"
22668 +#include "plugin/file/file.h"
22669 +#include "plugin/security/perm.h"
22670 +#include "plugin/disk_format/disk_format.h"
22671 +#include "plugin/plugin.h"
22672 +#include "plugin/plugin_set.h"
22673 +#include "plugin/object.h"
22674 +#include "txnmgr.h"
22675 +#include "jnode.h"
22676 +#include "znode.h"
22677 +#include "block_alloc.h"
22678 +#include "tree.h"
22679 +#include "log.h"
22680 +#include "vfs_ops.h"
22681 +#include "inode.h"
22682 +#include "page_cache.h"
22683 +#include "ktxnmgrd.h"
22684 +#include "super.h"
22685 +#include "reiser4.h"
22686 +#include "kattr.h"
22687 +#include "entd.h"
22688 +#include "emergency_flush.h"
22689 +
22690 +#include <linux/profile.h>
22691 +#include <linux/types.h>
22692 +#include <linux/mount.h>
22693 +#include <linux/vfs.h>
22694 +#include <linux/mm.h>
22695 +#include <linux/buffer_head.h>
22696 +#include <linux/dcache.h>
22697 +#include <linux/list.h>
22698 +#include <linux/pagemap.h>
22699 +#include <linux/slab.h>
22700 +#include <linux/seq_file.h>
22701 +#include <linux/init.h>
22702 +#include <linux/module.h>
22703 +#include <linux/writeback.h>
22704 +#include <linux/backing-dev.h>
22705 +#include <linux/quotaops.h>
22706 +#include <linux/security.h>
22707 +
22708 +/* inode operations */
22709 +
22710 +static int reiser4_create(struct inode *, struct dentry *, int,
22711 +                         struct nameidata *);
22712 +static struct dentry *reiser4_lookup(struct inode *, struct dentry *,
22713 +                                    struct nameidata *);
22714 +static int reiser4_link(struct dentry *, struct inode *, struct dentry *);
22715 +static int reiser4_unlink(struct inode *, struct dentry *);
22716 +static int reiser4_rmdir(struct inode *, struct dentry *);
22717 +static int reiser4_symlink(struct inode *, struct dentry *, const char *);
22718 +static int reiser4_mkdir(struct inode *, struct dentry *, int);
22719 +static int reiser4_mknod(struct inode *, struct dentry *, int, dev_t);
22720 +static int reiser4_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
22721 +static int reiser4_readlink(struct dentry *, char *, int);
22722 +static int reiser4_follow_link(struct dentry *, struct nameidata *);
22723 +static void reiser4_truncate(struct inode *);
22724 +static int reiser4_permission(struct inode *, int, struct nameidata *);
22725 +static int reiser4_setattr(struct dentry *, struct iattr *);
22726 +static int reiser4_getattr(struct vfsmount *mnt, struct dentry *, struct kstat *);
22727 +
22728 +#if 0
22729 +static int reiser4_setxattr(struct dentry *, const char *, void *, size_t, int);
22730 +static ssize_t reiser4_getxattr(struct dentry *, const char *, void *, size_t);
22731 +static ssize_t reiser4_listxattr(struct dentry *, char *, size_t);
22732 +static int reiser4_removexattr(struct dentry *, const char *);
22733 +#endif
22734 +
22735 +reiser4_internal int invoke_create_method(struct inode *parent,
22736 +                                         struct dentry *dentry,
22737 +                                         reiser4_object_create_data * data);
22738 +
22739 +/* ->create() VFS method in reiser4 inode_operations: passes a descriptor with
22740 + * S_IFREG | @mode and UNIX_FILE_PLUGIN_ID to invoke_create_method().
22741 + * @nameidata is not used. */
22742 +static int
22743 +reiser4_create(struct inode *parent, struct dentry *dentry, int mode,
22744 +              struct nameidata *nameidata)
22745 +{
22746 +       reiser4_object_create_data crd;
22747 +
22748 +       reiser4_stat_inc_at(parent->i_sb, vfs_calls.create);
22749 +
22750 +       /* start from an all-zero descriptor, then fill in what is known */
22751 +       xmemset(&crd, 0, sizeof crd);
22752 +       crd.id = UNIX_FILE_PLUGIN_ID;
22753 +       crd.mode = S_IFREG | mode;
22754 +
22755 +       return invoke_create_method(parent, dentry, &crd);
22756 +}
22757 +
22758 +/* ->mkdir() VFS method in reiser4 inode_operations */
22759 +static int
22760 +reiser4_mkdir(struct inode *parent /* inode of parent directory */ ,
22761 +             struct dentry *dentry /* dentry of new object to create */ ,
22762 +             int mode /* new object's mode */ )
22763 +{
22764 +       reiser4_object_create_data data;
22765 +
22766 +       /* as in reiser4_create(): do not pass uninitialized fields down */
22767 +       xmemset(&data, 0, sizeof data);
22768 +       reiser4_stat_inc_at(parent->i_sb, vfs_calls.mkdir);
22769 +
22770 +       data.mode = S_IFDIR | mode;
22771 +       data.id = DIRECTORY_FILE_PLUGIN_ID;
22772 +       return invoke_create_method(parent, dentry, &data);
22773 +}
22774 +
22775 +/* ->symlink() VFS method in reiser4 inode_operations */
22776 +static int
22777 +reiser4_symlink(struct inode *parent /* inode of parent directory */ ,
22778 +               struct dentry *dentry /* dentry of new object to create */ ,
22779 +               const char *linkname /* pathname to put into symlink */ )
22780 +{
22781 +       reiser4_object_create_data data;
22782 +
22783 +       /* as in reiser4_create(): do not pass uninitialized fields (e.g.
22784 +        * data.rdev, which is not set below) down to the plugin */
22785 +       xmemset(&data, 0, sizeof data);
22786 +       reiser4_stat_inc_at(parent->i_sb, vfs_calls.symlink);
22787 +
22788 +       data.name = linkname;
22789 +       data.id = SYMLINK_FILE_PLUGIN_ID;
22790 +       data.mode = S_IFLNK | S_IRWXUGO;
22791 +       return invoke_create_method(parent, dentry, &data);
22792 +}
22793 +
22794 +/* ->mknod() VFS method in reiser4 inode_operations */
22795 +static int
22796 +reiser4_mknod(struct inode *parent /* inode of parent directory */ ,
22797 +             struct dentry *dentry /* dentry of new object to create */ ,
22798 +             int mode /* new object's mode */ ,
22799 +             dev_t rdev /* minor and major of new device node */ )
22800 +{
22801 +       reiser4_object_create_data data;
22802 +
22803 +       xmemset(&data, 0, sizeof data); /* as reiser4_create() does: no uninitialized fields */
22804 +       reiser4_stat_inc_at(parent->i_sb, vfs_calls.mknod);
22805 +
22806 +       data.mode = mode;
22807 +       data.rdev = rdev;
22808 +       data.id = SPECIAL_FILE_PLUGIN_ID;
22809 +       return invoke_create_method(parent, dentry, &data);
22810 +}
22811 +
22812 +/* ->rename() inode operation: permission check, then ->rename() of @old_dir's directory plugin */
22813 +static int
22814 +reiser4_rename(struct inode *old_dir, struct dentry *old, struct inode *new_dir, struct dentry *new)
22815 +{
22816 +       reiser4_context ctx;
22817 +       dir_plugin *dir_plug;
22818 +       int ret;
22819 +
22820 +       assert("nikita-2314", old_dir != NULL);
22821 +       assert("nikita-2315", old != NULL);
22822 +       assert("nikita-2316", new_dir != NULL);
22823 +       assert("nikita-2317", new != NULL);
22824 +
22825 +       init_context(&ctx, old_dir->i_sb);
22826 +       reiser4_stat_inc(vfs_calls.rename);
22827 +
22828 +       ret = perm_chk(old_dir, rename, old_dir, old, new_dir, new);
22829 +       if (ret != 0)
22830 +               goto out;
22831 +       dir_plug = inode_dir_plugin(old_dir);
22832 +       if (dir_plug == NULL)
22833 +               ret = RETERR(-ENOTDIR);
22834 +       else if (dir_plug->rename == NULL)
22835 +               ret = RETERR(-EPERM);
22836 +       else
22837 +               ret = dir_plug->rename(old_dir, old, new_dir, new);
22838 +out:
22839 +       context_set_commit_async(&ctx);
22840 +       reiser4_exit_context(&ctx);
22841 +       return ret;
22842 +}
22843 +
22844 +/* reiser4_lookup() - entry point for ->lookup() method.
22845 +
22846 +   Uses ->lookup() of @parent's directory plugin or, if that is missing and
22847 +   REISER4_NO_PSEUDO is not set, lookup_pseudo_file(). Returns ERR_PTR() on
22848 +   failure; -ENOENT from lookup is turned into d_add(dentry, NULL) and NULL.
22849 +   This is installed in ->lookup() in reiser4_inode_operations.
22850 +*/
22851 +static struct dentry *
22852 +reiser4_lookup(struct inode *parent,   /* directory within which we are to
22853 +                                        * look for the name specified in
22854 +                                        * dentry */
22855 +              struct dentry *dentry,   /* this contains the name that is to
22856 +                                          be looked for on entry, and on exit
22857 +                                          contains a filled in dentry with a
22858 +                                          pointer to the inode (unless name
22859 +                                          not found) */
22860 +              struct nameidata *nameidata)
22861 +{
22862 +       dir_plugin *dplug;
22863 +       int retval;
22864 +       struct dentry *result;
22865 +       reiser4_context ctx;
22866 +       int (*lookup) (struct inode * parent_inode, struct dentry **dentry);
22867 +
22868 +       assert("nikita-403", parent != NULL);
22869 +       assert("nikita-404", dentry != NULL);
22870 +
22871 +       init_context(&ctx, parent->i_sb);
22872 +       reiser4_stat_inc(vfs_calls.lookup);
22873 +
22874 +       /* find @parent directory plugin and make sure that it has lookup
22875 +          method */
22876 +       dplug = inode_dir_plugin(parent);
22877 +       if (dplug != NULL && dplug->lookup != NULL)
22878 +               /* if parent directory has directory plugin with ->lookup
22879 +                * method, use the latter to do lookup */
22880 +               lookup = dplug->lookup;
22881 +       else if (!reiser4_is_set(parent->i_sb, REISER4_NO_PSEUDO))
22882 +               /* even if there is no ->lookup method, pseudo file lookup
22883 +                * should still be performed, but only unless we are in
22884 +                * "no-pseudo" mode */
22885 +               lookup = lookup_pseudo_file;
22886 +       else
22887 +               lookup = NULL;
22888 +       if (lookup != NULL) {
22889 +               struct dentry *name;
22890 +
22891 +               name = dentry;
22892 +               /* call its lookup method */
22893 +               retval = lookup(parent, &name);
22894 +               if (retval == 0) {
22895 +                       if (name == NULL) {
22896 +                               /*
22897 +                                * new object was looked up. Initialize it.
22898 +                                */
22899 +                               struct inode *obj;
22900 +                               file_plugin *fplug;
22901 +
22902 +                               obj = dentry->d_inode;
22903 +                               assert("nikita-2645", obj != NULL);
22904 +                               fplug = inode_file_plugin(obj);
22905 +                               retval = fplug->bind(obj, parent);
22906 +                       }
22907 +               } else if (retval == -ENOENT) {
22908 +                       /* object not found */
22909 +                       d_add(dentry, NULL);
22910 +                       retval = 0;
22911 +                       name = NULL;
22912 +               }
22913 +
22914 +               if (retval == 0)
22915 +                       /* success */
22916 +                       result = name;
22917 +               else
22918 +                       result = ERR_PTR(retval);
22919 +       } else
22920 +               result = ERR_PTR(-ENOTDIR);
22921 +
22922 +       /* prevent balance_dirty_pages() from being called: we don't want to
22923 +        * do this under directory i_sem. */
22924 +       context_set_commit_async(&ctx);
22925 +       reiser4_exit_context(&ctx);
22926 +       return result;
22927 +}
22928 +
22929 +/* ->readlink() inode method, returns content of symbolic link (kept in ->u.generic_ip of the inode) */
22930 +static int
22931 +reiser4_readlink(struct dentry *dentry, char *buf, int buflen)
22932 +{
22933 +       assert("vs-852", S_ISLNK(dentry->d_inode->i_mode));
22934 +       reiser4_stat_inc_at(dentry->d_inode->i_sb, vfs_calls.readlink);
22935 +       if (dentry->d_inode->u.generic_ip != NULL && inode_get_flag(dentry->d_inode, REISER4_GENERIC_PTR_USED))
22936 +               return vfs_readlink(dentry, buf, buflen, dentry->d_inode->u.generic_ip);
22937 +       return RETERR(-EINVAL);
22938 +}
22939 +
22940 +/* ->follow_link() inode method. Follows a symbolic link whose target is kept in ->u.generic_ip */
22941 +static int
22942 +reiser4_follow_link(struct dentry *dentry, struct nameidata *data)
22943 +{
22944 +       struct inode *inode = dentry->d_inode;
22945 +       assert("vs-851", S_ISLNK(inode->i_mode));
22946 +       reiser4_stat_inc_at(inode->i_sb, vfs_calls.follow_link);
22947 +       if (inode->u.generic_ip == NULL || !inode_get_flag(inode, REISER4_GENERIC_PTR_USED))
22948 +               return RETERR(-EINVAL);
22949 +       return vfs_follow_link(data, inode->u.generic_ip);
22950 +}
22951 +
22952 +/* ->setattr() inode operation
22953 +
22954 +   Called from notify_change. Runs the setattr permission check, then either
22955 +   calls ->setattr() of the file plugin or fails with -E_REPEAT when the inode
22956 +   carries REISER4_IMMUTABLE. */
22957 +static int
22958 +reiser4_setattr(struct dentry *dentry, struct iattr *attr)
22959 +{
22960 +       reiser4_context ctx;
22961 +       struct inode *obj;
22962 +       int ret;
22963 +
22964 +       assert("nikita-2269", attr != NULL);
22965 +       obj = dentry->d_inode;
22966 +       assert("vs-1108", obj != NULL);
22967 +
22968 +       init_context(&ctx, obj->i_sb);
22969 +       reiser4_stat_inc(vfs_calls.setattr);
22970 +       ret = perm_chk(obj, setattr, dentry, attr);
22971 +       if (ret == 0 && inode_get_flag(obj, REISER4_IMMUTABLE))
22972 +               ret = RETERR(-E_REPEAT);
22973 +       else if (ret == 0) {
22974 +               file_plugin *fplug = inode_file_plugin(obj);
22975 +
22976 +               assert("nikita-2271", fplug != NULL);
22977 +               assert("nikita-2296", fplug->setattr != NULL);
22978 +               ret = fplug->setattr(obj, attr);
22979 +       }
22980 +       context_set_commit_async(&ctx);
22981 +       reiser4_exit_context(&ctx);
22982 +       return ret;
22983 +}
22984 +
22985 +/* ->getattr() inode operation called (indirectly) by sys_stat(): permission
22986 + * check first, then ->getattr() of the inode's file plugin does the work */
22987 +static int
22988 +reiser4_getattr(struct vfsmount *mnt UNUSED_ARG, struct dentry *dentry, struct kstat *stat)
22989 +{
22990 +       struct inode *obj = dentry->d_inode;
22991 +       reiser4_context ctx;
22992 +       file_plugin *fplug;
22993 +       int ret;
22994 +
22995 +       init_context(&ctx, obj->i_sb);
22996 +       reiser4_stat_inc(vfs_calls.getattr);
22997 +       ret = perm_chk(obj, getattr, mnt, dentry, stat);
22998 +       if (ret != 0)
22999 +               goto out;
23000 +       fplug = inode_file_plugin(obj);
23001 +       assert("nikita-2295", fplug != NULL);
23002 +       assert("nikita-2297", fplug->getattr != NULL);
23003 +       ret = fplug->getattr(mnt, dentry, stat);
23004 +out:
23005 +       reiser4_exit_context(&ctx);
23006 +       return ret;
23007 +}
23008 +
23009 +/* helper function: call object plugin to truncate file to @size; returns what ->truncate() returned */
23010 +static int
23011 +truncate_object(struct inode *inode /* object to truncate */ ,
23012 +               loff_t size /* size to truncate object to */ )
23013 +{
23014 +       file_plugin *fplug;
23015 +       int result;
23016 +
23017 +       assert("nikita-1026", inode != NULL);
23018 +       assert("nikita-1027", is_reiser4_inode(inode));
23019 +       assert("nikita-1028", inode->i_sb != NULL);
23020 +
23021 +       write_syscall_log("%llu %lli", get_inode_oid(inode), size);
23022 +
23023 +       fplug = inode_file_plugin(inode);
23024 +       assert("vs-142", fplug != NULL);
23025 +       assert("nikita-2933", fplug->truncate != NULL);
23026 +       result = fplug->truncate(inode, size);
23027 +       if (result != 0) /* oid is printed unsigned, as in the log call above */
23028 +               warning("nikita-1602", "Truncate error: %i for %llu", result,
23029 +                       (unsigned long long) get_inode_oid(inode));
23030 +
23031 +       write_syscall_log("ex");
23032 +       return result;
23033 +}
23034 +
23035 +/* ->truncate() VFS method in reiser4 inode_operations: truncates @inode to
23036 + * its current ->i_size via truncate_object() */
23037 +static void
23038 +reiser4_truncate(struct inode *inode /* inode to truncate */ )
23039 +{
23040 +       reiser4_context ctx;
23041 +
23042 +       assert("umka-075", inode != NULL);
23043 +
23044 +       init_context(&ctx, inode->i_sb);
23045 +       reiser4_stat_inc(vfs_calls.truncate);
23046 +       ON_TRACE(TRACE_VFS_OPS, "TRUNCATE: i_ino %li to size %lli\n", inode->i_ino, inode->i_size);
23047 +
23048 +       /* ->truncate() VFS call returns void, so the result of
23049 +        * truncate_object() cannot be passed up and is dropped here */
23050 +       (void) truncate_object(inode, inode->i_size);
23051 +       reiser4_exit_context(&ctx);
23052 +}
23053 +
23054 +/* ->permission() method in reiser4_inode_operations; @nameidata is not used here. */
23055 +static int
23056 +reiser4_permission(struct inode *inode /* object */ ,
23057 +                  int mask,    /* mode bits to check permissions
23058 +                                * for */
23059 +                  struct nameidata *nameidata)
23060 +{
23061 +       /* reiser4_context creation/destruction removed from here,
23062 +          because permission checks currently don't require this.
23063 +
23064 +          Permission plugin has to create context itself if necessary. */
23065 +       assert("nikita-1687", inode != NULL);
23066 +
23067 +       return perm_chk(inode, mask, inode, mask);
23068 +}
23069 +
23070 +/* common part of both unlink and rmdir: calls ->unlink() of @parent's directory
23071 + * plugin (-EPERM if it has none). Arguments are asserted before first use. */
23072 +static int
23073 +unlink_file(struct inode *parent /* parent directory */ ,
23074 +           struct dentry *victim /* name of object being unlinked */ )
23075 +{
23076 +       int result;
23077 +       dir_plugin *dplug;
23078 +       reiser4_context ctx;
23079 +
23080 +       assert("nikita-1435", parent != NULL);
23081 +       assert("nikita-1436", victim != NULL);
23082 +
23083 +       init_context(&ctx, parent->i_sb);
23084 +       write_syscall_log("%s", victim->d_name.name);
23085 +
23086 +       ON_TRACE(TRACE_DIR | TRACE_VFS_OPS, "unlink: %lli/%s\n",
23087 +                get_inode_oid(parent), victim->d_name.name);
23088 +
23089 +       dplug = inode_dir_plugin(parent);
23090 +       assert("nikita-1429", dplug != NULL);
23091 +       if (dplug->unlink != NULL)
23092 +               result = dplug->unlink(parent, victim);
23093 +       else
23094 +               result = RETERR(-EPERM);
23095 +       write_syscall_log("ex");
23096 +       /* @victim can be already removed from the disk by this time. Inode is
23097 +          then marked so that iput() wouldn't try to remove stat data. But
23098 +          inode itself is still there.
23099 +       */
23100 +       /* we cannot release directory semaphore here, because name has
23101 +        * already been deleted, but dentry (@victim) still exists. */
23102 +       /* prevent balance_dirty_pages() from being called: we don't want to
23103 +        * do this under directory i_sem. */
23104 +
23105 +       context_set_commit_async(&ctx);
23106 +       reiser4_exit_context(&ctx);
23107 +       return result;
23108 +}
23109 +
23110 +/* ->unlink() VFS method in reiser4 inode_operations
23111 +
23112 +   remove link from @parent directory to @victim object. Objects that have a
23113 +   directory plugin are refused with -EISDIR, the rest goes to unlink_file().
23114 +*/
23115 +/* Audited by: umka (2002.06.12) */
23116 +static int
23117 +reiser4_unlink(struct inode *parent /* parent directory */ ,
23118 +              struct dentry *victim /* name of object being unlinked */ )
23119 +{
23120 +       assert("nikita-2011", parent != NULL);
23121 +       assert("nikita-2012", victim != NULL);
23122 +       assert("nikita-2013", victim->d_inode != NULL);
23123 +
23124 +       reiser4_stat_inc_at(parent->i_sb,vfs_calls.unlink);
23125 +       if (inode_dir_plugin(victim->d_inode) != NULL)
23126 +               /* objects with a directory plugin are left to ->rmdir() */
23127 +               return RETERR(-EISDIR);
23128 +       return unlink_file(parent, victim);
23129 +}
23130 +
23131 +/* ->rmdir() VFS method in reiser4 inode_operations
23132 +
23133 +   The same as unlink, but only for objects that have a directory plugin;
23134 +   anything else is refused with -ENOTDIR.
23135 +*/
23136 +/* Audited by: umka (2002.06.12) */
23137 +static int
23138 +reiser4_rmdir(struct inode *parent /* parent directory */ ,
23139 +             struct dentry *victim /* name of directory being unlinked */ )
23140 +{
23141 +       assert("nikita-2014", parent != NULL);
23142 +       assert("nikita-2015", victim != NULL);
23143 +       assert("nikita-2016", victim->d_inode != NULL);
23144 +
23145 +       reiser4_stat_inc_at(parent->i_sb, vfs_calls.rmdir);
23146 +
23147 +       if (inode_dir_plugin(victim->d_inode) == NULL)
23148 +               return RETERR(-ENOTDIR);
23149 +
23150 +       /* there is no difference between unlink and rmdir for reiser4: both
23151 +        * end up in unlink_file() */
23152 +       return unlink_file(parent, victim);
23153 +}
23154 +
23155 +/* ->link() VFS method in reiser4 inode_operations
23156 +
23157 +   Calls ->link() of @parent's directory plugin (-EPERM if there is none) and
23158 +   instantiates @where with @existing's inode on success. Both ->i_sem
23159 +   semaphores are dropped around reiser4_exit_context() and re-taken after it.
23160 +   NOTE(review): presumably because leaving the context may block - confirm.
23161 +*/
23162 +/* Audited by: umka (2002.06.12) */
23163 +static int
23164 +reiser4_link(struct dentry *existing   /* dentry of existing
23165 +                                        * object */ ,
23166 +            struct inode *parent /* parent directory */ ,
23167 +            struct dentry *where /* new name for @existing */ )
23168 +{
23169 +       int result;
23170 +       dir_plugin *dplug;
23171 +       reiser4_context ctx;
23172 +
23173 +       assert("umka-080", existing != NULL);
23174 +       assert("nikita-1031", parent != NULL);
23175 +
23176 +       init_context(&ctx, parent->i_sb);
23177 +       context_set_commit_async(&ctx);
23178 +       reiser4_stat_inc(vfs_calls.link);
23179 +
23180 +       dplug = inode_dir_plugin(parent);
23181 +       assert("nikita-1430", dplug != NULL);
23182 +       if (dplug->link != NULL) {
23183 +               result = dplug->link(parent, existing, where);
23184 +               if (result == 0)
23185 +                       d_instantiate(where, existing->d_inode);
23186 +       } else {
23187 +               result = RETERR(-EPERM);
23188 +       }
23189 +       up(&existing->d_inode->i_sem);
23190 +       up(&parent->i_sem);
23191 +       reiser4_exit_context(&ctx);
23192 +       down(&parent->i_sem);
23193 +       down(&existing->d_inode->i_sem);
23194 +       return result;
23195 +}
23196 +
23197 +/* call ->create_child() method of @parent's directory plugin and, on success,
23198 + * instantiate @dentry with the new inode. Returns 0 or a negative error:
23199 + * -ENOTDIR if @parent has no directory plugin, -EPERM if the plugin has no
23200 + * ->create_child(), otherwise whatever ->create_child() returned. */
23201 +reiser4_internal int
23202 +invoke_create_method(struct inode *parent /* parent directory */ ,
23203 +                    struct dentry *dentry /* dentry of new object */ ,
23204 +                    reiser4_object_create_data * data  /* information necessary
23205 +                                                        * to create new object */ )
23206 +{
23207 +       int result;
23208 +       dir_plugin *dplug;
23209 +       reiser4_context ctx;
23210 +
23211 +       /* check arguments before init_context() and the log call dereference them */
23212 +       assert("nikita-426", parent != NULL);
23213 +       assert("nikita-427", dentry != NULL);
23214 +       assert("nikita-428", data != NULL);
23215 +
23216 +       init_context(&ctx, parent->i_sb);
23217 +       context_set_commit_async(&ctx);
23218 +       write_syscall_log("%s %o", dentry->d_name.name, data->mode);
23219 +
23220 +       dplug = inode_dir_plugin(parent);
23221 +       if (dplug == NULL)
23222 +               result = RETERR(-ENOTDIR);
23223 +       else if (dplug->create_child != NULL) {
23224 +               struct inode *child;
23225 +
23226 +               child = NULL;
23227 +
23228 +               data->parent = parent;
23229 +               data->dentry = dentry;
23230 +
23231 +               result = dplug->create_child(data, &child);
23232 +               if (unlikely(result != 0)) {
23233 +                       if (child != NULL) {
23234 +                               /*
23235 +                                * what we actually want to check in the
23236 +                                * assertion below is that @child only
23237 +                                * contains items that iput()->... is going to
23238 +                                * remove (usually stat-data). Obvious check
23239 +                                * for child->i_size == 0 doesn't work for
23240 +                                * symlinks.
23241 +                                */
23242 +                               assert("nikita-3140", S_ISLNK(child->i_mode) ||
23243 +                                      child->i_size == 0);
23244 +                               reiser4_make_bad_inode(child);
23245 +                               iput(child);
23246 +                       }
23247 +               } else {
23248 +                       d_instantiate(dentry, child);
23249 +                       ON_TRACE(TRACE_VFS_OPS, "create: %s (%o) %llu\n",
23250 +                                dentry->d_name.name,
23251 +                                data->mode, get_inode_oid(child));
23252 +               }
23253 +       } else
23254 +               result = RETERR(-EPERM);
23255 +
23256 +       write_syscall_log("ex");
23257 +
23258 +       reiser4_exit_context(&ctx);
23259 +       return result;
23260 +}
23261 +/* ->readlink and ->follow_link are NULL here; see reiser4_symlink_inode_operations */
23262 +struct inode_operations reiser4_inode_operations = {
23263 +       .create = reiser4_create,       /* d */
23264 +       .lookup = reiser4_lookup,       /* d */
23265 +       .link = reiser4_link,   /* d */
23266 +       .unlink = reiser4_unlink,       /* d */
23267 +       .symlink = reiser4_symlink,     /* d */
23268 +       .mkdir = reiser4_mkdir, /* d */
23269 +       .rmdir = reiser4_rmdir, /* d */
23270 +       .mknod = reiser4_mknod, /* d */
23271 +       .rename = reiser4_rename,       /* d */
23272 +       .readlink = NULL,
23273 +       .follow_link = NULL,
23274 +       .truncate = reiser4_truncate,   /* d */
23275 +       .permission = reiser4_permission,       /* d */
23276 +       .setattr = reiser4_setattr,     /* d */
23277 +       .getattr = reiser4_getattr,     /* d */
23278 +};
23279 +
23280 +struct inode_operations reiser4_symlink_inode_operations = {
23281 +       .readlink    = reiser4_readlink,
23282 +       .follow_link = reiser4_follow_link,
23283 +       .setattr     = reiser4_setattr,         /* d */
23284 +       .getattr     = reiser4_getattr,         /* d */
23285 +};
23286 +
23287 +struct inode_operations reiser4_special_inode_operations = {
23288 +       .getattr = reiser4_getattr,     /* d */
23289 +       .setattr = reiser4_setattr,     /* d */
23290 +};
23291 +
23292 +/* Make Linus happy.
23293 +   Local variables:
23294 +   c-indentation-style: "K&R"
23295 +   mode-name: "LC"
23296 +   c-basic-offset: 8
23297 +   tab-width: 8
23298 +   fill-column: 120
23299 +   End:
23300 +*/
23301 diff -rupN linux-2.6.8-rc3/fs/reiser4/interpolate.c linux-2.6.8-rc3-a/fs/reiser4/interpolate.c
23302 --- linux-2.6.8-rc3/fs/reiser4/interpolate.c    1970-01-01 03:00:00.000000000 +0300
23303 +++ linux-2.6.8-rc3-a/fs/reiser4/interpolate.c  2004-08-05 21:20:53.394594845 +0400
23304 @@ -0,0 +1,20 @@
23305 +/* We will use @ as the symbol for dereferencing, we won't use * because
23306 +we want to reserve it for use as a wildcard someday.
23307 +
23308 +Inheriting stat data from source_filename can be done as:
23309 +
23310 +target_filename/mode<=@source_filename/mode
23311 +
23312 +File body inheritance is accomplished by extending symlink functionality:
23313 +
23314 +file_body_inheritance example:
23315 +
23316 +target_filename/symlink<=`@freshly_interpolate_this_filename_whenever_resolving_target_filename+`here is some text stored directly in the symlink''+@interpolate_this_filename_at_symlink_creation_time+`@freshly_interpolate_this_filename2_whenever_resolving_target_filename+"this is some more text that is directly embedded in the symlink"'
23317 +
23318 +Mr. Demidov, flesh this out in detail, being careful to worry about
23319 +how to write to interpolated files.  I think you need to interpret
23320 +strings that are between interpolations as the delimiters of those
23321 +interpolations, and changing those strings can then only be done by
23322 +writing to filename/sym.
23323 +
23324 +*/
23325 diff -rupN linux-2.6.8-rc3/fs/reiser4/ioctl.h linux-2.6.8-rc3-a/fs/reiser4/ioctl.h
23326 --- linux-2.6.8-rc3/fs/reiser4/ioctl.h  1970-01-01 03:00:00.000000000 +0300
23327 +++ linux-2.6.8-rc3-a/fs/reiser4/ioctl.h        2004-08-05 21:20:53.282618463 +0400
23328 @@ -0,0 +1,41 @@
23329 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
23330 + * reiser4/README */
23331 +
23332 +#if !defined( __REISER4_IOCTL_H__ )
23333 +#define __REISER4_IOCTL_H__
23334 +
23335 +#include <linux/fs.h>
23336 +
23337 +/*
23338 + * ioctl(2) command used to "unpack" reiser4 file, that is, convert it into
23339 + * extents and fix in this state. This is used by applications that rely on
23340 + *
23341 + *     . files being block aligned, and
23342 + *
23343 + *     . files never migrating on disk
23344 + *
23345 + * for example, boot loaders (LILO) need this.
23346 + *
23347 + * This ioctl should be used as
23348 + *
23349 + *     result = ioctl(fd, REISER4_IOC_UNPACK);
23350 + *
23351 + * File behind fd descriptor will be converted to the extents (if necessary),
23352 + * and its stat-data will be updated so that it will never be converted back
23353 + * into tails again.
23354 + */
23355 +#define REISER4_IOC_UNPACK _IOW(0xCD,1,long)
23356 +
23357 +/* __REISER4_IOCTL_H__ */
23358 +#endif
23359 +
23360 +/* Make Linus happy.
23361 +   Local variables:
23362 +   c-indentation-style: "K&R"
23363 +   mode-name: "LC"
23364 +   c-basic-offset: 8
23365 +   tab-width: 8
23366 +   fill-column: 120
23367 +   scroll-step: 1
23368 +   End:
23369 +*/
23370 diff -rupN linux-2.6.8-rc3/fs/reiser4/jnode.c linux-2.6.8-rc3-a/fs/reiser4/jnode.c
23371 --- linux-2.6.8-rc3/fs/reiser4/jnode.c  1970-01-01 03:00:00.000000000 +0300
23372 +++ linux-2.6.8-rc3-a/fs/reiser4/jnode.c        2004-08-05 21:20:53.469579029 +0400
23373 @@ -0,0 +1,2114 @@
23374 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
23375 + * reiser4/README */
23376 +/* Jnode manipulation functions. */
23377 +/* Jnode is entity used to track blocks with data and meta-data in reiser4.
23378 +
23379 +   In particular, jnodes are used to track transactional information
23380 +   associated with each block. Each znode contains jnode as ->zjnode field.
23381 +
23382 +   Jnode stands for either Josh or Journal node.
23383 +*/
23384 +
23385 +/*
23386 + * Taxonomy.
23387 + *
23388 + *     Jnode represents block containing data or meta-data. There are jnodes
23389 + *     for:
23390 + *
23391 + *         unformatted blocks (jnodes proper). There are plans, however to
23392 + *         have a handle per extent unit rather than per each unformatted
23393 + *         block, because there are so many of them.
23394 + *
23395 + *         For bitmaps. Each bitmap is actually represented by two jnodes--one
23396 + *         for working and another for "commit" data, together forming bnode.
23397 + *
23398 + *         For io-heads. These are used by log writer.
23399 + *
23400 + *         For formatted nodes (znode). See comment at the top of znode.c for
23401 + *         details specific to the formatted nodes (znodes).
23402 + *
23403 + * Node data.
23404 + *
23405 + *     Jnode provides access to the data of node it represents. Data are
23406 + *     stored in a page. Page is kept in a page cache. This means, that jnodes
23407 + *     are highly interconnected with page cache and VM internals.
23408 + *
23409 + *     jnode has a pointer to page (->pg) containing its data. Pointer to data
23410 + *     themselves is cached in ->data field to avoid frequent calls to
23411 + *     page_address().
23412 + *
23413 + *     jnode and page are attached to each other by jnode_attach_page(). This
23414 + *     function places pointer to jnode in page->private, sets PG_private flag
23415 + *     and increments page counter.
23416 + *
23417 + *     Opposite operation is performed by page_clear_jnode().
23418 + *
23419 + *     jnode->pg is protected by jnode spin lock, and page->private is
23420 + *     protected by page lock. See comment at the top of page_cache.c for
23421 + *     more.
23422 + *
23423 + *     page can be detached from jnode for two reasons:
23424 + *
23425 + *         . jnode is removed from a tree (file is truncated, or formatted
23426 + *         node is removed by balancing).
23427 + *
23428 + *         . during memory pressure, VM calls ->releasepage() method
23429 + *         (reiser4_releasepage()) to evict page from memory.
23430 + *
23431 + *    (there, of course, is also umount, but this is special case we are not
23432 + *    concerned with here).
23433 + *
23434 + *    To protect jnode page from eviction, one calls jload() function that
23435 + *    "pins" page in memory (loading it if necessary), increments
23436 + *    jnode->d_count, and kmap()s page. Page is unpinned through call to
23437 + *    jrelse().
23438 + *
23439 + * Jnode life cycle.
23440 + *
23441 + *    jnode is created, placed in hash table, and, optionally, in per-inode
23442 + *    radix tree. Page can be attached to jnode, pinned, released, etc.
23443 + *
23444 + *    When jnode is captured into atom its reference counter is
23445 + *    increased. While being part of an atom, jnode can be "early
23446 + *    flushed". This means that as part of flush procedure, jnode is placed
23447 + *    into "relocate set", and its page is submitted to the disk. After io
23448 + *    completes, page can be detached, then loaded again, re-dirtied, etc.
23449 + *
23450 + *    Thread acquires reference to jnode by calling jref() and releases it by
23451 + *    jput(). When last reference is removed, jnode is still retained in
23452 + *    memory (cached) if it has page attached, _unless_ it is scheduled for
23453 + *    destruction (has JNODE_HEARD_BANSHEE bit set).
23454 + *
23455 + *    Tree read-write lock was used as "existential" lock for jnodes. That is,
23456 + *    jnode->x_count could be changed from 0 to 1 only under tree write lock,
23457 + *    that is, tree lock protected unreferenced jnodes stored in the hash
23458 + *    table, from recycling.
23459 + *
23460 + *    This resulted in high contention on tree lock, because jref()/jput() is
23461 + *    frequent operation. To ameliorate this problem, RCU is used: when jput()
23462 + *    is just about to release last reference on jnode it sets JNODE_RIP bit
23463 + *    on it, and then proceeds with jnode destruction (removing jnode from hash
23464 + *    table, cbk_cache, detaching page, etc.). All places that change jnode
23465 + *    reference counter from 0 to 1 (jlookup(), zlook(), zget(), and
23466 + *    cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by
23467 + *    jnode_rip_check() function), and pretend that nothing was found in hash
23468 + *    table if bit is set.
23469 + *
23470 + *    jput defers actual return of jnode into slab cache to some later time
23471 + *    (by call_rcu()), this guarantees that other threads can safely continue
23472 + *    working with JNODE_RIP-ped jnode.
23473 + *
23474 + */
23475 +
23476 +#include "reiser4.h"
23477 +#include "debug.h"
23478 +#include "dformat.h"
23479 +#include "plugin/plugin_header.h"
23480 +#include "plugin/plugin.h"
23481 +#include "txnmgr.h"
23482 +#include "jnode.h"
23483 +#include "znode.h"
23484 +#include "tree.h"
23485 +#include "tree_walk.h"
23486 +#include "super.h"
23487 +#include "inode.h"
23488 +#include "page_cache.h"
23489 +#include "prof.h"
23490 +
23491 +#include <asm/uaccess.h>        /* UML needs this for PAGE_OFFSET */
23492 +#include <linux/types.h>
23493 +#include <linux/slab.h>
23494 +#include <linux/pagemap.h>
23495 +#include <linux/vmalloc.h>      /* for vmalloc(), vfree() */
23496 +#include <linux/swap.h>
23497 +#include <linux/fs.h>          /* for struct address_space  */
23498 +#include <linux/writeback.h>   /* for inode_lock */
23499 +
23500 +static kmem_cache_t *_jnode_slab = NULL;
23501 +
23502 +static void jnode_set_type(jnode * node, jnode_type type);
23503 +
23504 +
23505 +/* true if JNODE_PARSED is set on @node (page_clear_jnode() clears it again) */
23506 +static inline int jnode_is_parsed (jnode * node)
23507 +{
23508 +       return JF_ISSET(node, JNODE_PARSED);
23509 +}
23510 +
23511 +/* hash table support */
23512 +
23513 +/* jnode keys are equal iff index and objectid both match. Used by hash-table macros */
23514 +static inline int
23515 +jnode_key_eq(const jnode_key_t * k1, const jnode_key_t * k2)
23516 +{
23517 +       assert("nikita-2350", k1 != NULL);
23518 +       assert("nikita-2351", k2 != NULL);
23519 +
23520 +       return !(k1->index != k2->index || k1->objectid != k2->objectid);
23521 +}
23522 +
23523 +/* Hash jnode by its key (objectid plus index). Used by hash-table macros */
23524 +static inline __u32
23525 +jnode_key_hashfn(j_hash_table *table, const jnode_key_t * key)
23526 +{
23527 +       assert("nikita-2352", key != NULL);
23528 +       assert("nikita-3346", IS_POW(table->_buckets));
23529 +       /* mask below acts as a modulo only when ->_buckets is a power of two */
23530 +       /* yes, this is a remarkably simple (if not stupid) hash function. */
23531 +       return (key->objectid + key->index) & (table->_buckets - 1);
23532 +}
23533 +
23534 +/* The hash table definition */
23535 +#define KMALLOC(size) vmalloc(size)
23536 +#define KFREE(ptr, size) vfree(ptr)
23537 +TYPE_SAFE_HASH_DEFINE(j, jnode, jnode_key_t, key.j, link.j, jnode_key_hashfn, jnode_key_eq);
23538 +#undef KFREE
23539 +#undef KMALLOC
23540 +
23541 +/* initialise jnode hash table of @tree; returns result of last j_hash_init() */
23542 +reiser4_internal int
23543 +jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ )
23544 +{
23545 +       int buckets;
23546 +       int result;
23547 +
23548 +       assert("nikita-2359", tree != NULL);
23549 +
23550 +       /*
23551 +        * number of hash buckets in hash table depends on amount of memory
23552 +        * available. If we cannot allocate that much, number of buckets is
23553 +        * halved until allocation succeeds or there is nothing left to halve.
23554 +        */
23555 +       buckets = 1 << fls(nr_free_pagecache_pages());
23556 +       do {
23557 +               result = j_hash_init(&tree->jhash_table, buckets,
23558 +                                    reiser4_stat(tree->super, hashes.jnode));
23559 +               buckets >>= 1;
23560 +       } while (result == -ENOMEM && buckets > 0);
23561 +       return result;
23562 +}
23563 +
23564 +/* destroy jnode hash table of @tree. This is called during umount. Always returns 0. */
23565 +reiser4_internal int
23566 +jnodes_tree_done(reiser4_tree * tree /* tree to destroy jnodes for */ )
23567 +{
23568 +       j_hash_table *jtable;
23569 +       jnode *node;
23570 +       jnode *next;
23571 +
23572 +       assert("nikita-2360", tree != NULL);
23573 +
23574 +       /*
23575 +        * Scan hash table and jdrop() all jnodes. A jnode that still has
23576 +        * non-zero ->x_count is reported before the assertion below fires.
23577 +        */
23578 +       IF_TRACE(TRACE_ZWEB, UNDER_RW_VOID(tree, tree, read,
23579 +                                          print_jnodes("umount", tree)));
23580 +
23581 +       jtable = &tree->jhash_table;
23582 +       for_all_in_htable(jtable, j, node, next) {
23583 +               if (atomic_read(&node->x_count))
23584 +                       info_jnode("x_count != 0", node);
23585 +               assert("nikita-2361", !atomic_read(&node->x_count));
23586 +               jdrop(node);
23587 +       }
23588 +
23589 +       j_hash_done(&tree->jhash_table);
23590 +       return 0;
23591 +}
23592 +
23593 +/*
23594 + * Initialize static variables in this file: create the slab cache that
23595 + * jalloc() allocates jnodes from.
23596 + *
23597 + * Returns 0 on success, or -ENOMEM if the cache cannot be created.
23598 + */
23599 +reiser4_internal int
23600 +jnode_init_static(void)
23601 +{
23602 +       assert("umka-168", _jnode_slab == NULL);
23603 +
23604 +       _jnode_slab = kmem_cache_create("jnode", sizeof (jnode), 0,
23605 +                                       SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
23606 +                                       NULL, NULL);
23607 +       /*
23608 +        * the slab is the only resource acquired here, so there is nothing
23609 +        * to unwind when its creation fails.
23610 +        */
23611 +       if (_jnode_slab == NULL)
23612 +               return RETERR(-ENOMEM);
23613 +       return 0;
23614 +}
23615 +
23616 +/* Undo jnode_init_static(): destroy the jnode slab, if there is one. */
23617 +reiser4_internal int
23618 +jnode_done_static(void)
23619 +{
23620 +       int status;
23621 +
23622 +       if (_jnode_slab == NULL)
23623 +               return 0;
23624 +
23625 +       status = kmem_cache_destroy(_jnode_slab);
23626 +       _jnode_slab = NULL;
23627 +       return status;
23628 +}
23629 +
23630 +/* Initialize @node as an unreferenced, uncaptured jnode of type @type in @tree. */
23631 +reiser4_internal void
23632 +jnode_init(jnode * node, reiser4_tree * tree, jnode_type type)
23633 +{
23634 +       assert("umka-175", node != NULL);
23635 +
23636 +       xmemset(node, 0, sizeof (jnode));
23637 +       ON_DEBUG(node->magic = JMAGIC);
23638 +       jnode_set_type(node, type);
23639 +       atomic_set(&node->d_count, 0);
23640 +       atomic_set(&node->x_count, 0);
23641 +       spin_jnode_init(node);
23642 +       spin_jload_init(node);
23643 +       node->atom = NULL;
23644 +       node->tree = tree;
23645 +       capture_list_clean(node);
23646 +
23647 +       ASSIGN_NODE_LIST(node, NOT_CAPTURED);
23648 +       /* ->rcu is what jnode_free() later passes to call_rcu() */
23649 +       INIT_RCU_HEAD(&node->rcu);
23650 +       /* debugging: put node on the per-super-block list of all jnodes */
23651 +#if REISER4_DEBUG
23652 +       {
23653 +               reiser4_super_info_data *sbinfo;
23654 +
23655 +               sbinfo = get_super_private(tree->super);
23656 +               spin_lock_irq(&sbinfo->all_guard);
23657 +               list_add(&node->jnodes, &sbinfo->all_jnodes);
23658 +               spin_unlock_irq(&sbinfo->all_guard);
23659 +               /* link with which jnode is attached to reiser4_inode */
23660 +               inode_jnodes_list_clean(node);
23661 +       }
23662 +#endif
23663 +}
23664 +
23665 +#if REISER4_DEBUG
23666 +/*
23667 + * Debugging only: take @node off the ->all_jnodes list of @tree's super block
23668 + * (dual to the list_add() done in jnode_init()).
23669 + */
23670 +void
23671 +jnode_done(jnode * node, reiser4_tree * tree)
23672 +{
23673 +       reiser4_super_info_data *sbi = get_super_private(tree->super);
23674 +
23675 +       /* ->all_jnodes is only touched under ->all_guard */
23676 +       spin_lock_irq(&sbi->all_guard);
23677 +       assert("nikita-2422", !list_empty(&node->jnodes));
23678 +       list_del_init(&node->jnodes);
23679 +       spin_unlock_irq(&sbi->all_guard);
23680 +}
23681 +#endif
23682 +
23683 +/* return jnode already attached to locked page @pg; caller guarantees there is one */
23684 +reiser4_internal jnode *
23685 +jnode_by_page(struct page *pg)
23686 +{
23687 +       assert("nikita-2066", pg != NULL);
23688 +       assert("nikita-2400", PageLocked(pg));
23689 +       assert("nikita-2068", PagePrivate(pg));
23690 +       assert("nikita-2067", jprivate(pg) != NULL);
23691 +       return jprivate(pg);
23692 +}
23693 +
23694 +/* allocate raw jnode from the slab (exported for use outside this file) */
23695 +reiser4_internal jnode *
23696 +jalloc(void)
23697 +{
23698 +       /* result is not initialized, and is NULL when allocation fails */
23699 +       return kmem_cache_alloc(_jnode_slab, GFP_KERNEL);
23700 +}
23701 +
23702 +/* return jnode back to the slab allocator */
23703 +reiser4_internal inline void
23704 +jfree(jnode * node)
23705 +{
23706 +       assert("zam-449", node != NULL);
23707 +       /* node must be fully detached: not captured, not eflushed, no page */
23708 +       assert("nikita-2663", capture_list_is_clean(node) && NODE_LIST(node) == NOT_CAPTURED);
23709 +       assert("nikita-2774", !JF_ISSET(node, JNODE_EFLUSH));
23710 +       assert("nikita-3222", list_empty(&node->jnodes));
23711 +       assert("nikita-3221", jnode_page(node) == NULL);
23712 +
23713 +       /* not yet phash_jnode_destroy(node); */
23714 +
23715 +       /* poison memory. */
23716 +       ON_DEBUG(xmemset(node, 0xad, sizeof *node));
23717 +       kmem_cache_free(_jnode_slab, node);
23718 +}
23719 +
23720 +/*
23721 + * This function is supplied as RCU callback by jnode_free(). It recovers jnode
23722 + * from its ->rcu head and releases it according to its type.
23723 + */
23724 +static void
23725 +jnode_free_actor(struct rcu_head *head)
23726 +{
23727 +       jnode * node;
23728 +       jnode_type jtype;
23729 +
23730 +       node = container_of(head, jnode, rcu);
23731 +       jtype = jnode_get_type(node);
23732 +       /* with debugging on, take node off ->all_jnodes list first */
23733 +       ON_DEBUG(jnode_done(node, jnode_get_tree(node)));
23734 +       /* formatted nodes are znodes and go to zfree(), the rest to jfree() */
23735 +       switch (jtype) {
23736 +       case JNODE_IO_HEAD:
23737 +       case JNODE_BITMAP:
23738 +       case JNODE_UNFORMATTED_BLOCK:
23739 +               jfree(node);
23740 +               break;
23741 +       case JNODE_FORMATTED_BLOCK:
23742 +               zfree(JZNODE(node));
23743 +               break;
23744 +       case JNODE_INODE:
23745 +       default:
23746 +               wrong_return_value("nikita-3197", "Wrong jnode type");
23747 +       }
23748 +}
23749 +
23750 +/*
23751 + * Dispose of @node: JNODE_INODE jnodes are only taken off their list, all
23752 + * other types are handed to RCU, which runs jnode_free_actor() later.
23753 + */
23754 +static inline void
23755 +jnode_free(jnode * node, jnode_type jtype)
23756 +{
23757 +       if (jtype == JNODE_INODE) {
23758 +               jnode_list_remove(node);
23759 +               return;
23760 +       }
23761 +       call_rcu(&node->rcu, jnode_free_actor);
23762 +}
23763 +
23764 +/* allocate unformatted jnode that is not yet bound to any mapping/index */
23765 +static jnode *
23766 +jnew_unformatted(void)
23767 +{
23768 +       jnode *fresh = jalloc();
23769 +
23770 +       if (fresh != NULL) {
23771 +               jnode_init(fresh, current_tree, JNODE_UNFORMATTED_BLOCK);
23772 +               /* "unhashed" key, as asserted by hash_unformatted_jnode() */
23773 +               fresh->key.j.mapping = 0;
23774 +               fresh->key.j.index = (unsigned long)-1;
23775 +               fresh->key.j.objectid = 0;
23776 +       }
23777 +
23778 +       return fresh;
23779 +}
23780 +
23781 +/* look up jnode by (@objectid, @index) in hash table of @tree; result is referenced or NULL */
23782 +reiser4_internal jnode *
23783 +jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
23784 +{
23785 +       jnode_key_t jkey;
23786 +       jnode *node;
23787 +
23788 +       assert("nikita-2353", tree != NULL);
23789 +
23790 +       jkey.objectid = objectid;
23791 +       jkey.index = index;
23792 +
23793 +       /*
23794 +        * hash table is _not_ protected by any lock during lookups. All we
23795 +        * have to do is to disable preemption to keep RCU happy.
23796 +        */
23797 +
23798 +       rcu_read_lock();
23799 +       node = j_hash_find(&tree->jhash_table, &jkey);
23800 +       if (node != NULL) {
23801 +               /* protect @node from recycling */
23802 +               jref(node);
23803 +               assert("nikita-2955", jnode_invariant(node, 0, 0));
23804 +               node = jnode_rip_check(tree, node);
23805 +       }
23806 +       rcu_read_unlock();
23807 +       return node;
23808 +}
23809 +
23810 +/* look up @index in @mapping's inode radix tree of jnodes; callers in this file hold tree lock */
23811 +static jnode *
23812 +jfind_nolock(struct address_space *mapping, unsigned long index)
23813 +{
23814 +       assert("vs-1694", mapping->host != NULL);
23815 +
23816 +       return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
23817 +}
23818 +
23819 +reiser4_internal jnode *
23820 +jfind(struct address_space *mapping, unsigned long index)
23821 +{
23822 +       reiser4_tree *tree;
23823 +       jnode *found;
23824 +
23825 +       /* jfind_nolock() under tree read lock; result (if any) is jref()-ed */
23826 +       assert("vs-1694", mapping->host != NULL);
23827 +       tree = tree_by_inode(mapping->host);
23828 +       RLOCK_TREE(tree);
23829 +       found = jfind_nolock(mapping, index);
23830 +       if (found != NULL)
23831 +               jref(found);
23832 +       RUNLOCK_TREE(tree);
23833 +       return found;
23834 +}
23835 +
23836 +static void inode_attach_jnode(jnode * node)
23837 +{
23838 +       struct inode * inode;
23839 +       reiser4_inode * info;
23840 +       struct radix_tree_root * rtree;
23841 +
23842 +       assert ("zam-1043", node->key.j.mapping != NULL);
23843 +       inode = node->key.j.mapping->host;
23844 +       info = reiser4_inode_data(inode);
23845 +       rtree = jnode_tree_by_reiser4_inode(info);
23846 +       /* insert @node into inode's radix tree of jnodes, flag inode I_JNODES */
23847 +       spin_lock(&inode_lock);
23848 +       assert("zam-1049", equi(rtree->rnode !=NULL, info->nr_jnodes != 0));
23849 +       check_me("zam-1045", !radix_tree_insert(rtree, node->key.j.index, node));
23850 +       ON_DEBUG(info->nr_jnodes ++);
23851 +       inode->i_state |= I_JNODES;
23852 +       spin_unlock(&inode_lock);
23853 +}
23854 +
23855 +static void inode_detach_jnode(jnode * node)
23856 +{
23857 +       struct inode * inode;
23858 +       reiser4_inode * info;
23859 +       struct radix_tree_root * rtree;
23860 +
23861 +       assert ("zam-1044", node->key.j.mapping != NULL);
23862 +       inode = node->key.j.mapping->host;
23863 +       info = reiser4_inode_data(inode);
23864 +       rtree = jnode_tree_by_reiser4_inode(info);
23865 +       /* remove @node from inode's radix tree; drop I_JNODES once tree is empty */
23866 +       spin_lock(&inode_lock);
23867 +       assert("zam-1051", info->nr_jnodes != 0);
23868 +       assert("zam-1052", rtree->rnode != NULL);
23869 +       ON_DEBUG(info->nr_jnodes --);
23870 +       check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
23871 +       if (rtree->rnode == NULL) {
23872 +               inode->i_state &= ~I_JNODES;
23873 +       }
23874 +       spin_unlock(&inode_lock);
23875 +}
23876 +
23877 +/* put jnode into hash table (where it can be found by flush, which does not
23878 +   know mapping) and into inode's tree of jnodes (where it can be found,
23879 +   hopefully faster, in places where mapping is known). In this file it is
23880 +   called by find_get_jnode() for a freshly allocated jnode, with tree write
23881 +   lock held. */
23882 +static void
23883 +hash_unformatted_jnode(jnode *node, struct address_space *mapping, unsigned long index)
23884 +{
23885 +       j_hash_table *jtable;
23886 +
23887 +       assert("vs-1446", jnode_is_unformatted(node));
23888 +       assert("vs-1442", node->key.j.mapping == 0);
23889 +       assert("vs-1443", node->key.j.objectid == 0);
23890 +       assert("vs-1444", node->key.j.index == (unsigned long)-1);
23891 +       assert("nikita-3439", rw_tree_is_write_locked(jnode_get_tree(node)));
23892 +
23893 +       node->key.j.mapping  = mapping;
23894 +       node->key.j.objectid = get_inode_oid(mapping->host);
23895 +       node->key.j.index    = index;
23896 +
23897 +       jtable = &jnode_get_tree(node)->jhash_table;
23898 +
23899 +       /* race with some other thread inserting jnode into the hash table is
23900 +        * impossible, because we keep the page lock. */
23901 +       /*
23902 +        * following assertion no longer holds because of RCU: it is possible
23903 +        * jnode is in the hash table, but with JNODE_RIP bit set.
23904 +        */
23905 +       /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
23906 +       j_hash_insert_rcu(jtable, node);
23907 +       inode_attach_jnode(node);
23908 +}
23909 +
23910 +static void
23911 +unhash_unformatted_node_nolock(jnode *node)
23912 +{
23913 +       assert("vs-1683", node->key.j.mapping != NULL);
23914 +       assert("vs-1684", node->key.j.objectid == get_inode_oid(node->key.j.mapping->host));
23915 +       /* caller (unhash_unformatted_jnode()) holds tree write lock */
23916 +       /* remove jnode from hash-table and from inode's radix tree of jnodes */
23917 +       j_hash_remove_rcu(&node->tree->jhash_table, node);
23918 +       inode_detach_jnode(node);
23919 +       node->key.j.mapping = 0;
23920 +       node->key.j.index = (unsigned long)-1;
23921 +       node->key.j.objectid = 0;
23922 +       /* key is now back in the state jnew_unformatted() leaves it in */
23923 +}
23924 +
23925 +/* locked wrapper around unhash_unformatted_node_nolock(): removal is done
23926 +   under tree write lock. This is used in reiser4_invalidatepage and in
23927 +   kill_hook_extent -> truncate_inode_jnodes -> uncapture_jnode */
23928 +reiser4_internal void
23929 +unhash_unformatted_jnode(jnode *node)
23930 +{
23931 +       reiser4_tree *tree = node->tree;
23932 +
23933 +       assert("vs-1445", jnode_is_unformatted(node));
23934 +       WLOCK_TREE(tree);
23935 +       unhash_unformatted_node_nolock(node);
23936 +       WUNLOCK_TREE(tree);
23937 +}
23938 +
23939 +/*
23940 + * look up jnode with given @index in the radix tree of jnodes of @mapping's
23941 + * inode. If not found, insert a freshly allocated jnode into the hash table
23942 + * and into that radix tree. Returns referenced jnode, or ERR_PTR() on failure.
23943 + */
23944 +reiser4_internal jnode *
23945 +find_get_jnode(reiser4_tree * tree, struct address_space *mapping, oid_t oid,
23946 +              unsigned long index)
23947 +{
23948 +       jnode *result;
23949 +       jnode *shadow;
23950 +       int preload;
23951 +
23952 +       result = jnew_unformatted();
23953 +       if (unlikely(result == NULL))
23954 +               return ERR_PTR(RETERR(-ENOMEM));
23955 +       preload = radix_tree_preload(GFP_KERNEL);
23956 +       if (preload != 0) {
23957 +               /* do not leak the jnode allocated above */
23958 +               jnode_free(result, JNODE_UNFORMATTED_BLOCK);
23959 +               return ERR_PTR(preload);
23960 +       }
23961 +       WLOCK_TREE(tree);
23962 +       shadow = jfind_nolock(mapping, index);
23963 +       if (likely(shadow == NULL)) {
23964 +               /* add new jnode to hash table and inode's radix tree of jnodes */
23965 +               jref(result);
23966 +               hash_unformatted_jnode(result, mapping, index);
23967 +       } else {
23968 +               /* jnode is found in inode's radix tree of jnodes */
23969 +               jref(shadow);
23970 +               jnode_free(result, JNODE_UNFORMATTED_BLOCK);
23971 +               assert("vs-1498", shadow->key.j.mapping == mapping);
23972 +               result = shadow;
23973 +       }
23974 +       WUNLOCK_TREE(tree);
23975 +
23976 +       assert("nikita-2955", ergo(result != NULL, jnode_invariant(result, 0, 0)));
23977 +       radix_tree_preload_end();
23978 +       return result;
23979 +}
23980 +
23981 +
23982 +/* jget() (a la zget() but for unformatted nodes). Returns (and possibly
23983 +   creates) jnode corresponding to locked page @pg, referenced and attached
23984 +   to the page, or ERR_PTR() if find_get_jnode() fails. */
23985 +static jnode *
23986 +do_jget(reiser4_tree * tree, struct page * pg)
23987 +{
23988 +       /*
23989 +        * There are two ways to create jnode: starting with pre-existing page
23990 +        * and without page.
23991 +        *
23992 +        * When page already exists, jnode is created
23993 +        * (jnode_of_page()->do_jget()) under page lock. This is done in
23994 +        * ->writepage(), or when capturing anonymous page dirtied through
23995 +        * mmap.
23996 +        *
23997 +        * Jnode without page is created by index_extent_jnode().
23998 +        *
23999 +        */
24000 +
24001 +       jnode *result;
24002 +       oid_t oid;
24003 +
24004 +       assert("umka-176", pg != NULL);
24005 +       assert("nikita-2394", PageLocked(pg));
24006 +
24007 +       result = jprivate(pg);
24008 +       if (likely(result != NULL))
24009 +               return jref(result);
24010 +
24011 +       tree = tree_by_page(pg);
24012 +       /* check inode's radix tree of jnodes first */
24013 +       result = jfind(pg->mapping, pg->index);
24014 +       if (unlikely(result != NULL)) {
24015 +               UNDER_SPIN_VOID(jnode, result, jnode_attach_page(result, pg));
24016 +               result->key.j.mapping = pg->mapping;
24017 +               return result;
24018 +       }
24019 +       /* oid is only needed here; @pg is dereferenced after it was checked */
24020 +       oid = get_inode_oid(pg->mapping->host);
24021 +       result = find_get_jnode(tree, pg->mapping, oid, pg->index);
24022 +       if (unlikely(IS_ERR(result)))
24023 +               return result;
24024 +       /* attach jnode to page */
24025 +       UNDER_SPIN_VOID(jnode, result, jnode_attach_page(result, pg));
24026 +       return result;
24027 +}
24028 +
24029 +/*
24030 + * return jnode for locked page @pg, creating it if necessary, or ERR_PTR().
24031 + */
24032 +reiser4_internal jnode *
24033 +jnode_of_page(struct page * pg)
24034 +{
24035 +       jnode * result;
24036 +
24037 +       assert("umka-176", pg != NULL);
24038 +       assert("nikita-2394", PageLocked(pg));
24039 +
24040 +       result = do_jget(tree_by_page(pg), pg);
24041 +       /* debugging cross-checks: page and jnode must point to each other */
24042 +       if (REISER4_DEBUG && !IS_ERR(result)) {
24043 +               assert("nikita-3210", result == jprivate(pg));
24044 +               assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
24045 +               if (jnode_is_unformatted(jprivate(pg))) {
24046 +                       assert("nikita-2364", jprivate(pg)->key.j.index == pg->index);
24047 +                       assert("nikita-2367",
24048 +                              jprivate(pg)->key.j.mapping == pg->mapping);
24049 +                       assert("nikita-2365",
24050 +                              jprivate(pg)->key.j.objectid == get_inode_oid(pg->mapping->host));
24051 +                       assert("vs-1200",
24052 +                              jprivate(pg)->key.j.objectid == pg->mapping->host->i_ino);
24053 +                       assert("nikita-2356", jnode_is_unformatted(jnode_by_page(pg)));
24054 +               }
24055 +               assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
24056 +       }
24057 +       return result;
24058 +}
24059 +
24060 +/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the
24061 + * page.*/
24062 +reiser4_internal void
24063 +jnode_attach_page(jnode * node, struct page *pg)
24064 +{
24065 +       assert("nikita-2060", node != NULL);
24066 +       assert("nikita-2061", pg != NULL);
24067 +       /* page must not have anything attached yet */
24068 +       assert("nikita-2050", pg->private == 0ul);
24069 +       assert("nikita-2393", !PagePrivate(pg));
24070 +       /* caller holds both page lock and jnode spin lock */
24071 +       assert("nikita-2396", PageLocked(pg));
24072 +       assert("nikita-2397", spin_jnode_is_locked(node));
24073 +       /* attached jnode holds its own reference on the page */
24074 +       page_cache_get(pg);
24075 +       pg->private = (unsigned long) node;
24076 +       node->pg = pg;
24077 +       SetPagePrivate(pg);
24078 +}
24079 +
24080 +/* Dual to jnode_attach_page: break the binding between page and jnode, clear JNODE_PARSED and drop the page reference. Asserted: page locked and not under writeback, jnode spin-locked. */
24081 +reiser4_internal void
24082 +page_clear_jnode(struct page *page, jnode * node)
24083 +{
24084 +       assert("nikita-2424", page != NULL);
24085 +       assert("nikita-2425", PageLocked(page));
24086 +       assert("nikita-2426", node != NULL);
24087 +       assert("nikita-2427", spin_jnode_is_locked(node));
24088 +       assert("nikita-2428", PagePrivate(page));
24089 +
24090 +       assert("nikita-3551", !PageWriteback(page));
24091 +
24092 +       JF_CLR(node, JNODE_PARSED);
24093 +       page->private = 0ul;
24094 +       ClearPagePrivate(page);
24095 +       node->pg = NULL;
24096 +       page_cache_release(page);
24097 +       if (REISER4_DEBUG_MODIFY && jnode_is_znode(node))
24098 +               ON_DEBUG_MODIFY(JZNODE(node)->cksum = 0);
24099 +}
24100 +
24101 +/* lock @page and, if it still belongs to @mapping at @index and has private data, detach the jnode found there via page_clear_jnode(). It is only used in one place to handle error */
24102 +reiser4_internal void
24103 +page_detach_jnode(struct page *page, struct address_space *mapping, unsigned long index)
24104 +{
24105 +       assert("nikita-2395", page != NULL);
24106 +
24107 +       lock_page(page);
24108 +       if ((page->mapping == mapping) && (page->index == index) && PagePrivate(page)) {
24109 +               jnode *node;
24110 +
24111 +               node = jprivate(page);
24112 +               assert("nikita-2399", spin_jnode_is_not_locked(node));
24113 +               UNDER_SPIN_VOID(jnode, node, page_clear_jnode(page, node));
24114 +       }
24115 +       unlock_page(page);
24116 +}
24117 +
24118 +/* return @node page locked, or NULL if @node has no page. In both cases the spin lock on @node is held on return (the loop is only left after LOCK_JNODE()).
24119 +
24120 +   Locking ordering requires that one first takes page lock and afterwards
24121 +   spin lock on node attached to this page. Sometimes it is necessary to go in
24122 +   the opposite direction. This is done through standard trylock-and-release
24123 +   loop.
24124 +*/
24125 +reiser4_internal struct page *
24126 +jnode_lock_page(jnode * node)
24127 +{
24128 +       struct page *page;
24129 +
24130 +       assert("nikita-2052", node != NULL);
24131 +       assert("nikita-2401", spin_jnode_is_not_locked(node));
24132 +
24133 +       while (1) {
24134 +
24135 +               LOCK_JNODE(node);
24136 +               page = jnode_page(node);
24137 +               if (page == NULL) {
24138 +                       break;
24139 +               }
24140 +
24141 +               /* no need to page_cache_get( page ) here, because page cannot
24142 +                  be evicted from memory without detaching it from jnode and
24143 +                  this requires spin lock on jnode that we already hold.
24144 +               */
24145 +               if (!TestSetPageLocked(page)) {
24146 +                       /* We won a lock on jnode page, proceed. */
24147 +                       break;
24148 +               }
24149 +
24150 +               /* Page is locked by someone else: pin it, drop the spin lock and wait for the page lock to be released. */
24151 +               page_cache_get(page);
24152 +               UNLOCK_JNODE(node);
24153 +               wait_on_page_locked(page);
24154 +               /* it is possible that page was detached from jnode and
24155 +                  returned to the free pool, or re-assigned while we were
24156 +                  waiting on locked bit. This will be rechecked on the next
24157 +                  loop iteration.
24158 +               */
24159 +               page_cache_release(page);
24160 +
24161 +               /* try again */
24162 +       }
24163 +       return page;
24164 +}
24165 +
24166 +/*
24167 + * if JNODE_PARSED bit is not set, call ->parse() method of jnode (under jnode spin lock), to verify
24168 + * validity of jnode content, and set the bit on success. Returns 0 or the error returned by ->parse().
24169 + */
24170 +static inline int
24171 +jparse(jnode * node)
24172 +{
24173 +       int result;
24174 +
24175 +       assert("nikita-2466", node != NULL);
24176 +
24177 +       LOCK_JNODE(node);
24178 +       if (likely(!jnode_is_parsed(node))) {
24179 +               result = jnode_ops(node)->parse(node);
24180 +               if (likely(result == 0))
24181 +                       JF_SET(node, JNODE_PARSED);
24182 +       } else
24183 +               result = 0;
24184 +       UNLOCK_JNODE(node);
24185 +       return result;
24186 +}
24187 +
24188 +/* Lock a page attached to jnode, create and attach page to jnode if it had
24189 + * none. Returns the locked page, or ERR_PTR(-ENOMEM) if find_or_create_page() failed. */
24190 +reiser4_internal struct page *
24191 +jnode_get_page_locked(jnode * node, int gfp_flags)
24192 +{
24193 +       struct page * page;
24194 +
24195 +       LOCK_JNODE(node);
24196 +       page = jnode_page(node);
24197 +
24198 +       if (page == NULL) {
24199 +               UNLOCK_JNODE(node);
24200 +               page = find_or_create_page(jnode_get_mapping(node),
24201 +                                          jnode_get_index(node), gfp_flags);
24202 +               if (page == NULL)
24203 +                       return ERR_PTR(RETERR(-ENOMEM));
24204 +       } else {
24205 +               if (!TestSetPageLocked(page)) {
24206 +                       UNLOCK_JNODE(node);
24207 +                       return page;
24208 +               }
24209 +               page_cache_get(page);
24210 +               UNLOCK_JNODE(node);
24211 +               lock_page(page);
24212 +               assert("nikita-3134", page->mapping == jnode_get_mapping(node));
24213 +       }
24214 +       /* page is locked here; attach it if @node has no page at this point */
24215 +       LOCK_JNODE(node);
24216 +       if (!jnode_page(node))
24217 +               jnode_attach_page(node, page);
24218 +       UNLOCK_JNODE(node);
24219 +       /* presumably balances the reference obtained above via find_or_create_page() or page_cache_get() */
24220 +       page_cache_release(page);
24221 +       assert ("zam-894", jnode_page(node) == page);
24222 +       return page;
24223 +}
24224 +
24225 +/* Start read operation for jnode's page if page is not up-to-date. @page must be locked (asserted); if it is already up-to-date it is unlocked and 0 is returned, otherwise the result of page_io() is returned. */
24226 +static int jnode_start_read (jnode * node, struct page * page)
24227 +{
24228 +       assert ("zam-893", PageLocked(page));
24229 +
24230 +       if (PageUptodate(page)) {
24231 +               unlock_page(page);
24232 +               return 0;
24233 +       }
24234 +       return page_io(page, node, READ, GFP_KERNEL);
24235 +}
24236 +/* debugging check used by jload_gfp(): for znodes, verify znode invariant and (if znode is locked) item count against the node40 header in @page. Expands to noop unless REISER4_DEBUG. */
24237 +#if REISER4_DEBUG
24238 +static void check_jload(jnode * node, struct page * page)
24239 +{
24240 +       if (jnode_is_znode(node)) {
24241 +               node40_header *nh;
24242 +               znode *z;
24243 +
24244 +               z = JZNODE(node);
24245 +               if (znode_is_any_locked(z)) {
24246 +                       nh = (node40_header *)kmap(page);
24247 +                       /* this only works for node40-only file systems. For
24248 +                        * debugging. */
24249 +                       assert("nikita-3253",
24250 +                              z->nr_items == d16tocpu(&nh->nr_items));
24251 +                       kunmap(page);
24252 +               }
24253 +               assert("nikita-3565", znode_invariant(z));
24254 +       }
24255 +}
24256 +#else
24257 +#define check_jload(node, page) noop
24258 +#endif
24259 +
24260 +/* prefetch jnode to speed up next call to jload. Call this when you are going
24261 + * to call jload() shortly. This issues prefetchw() on the ->x_count field, to bring
24262 + * that portion of jnode into CPU cache. */
24263 +reiser4_internal void jload_prefetch(const jnode * node)
24264 +{
24265 +       prefetchw(&node->x_count);
24266 +}
24267 +
24268 +/* load jnode's data into memory: take x- and d-references, and if node is not yet parsed, get its page, read it and parse it. Returns 0 on success; on failure both references are dropped through jrelse_tail() and an error is returned. */
24269 +reiser4_internal int
24270 +jload_gfp (jnode * node /* node to load */,
24271 +          int gfp_flags /* allocation flags*/,
24272 +          int do_kmap /* true if page should be kmapped */)
24273 +{
24274 +       struct page * page;
24275 +       int result = 0;
24276 +       int parsed;
24277 +
24278 +       assert("nikita-3010", schedulable());
24279 +       write_node_log(node);
24280 +
24281 +       prefetchw(&node->pg);
24282 +
24283 +       /* taking d-reference implies taking x-reference. */
24284 +       jref(node);
24285 +
24286 +       /*
24287 +        * acquiring d-reference to @jnode and check for JNODE_PARSED bit
24288 +        * should be atomic, otherwise there is a race against
24289 +        * reiser4_releasepage().
24290 +        */
24291 +       LOCK_JLOAD(node);
24292 +       add_d_ref(node);
24293 +       parsed = jnode_is_parsed(node);
24294 +       UNLOCK_JLOAD(node);
24295 +
24296 +       if (unlikely(!parsed)) {
24297 +               ON_TRACE(TRACE_PCACHE, "read node: %p\n", node);
24298 +
24299 +               page = jnode_get_page_locked(node, gfp_flags);
24300 +               if (unlikely(IS_ERR(page))) {
24301 +                       result = PTR_ERR(page);
24302 +                       goto failed;
24303 +               }
24304 +
24305 +               result = jnode_start_read(node, page);
24306 +               if (unlikely(result != 0))
24307 +                       goto failed;
24308 +
24309 +               wait_on_page_locked(page);
24310 +               if (unlikely(!PageUptodate(page))) {
24311 +                       result = RETERR(-EIO);
24312 +                       goto failed;
24313 +               }
24314 +
24315 +               if (do_kmap)
24316 +                       node->data = kmap(page);
24317 +
24318 +               result = jparse(node);
24319 +               if (unlikely(result != 0)) {
24320 +                       if (do_kmap)
24321 +                               kunmap(page);
24322 +                       goto failed;
24323 +               }
24324 +               check_jload(node, page);
24325 +       } else {
24326 +               page = jnode_page(node);
24327 +               check_jload(node, page);
24328 +               if (do_kmap)
24329 +                       node->data = kmap(page);
24330 +               reiser4_stat_inc_at_level(jnode_get_level(node),
24331 +                                         jnode.jload_already);
24332 +       }
24333 +
24334 +       if (unlikely(JF_ISSET(node, JNODE_EFLUSH)))
24335 +               UNDER_SPIN_VOID(jnode, node, eflush_del(node, 0));
24336 +
24337 +       if (!is_writeout_mode())
24338 +               /* We do not mark pages active if jload is called as a part of
24339 +                * jnode_flush() or reiser4_write_logs().  Both jnode_flush()
24340 +                * and write_logs() add no value to cached data, there is no
24341 +                * sense to mark pages as active when they go to disk, it just
24342 +                * confuses vm scanning routines because clean page could be
24343 +                * moved out from inactive list as a result of this
24344 +                * mark_page_accessed() call. */
24345 +               mark_page_accessed(page);
24346 +
24347 +       return 0;
24348 +       /* every failure path gets here with no kmap outstanding, hence jrelse_tail() rather than jrelse() */
24349 + failed:
24350 +       jrelse_tail(node);
24351 +       return result;
24352 +
24353 +}
24354 +
24355 +/* start asynchronous reading for given jnode's page. Returns the error from jnode_get_page_locked(), or else the result of jnode_start_read(). */
24356 +reiser4_internal int jstartio (jnode * node)
24357 +{
24358 +       struct page * page;
24359 +
24360 +       page = jnode_get_page_locked(node, GFP_KERNEL);
24361 +       if (IS_ERR(page))
24362 +               return PTR_ERR(page);
24363 +
24364 +       return jnode_start_read(node, page);
24365 +}
24366 +
24367 +
24368 +/* Initialize a node by calling appropriate plugin (its ->init() method) instead of reading
24369 + * node from disk as in jload(). Returns 0 with x- and d-references taken and the page kmapped into node->data, or an error with both references dropped. */
24370 +reiser4_internal int jinit_new (jnode * node, int gfp_flags)
24371 +{
24372 +       struct page * page;
24373 +       int result;
24374 +
24375 +       jref(node);
24376 +       add_d_ref(node);
24377 +
24378 +       page = jnode_get_page_locked(node, gfp_flags);
24379 +       if (IS_ERR(page)) {
24380 +               result = PTR_ERR(page);
24381 +               goto failed;
24382 +       }
24383 +
24384 +       SetPageUptodate(page);
24385 +       unlock_page(page);
24386 +
24387 +       node->data = kmap(page);
24388 +
24389 +       if (!jnode_is_parsed(node)) {
24390 +               jnode_plugin * jplug = jnode_ops(node);
24391 +               result = UNDER_SPIN(jnode, node, jplug->init(node));
24392 +               if (result) {
24393 +                       kunmap(page);
24394 +                       goto failed;
24395 +               }
24396 +               JF_SET(node, JNODE_PARSED);
24397 +       }
24398 +
24399 +       return 0;
24400 +       /* no kmap is outstanding on either failure path (never mapped, or already kunmap()ed above), so only drop the references: jrelse() would kunmap() the page once more */
24401 + failed:
24402 +       jrelse_tail(node);
24403 +       return result;
24404 +}
24405 +
24406 +/* release a reference to jnode acquired by jload(), decrement ->d_count. Unlike jrelse(), this does not kunmap() the page. */
24407 +reiser4_internal void
24408 +jrelse_tail(jnode * node /* jnode to release references to */)
24409 +{
24410 +       assert("nikita-489", atomic_read(&node->d_count) > 0);
24411 +       atomic_dec(&node->d_count);
24412 +       /* release reference acquired in jload_gfp() or jinit_new() */
24413 +       jput(node);
24414 +       LOCK_CNT_DEC(d_refs);
24415 +}
24416 +
24417 +/* drop reference to node data: kunmap() the page, if @node has one, and then call jrelse_tail(). When last reference is dropped, data are
24418 +   unloaded. */
24419 +reiser4_internal void
24420 +jrelse(jnode * node /* jnode to release references to */)
24421 +{
24422 +       struct page *page;
24423 +
24424 +       assert("nikita-487", node != NULL);
24425 +       assert("nikita-1906", spin_jnode_is_not_locked(node));
24426 +
24427 +       ON_TRACE(TRACE_PCACHE, "release node: %p\n", node);
24428 +
24429 +       page = jnode_page(node);
24430 +       if (likely(page != NULL)) {
24431 +               /*
24432 +                * it is safe not to lock jnode here, because at this point
24433 +                * @node->d_count is greater than zero (if jrelse() is used
24434 +                * correctly, that is). JNODE_PARSED may be not set yet, if,
24435 +                * for example, we got here as a result of error handling path
24436 +                * in jload(). Anyway, page cannot be detached by
24437 +                * reiser4_releasepage(). truncate will invalidate page
24438 +                * regardless, but this should not be a problem.
24439 +                */
24440 +               kunmap(page);
24441 +       }
24442 +       jrelse_tail(node);
24443 +}
24444 +
24445 +/* called from jput_final() to wait for io completion: if @node has a page, wait for writeback on it, holding a page reference while the jnode spin lock is dropped */
24446 +static void jnode_finish_io(jnode * node)
24447 +{
24448 +       struct page *page;
24449 +
24450 +       assert("nikita-2922", node != NULL);
24451 +
24452 +       LOCK_JNODE(node);
24453 +       page = jnode_page(node);
24454 +       if (page != NULL) {
24455 +               page_cache_get(page);
24456 +               UNLOCK_JNODE(node);
24457 +               wait_on_page_writeback(page);
24458 +               page_cache_release(page);
24459 +       } else
24460 +               UNLOCK_JNODE(node);
24461 +}
24462 +
24463 +/*
24464 + * This is called by jput() when last reference to jnode is released. This is
24465 + * a separate function, because we want fast path of jput() to be inline and,
24466 + * therefore, small. NOTE(review): every path below calls rcu_read_unlock(), so the caller presumably holds rcu_read_lock() -- confirm against jput().
24467 + */
24468 +reiser4_internal void
24469 +jput_final(jnode * node)
24470 +{
24471 +       int r_i_p;
24472 +
24473 +       /* A fast check for keeping node in cache. We always keep node in cache
24474 +        * if its page is present and node was not marked for deletion */
24475 +       if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
24476 +               rcu_read_unlock();
24477 +               return;
24478 +       }
24479 +
24480 +       r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
24481 +       /*
24482 +        * if r_i_p is true, we were first to set JNODE_RIP on this node. In
24483 +        * this case it is safe to access node after unlock.
24484 +        */
24485 +       rcu_read_unlock();
24486 +       if (r_i_p) {
24487 +               jnode_finish_io(node);
24488 +               if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
24489 +                       /* node is removed from the tree. */
24490 +                       jdelete(node);
24491 +               else
24492 +                       jnode_try_drop(node);
24493 +       }
24494 +       /* if !r_i_p some other thread is already killing it */
24495 +}
24496 +
24497 +reiser4_internal int /* 0, or -EIO if the page has its error bit set after waiting */
24498 +jwait_io(jnode * node /* jnode whose page to wait on; must have a page */, int rw /* READ or WRITE */)
24499 +{
24500 +       struct page *page;
24501 +       int result;
24502 +
24503 +       assert("zam-447", node != NULL);
24504 +       assert("zam-448", jnode_page(node) != NULL);
24505 +
24506 +       page = jnode_page(node);
24507 +
24508 +       result = 0;
24509 +       if (rw == READ) {
24510 +               wait_on_page_locked(page); /* read: wait until the page gets unlocked */
24511 +       } else {
24512 +               assert("nikita-2227", rw == WRITE);
24513 +               wait_on_page_writeback(page); /* write: wait until writeback on the page ends */
24514 +       }
24515 +       if (PageError(page))
24516 +               result = RETERR(-EIO);
24517 +
24518 +       return result;
24519 +}
24520 +
24521 +/*
24522 + * jnode types and plugins.
24523 + *
24524 + * jnode by itself is a "base type". There are several different jnode
24525 + * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
24526 + * has to do different things based on jnode type. In the standard reiser4 way
24527 + * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
24528 + *
24529 + * Functions below deal with jnode types and define methods of jnode plugin.
24530 + *
24531 + */
24532 +
24533 +/* set jnode type. This is done during jnode initialization: node->state is asserted to be 0, and the per-type bit pattern is ORed in starting at bit JNODE_TYPE_1. */
24534 +static void
24535 +jnode_set_type(jnode * node, jnode_type type)
24536 +{
24537 +       static const unsigned long type_to_mask[] = { /* read-only lookup table, hence const */
24538 +               [JNODE_UNFORMATTED_BLOCK] = 1,
24539 +               [JNODE_FORMATTED_BLOCK] = 0,
24540 +               [JNODE_BITMAP] = 2,
24541 +               [JNODE_IO_HEAD] = 6,
24542 +               [JNODE_INODE] = 4
24543 +       };
24544 +
24545 +       assert("zam-647", type < LAST_JNODE_TYPE);
24546 +       assert("nikita-2815", !jnode_is_loaded(node));
24547 +       assert("nikita-3386", node->state == 0);
24548 +
24549 +       node->state |= (type_to_mask[type] << JNODE_TYPE_1);
24550 +}
24551 +
24552 +/* ->init() method of jnode plugin for jnodes that don't require plugin
24553 + * specific initialization. Does nothing and always returns 0. */
24554 +static int
24555 +init_noinit(jnode * node UNUSED_ARG)
24556 +{
24557 +       return 0;
24558 +}
24559 +
24560 +/* ->parse() method of jnode plugin for jnodes that don't require plugin
24561 + * specific parsing. Does nothing and always returns 0. */
24562 +static int
24563 +parse_noparse(jnode * node UNUSED_ARG)
24564 +{
24565 +       return 0;
24566 +}
24567 +
24568 +/* ->mapping() method for unformatted jnode: returns node->key.j.mapping */
24569 +reiser4_internal struct address_space *
24570 +mapping_jnode(const jnode * node)
24571 +{
24572 +       struct address_space *map;
24573 +
24574 +       assert("nikita-2713", node != NULL);
24575 +
24576 +       /* mapping is stored in jnode */
24577 +
24578 +       map = node->key.j.mapping;
24579 +       assert("nikita-2714", map != NULL);
24580 +       assert("nikita-2897", is_reiser4_inode(map->host));
24581 +       assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
24582 +       assert("vs-1447", !JF_ISSET(node, JNODE_CC)); /* NOTE(review): label "vs-1447" is also used in index_jnode() and mapping_znode() */
24583 +       return map;
24584 +}
24585 +
24586 +/* ->index() method for unformatted jnodes: returns node->key.j.index */
24587 +reiser4_internal unsigned long
24588 +index_jnode(const jnode * node)
24589 +{
24590 +       assert("vs-1447", !JF_ISSET(node, JNODE_CC)); /* NOTE(review): label "vs-1447" is also used in mapping_jnode() and mapping_znode() */
24591 +       /* index is stored in jnode */
24592 +       return node->key.j.index;
24593 +}
24594 +
24595 +/* ->remove() method for unformatted jnodes (called from jnode_remove()). @tree is not used here. */
24596 +static inline void
24597 +remove_jnode(jnode * node, reiser4_tree * tree)
24598 +{
24599 +       /* remove jnode from hash table and radix tree; nothing to do if jnode has no mapping */
24600 +       if (node->key.j.mapping)
24601 +               unhash_unformatted_node_nolock(node);
24602 +}
24603 +
24604 +/* ->mapping() method for znodes: returns ->i_mapping of the inode obtained by get_super_fake() for the super block of @node's tree */
24605 +static struct address_space *
24606 +mapping_znode(const jnode * node)
24607 +{
24608 +       assert("vs-1447", !JF_ISSET(node, JNODE_CC)); /* NOTE(review): label "vs-1447" is also used in mapping_jnode() and index_jnode() */
24609 +       /* all znodes belong to fake inode */
24610 +       return get_super_fake(jnode_get_tree(node)->super)->i_mapping;
24611 +}
24612 +
24613 +extern int znode_shift_order;
24614 +/* ->index() method for znodes: index is derived from the address of the znode itself */
24615 +static unsigned long
24616 +index_znode(const jnode * node)
24617 +{
24618 +       unsigned long addr;
24619 +       assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
24620 +
24621 +       /* index of znode is just its address (as offset from PAGE_OFFSET, shifted right by znode_shift_order) */
24622 +       addr = (unsigned long)node;
24623 +       return (addr - PAGE_OFFSET) >> znode_shift_order;
24624 +}
24625 +
24626 +/* ->mapping() method for bitmap jnode (the jnode_plugins[] table in this file also installs it for JNODE_IO_HEAD) */
24627 +static struct address_space *
24628 +mapping_bitmap(const jnode * node)
24629 +{
24630 +       /* all bitmap blocks belong to special bitmap inode */
24631 +       return get_super_private(jnode_get_tree(node)->super)->bitmap->i_mapping;
24632 +}
24633 +
24634 +/* ->index() method for jnodes that are indexed by address: returns address of @node minus PAGE_OFFSET (unshifted, unlike index_znode()) */
24635 +static unsigned long
24636 +index_is_address(const jnode * node)
24637 +{
24638 +       unsigned long ind;
24639 +
24640 +       ind = (unsigned long)node;
24641 +       return ind - PAGE_OFFSET;
24642 +}
24643 +
24644 +/* resolve race with jput: returns @node, or NULL (after dropping caller's x-reference) if @node has JNODE_RIP set */
24645 +reiser4_internal jnode *
24646 +jnode_rip_sync(reiser4_tree *t, jnode * node)
24647 +{
24648 +       /*
24649 +        * This is used as part of RCU-based jnode handling.
24650 +        *
24651 +        * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
24652 +        * with unreferenced jnodes (ones with ->x_count == 0). Hash table is
24653 +        * not protected during this, so concurrent thread may execute
24654 +        * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be
24655 +        * freed in jput_final(). To avoid such races, jput_final() sets
24656 +        * JNODE_RIP on jnode (under tree lock). All places that work with
24657 +        * unreferenced jnodes call this function. It checks for JNODE_RIP bit
24658 +        * (first without taking tree lock, then again under read tree lock), and if this bit is set, releases
24659 +        * the reference acquired by the current thread and returns NULL.
24660 +        *
24661 +        * As a result, if jnode is being concurrently freed, NULL is returned
24662 +        * and caller should pretend that jnode wasn't found in the first
24663 +        * place.
24664 +        *
24665 +        * Otherwise it's safe to release "rcu-read-lock" and continue with
24666 +        * jnode.
24667 +        */
24668 +       if (unlikely(JF_ISSET(node, JNODE_RIP))) {
24669 +               RLOCK_TREE(t);
24670 +               if (JF_ISSET(node, JNODE_RIP)) {
24671 +                       dec_x_ref(node);
24672 +                       node = NULL;
24673 +               }
24674 +               RUNLOCK_TREE(t);
24675 +       }
24676 +       return node;
24677 +}
24678 +
24679 +/* build in @key the key of the unformatted @node from its inode and byte offset (page index << PAGE_CACHE_SHIFT); returns @key */
24680 +reiser4_internal reiser4_key *
24681 +jnode_build_key(const jnode * node, reiser4_key * key)
24682 +{
24683 +       struct inode *inode;
24684 +       item_plugin *iplug;
24685 +       loff_t off;
24686 +
24687 +       assert("nikita-3092", node != NULL);
24688 +       assert("nikita-3093", key != NULL);
24689 +       assert("nikita-3094", jnode_is_unformatted(node));
24690 +
24691 +
24692 +       off   = ((loff_t)index_jnode(node)) << PAGE_CACHE_SHIFT;
24693 +       inode = mapping_jnode(node)->host;
24694 +
24695 +       if (node->parent_item_id != 0)
24696 +               iplug = item_plugin_by_id(node->parent_item_id);
24697 +       else
24698 +               iplug = NULL;
24699 +       /* prefer ->key_by_offset() of the parent item plugin; otherwise fall back to ->key_by_inode() of the inode's file plugin */
24700 +       if (iplug != NULL && iplug->f.key_by_offset)
24701 +               iplug->f.key_by_offset(inode, off, key);
24702 +       else {
24703 +               file_plugin *fplug;
24704 +
24705 +               fplug = inode_file_plugin(inode);
24706 +               assert ("zam-1007", fplug != NULL);
24707 +               assert ("zam-1008", fplug->key_by_inode != NULL);
24708 +
24709 +               fplug->key_by_inode(inode, off, key);
24710 +       }
24711 +
24712 +       return key;
24713 +}
24714 +
24715 +extern int zparse(znode * node);
24716 +
24717 +/* ->parse() method for formatted nodes: delegates to zparse() and returns its result */
24718 +static int
24719 +parse_znode(jnode * node)
24720 +{
24721 +       return zparse(JZNODE(node));
24722 +}
24723 +
24724 +/* ->delete() method for formatted nodes. Asserted: tree is write-locked, node is marked JNODE_HEARD_BANSHEE and has zero ->c_count. */
24725 +static void
24726 +delete_znode(jnode * node, reiser4_tree * tree)
24727 +{
24728 +       znode *z;
24729 +
24730 +       assert("nikita-2128", rw_tree_is_write_locked(tree)); /* NOTE(review): label "nikita-2128" is also used in remove_znode() */
24731 +       assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
24732 +
24733 +       z = JZNODE(node);
24734 +       assert("vs-899", z->c_count == 0);
24735 +
24736 +       /* delete znode from sibling list. */
24737 +       sibling_list_remove(z);
24738 +
24739 +       znode_remove(z, tree);
24740 +}
24741 +
24742 +/* ->remove() method for formatted nodes. Returns 0 if znode was removed, -EBUSY if its ->c_count is not zero. */
24743 +static int
24744 +remove_znode(jnode * node, reiser4_tree * tree)
24745 +{
24746 +       znode *z;
24747 +
24748 +       assert("nikita-2128", rw_tree_is_locked(tree)); /* NOTE(review): label "nikita-2128" is also used in delete_znode() */
24749 +       z = JZNODE(node);
24750 +
24751 +       if (z->c_count == 0) {
24752 +               /* detach znode from sibling list. */
24753 +               sibling_list_drop(z);
24754 +               /* this is called with tree spin-lock held, so call
24755 +                  znode_remove() directly (rather than znode_lock_remove()). */
24756 +               znode_remove(z, tree);
24757 +               return 0;
24758 +       }
24759 +       return RETERR(-EBUSY);
24760 +}
24761 +
24762 +/* ->init() method for formatted nodes: returns result of ->init() of the znode's node plugin */
24763 +static int
24764 +init_znode(jnode * node)
24765 +{
24766 +       znode *z;
24767 +
24768 +       z = JZNODE(node);
24769 +       /* call node plugin to do actual initialization */
24770 +       return z->nplug->init(z);
24771 +}
24772 +
24773 +/* jplug->clone for formatted nodes (znodes): prototypes of the znode helpers used below */
24774 +znode *zalloc(int gfp_flag);
24775 +void zinit(znode *, const znode * parent, reiser4_tree *);
24776 +
24777 +/* ->clone() method for formatted nodes: allocate a new znode in current_tree and copy block number and level from @node. Returns the clone's jnode or ERR_PTR(-ENOMEM). */
24778 +reiser4_internal jnode *
24779 +clone_formatted(jnode *node)
24780 +{
24781 +       znode *clone;
24782 +
24783 +       assert("vs-1430", jnode_is_znode(node));
24784 +       clone = zalloc(GFP_KERNEL);
24785 +       if (clone == NULL)
24786 +               return ERR_PTR(RETERR(-ENOMEM));
24787 +       zinit(clone, 0, current_tree);
24788 +       jnode_set_block(ZJNODE(clone), jnode_get_block(node));
24789 +       /* ZJNODE(clone)->key.z is not initialized */
24790 +       clone->level = JZNODE(node)->level;
24791 +
24792 +       return ZJNODE(clone);
24793 +}
24794 +
24795 +/* jplug->clone for unformatted nodes: allocate and init a new unformatted jnode in current_tree and copy block number from @node. Returns the clone or ERR_PTR(-ENOMEM). */
24796 +reiser4_internal jnode *
24797 +clone_unformatted(jnode *node)
24798 +{
24799 +       jnode *clone;
24800 +
24801 +       assert("vs-1431", jnode_is_unformatted(node));
24802 +       clone = jalloc();
24803 +       if (clone == NULL)
24804 +               return ERR_PTR(RETERR(-ENOMEM));
24805 +
24806 +       jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
24807 +       jnode_set_block(clone, jnode_get_block(node));
24808 +
24809 +       return clone;
24810 +
24811 +}
24812 +
24813 +/*
24814 + * Setup jnode plugin methods for various jnode types. Table is indexed by jnode type. JNODE_IO_HEAD uses the same methods as JNODE_BITMAP; ->clone is set only for unformatted and formatted nodes; JNODE_INODE has no methods at all.
24815 + */
24816 +
24817 +jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
24818 +       [JNODE_UNFORMATTED_BLOCK] = {
24819 +               .h = {
24820 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
24821 +                       .id = JNODE_UNFORMATTED_BLOCK,
24822 +                       .pops = NULL,
24823 +                       .label = "unformatted",
24824 +                       .desc = "unformatted node",
24825 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
24826 +               },
24827 +               .init = init_noinit,
24828 +               .parse = parse_noparse,
24829 +               .mapping = mapping_jnode,
24830 +               .index = index_jnode,
24831 +               .clone = clone_unformatted
24832 +       },
24833 +       [JNODE_FORMATTED_BLOCK] = {
24834 +               .h = {
24835 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
24836 +                       .id = JNODE_FORMATTED_BLOCK,
24837 +                       .pops = NULL,
24838 +                       .label = "formatted",
24839 +                       .desc = "formatted tree node",
24840 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
24841 +               },
24842 +               .init = init_znode,
24843 +               .parse = parse_znode,
24844 +               .mapping = mapping_znode,
24845 +               .index = index_znode,
24846 +               .clone = clone_formatted
24847 +       },
24848 +       [JNODE_BITMAP] = {
24849 +               .h = {
24850 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
24851 +                       .id = JNODE_BITMAP,
24852 +                       .pops = NULL,
24853 +                       .label = "bitmap",
24854 +                       .desc = "bitmap node",
24855 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
24856 +               },
24857 +               .init = init_noinit,
24858 +               .parse = parse_noparse,
24859 +               .mapping = mapping_bitmap,
24860 +               .index = index_is_address,
24861 +               .clone = NULL
24862 +       },
24863 +       [JNODE_IO_HEAD] = {
24864 +               .h = {
24865 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
24866 +                       .id = JNODE_IO_HEAD,
24867 +                       .pops = NULL,
24868 +                       .label = "io head",
24869 +                       .desc = "io head",
24870 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
24871 +               },
24872 +               .init = init_noinit,
24873 +               .parse = parse_noparse,
24874 +               .mapping = mapping_bitmap,
24875 +               .index = index_is_address,
24876 +               .clone = NULL
24877 +       },
24878 +       [JNODE_INODE] = {
24879 +               .h = {
24880 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
24881 +                       .id = JNODE_INODE,
24882 +                       .pops = NULL,
24883 +                       .label = "inode",
24884 +                       .desc = "inode's builtin jnode",
24885 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
24886 +               },
24887 +               .init = NULL,
24888 +               .parse = NULL,
24889 +               .mapping = NULL,
24890 +               .index = NULL,
24891 +               .clone = NULL
24892 +       }
24893 +};
24894 +
24895 +/*
24896 + * jnode destruction.
24897 + *
24898 + * Thread may use a jnode after it acquired a reference to it. References are
24899 + * counted in ->x_count field. Reference protects jnode from being
24900 + * recycled. This is different from protecting jnode data (that are stored in
24901 + * jnode page) from being evicted from memory. Data are protected by jload()
24902 + * and released by jrelse().
24903 + *
24904 + * If thread already possesses a reference to the jnode it can acquire another
24905 + * one through jref(). Initial reference is obtained (usually) by locating
24906 + * jnode in some indexing structure that depends on jnode type: formatted
24907 + * nodes are kept in global hash table, where they are indexed by block
24908 + * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
24909 + * table, which is indexed by oid and offset within file, and in per-inode
24910 + * radix tree.
24911 + *
24912 + * Reference to jnode is released by jput(). If last reference is released,
24913 + * jput_final() is called. This function determines whether jnode has to be
24914 + * deleted (this happens when corresponding node is removed from the file
24915 + * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
24916 + * should be just "removed" (deleted from memory).
24917 + *
24918 + * Jnode destruction is a signally delicate dance because of locking and RCU.
24919 + */
24920 +
24921 +/*
24922 + * Tells whether @node is still in use and so cannot be removed right now.
24923 + * Callers invoke this under the tree lock; when it returns 0 they go on to
24924 + * remove or delete the jnode and free it.
24925 + */
24926 +static inline int
24927 +jnode_is_busy(const jnode * node, jnode_type jtype)
24928 +{
24929 +       /* some other thread managed to acquire a reference (->x_count) to
24930 +        * this jnode: it has to stay. */
24931 +       if (atomic_read(&node->x_count) > 0)
24932 +               return 1;
24933 +       /* a formatted node (znode) with a positive ->c_count still has
24934 +        * children in memory and has to stay as well */
24935 +       return jtype == JNODE_FORMATTED_BLOCK &&
24936 +               JZNODE(node)->c_count > 0;
24937 +}
24938 +
24939 +/*
24940 + * Removal half of jnode destruction. Dispatches on @jtype to the routine
24941 + * that removes @node from its indices (remove_znode() for formatted nodes,
24942 + * remove_jnode() for unformatted ones); other known types need no work here.
24943 + */
24944 +static inline void
24945 +jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree)
24946 +{
24947 +       switch (jtype) {
24948 +       case JNODE_FORMATTED_BLOCK:
24949 +               remove_znode(node, tree);
24950 +               break;
24951 +       case JNODE_UNFORMATTED_BLOCK:
24952 +               remove_jnode(node, tree);
24953 +               break;
24954 +       case JNODE_IO_HEAD:
24955 +       case JNODE_BITMAP:
24956 +       case JNODE_INODE:
24957 +               /* no index removal is done for these types */
24958 +               break;
24959 +       default:
24960 +               wrong_return_value("nikita-3196", "Wrong jnode type");
24961 +       }
24962 +}
24963 +
24964 +/*
24965 + * this is called as part of deleting jnode. Based on jnode type, call
24966 + * corresponding function that removes jnode from indices and returns it back
24967 + * to the appropriate slab (through RCU).
24968 + *
24969 + * This differs from jnode_remove() for formatted nodes---for them sibling
24970 + * list handling is different---and for JNODE_INODE, which is rejected here.
24971 + */
24972 +static inline void
24973 +jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree)
24974 +{
24975 +       switch (jtype) {
24976 +       case JNODE_UNFORMATTED_BLOCK:
24977 +               remove_jnode(node, tree);       /* @tree is used: no UNUSED_ARG */
24978 +               break;
24979 +       case JNODE_IO_HEAD:
24980 +       case JNODE_BITMAP:
24981 +               break;
24982 +       case JNODE_FORMATTED_BLOCK:
24983 +               delete_znode(node, tree);
24984 +               break;
24985 +       case JNODE_INODE:
24986 +       default:
24987 +               wrong_return_value("nikita-3195", "Wrong jnode type");
24988 +       }
24989 +}
24990 +
24991 +#if REISER4_DEBUG
24992 +/*
24993 + * debugging only: unlink @node from the per-super-block list of all jnodes.
24994 + */
24995 +void jnode_list_remove(jnode * node)
24996 +{
24997 +       reiser4_super_info_data *sbinfo =
24998 +               get_super_private(jnode_get_tree(node)->super);
24999 +
25000 +       /* the list is only touched with ->all_guard held */
25001 +       spin_lock_irq(&sbinfo->all_guard);
25002 +       assert("nikita-2422", !list_empty(&node->jnodes));
25003 +       list_del_init(&node->jnodes);
25004 +       spin_unlock_irq(&sbinfo->all_guard);
25005 +}
25006 +#endif
25007 +
25008 +/*
25009 + * Called by jput_final() when the last reference to jnode is released.
25010 + * Returns 0 if jnode was removed and freed, non-zero if it was left in place.
25011 + */
25012 +reiser4_internal int
25013 +jnode_try_drop(jnode * node)
25014 +{
25015 +       int result;
25016 +       reiser4_tree *tree;
25017 +       jnode_type    jtype;
25018 +
25019 +       trace_stamp(TRACE_ZNODES);
25020 +       assert("nikita-2491", node != NULL);
25021 +       assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
25022 +
25023 +       ON_TRACE(TRACE_PCACHE, "trying to drop node: %p\n", node);
25024 +
25025 +       tree = jnode_get_tree(node);
25026 +       jtype = jnode_get_type(node);
25027 +
25028 +       LOCK_JNODE(node);
25029 +       WLOCK_TREE(tree);
25030 +       /*
25031 +        * if jnode has a page---leave it alone. Memory pressure will
25032 +        * eventually kill page and jnode.
25033 +        */
25034 +       if (jnode_page(node) != NULL) {
25035 +               UNLOCK_JNODE(node);
25036 +               WUNLOCK_TREE(tree);
25037 +               JF_CLR(node, JNODE_RIP);        /* jnode stays: no longer being destroyed */
25038 +               return RETERR(-EBUSY);
25039 +       }
25040 +
25041 +       /* re-check ->x_count (and ->c_count for znodes) under tree lock. */
25042 +       result = jnode_is_busy(node, jtype);
25043 +       if (result == 0) {
25044 +               assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
25045 +               assert("nikita-3223", !JF_ISSET(node, JNODE_EFLUSH));
25046 +               assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
25047 +
25048 +               UNLOCK_JNODE(node);
25049 +               /* no page and no references---despatch him. */
25050 +               jnode_remove(node, jtype, tree);
25051 +               WUNLOCK_TREE(tree);
25052 +               jnode_free(node, jtype);
25053 +       } else {
25054 +               /* busy check failed: reference was acquired by concurrent
25055 +                * thread. @result is the non-zero jnode_is_busy() value. */
25056 +               WUNLOCK_TREE(tree);
25057 +               UNLOCK_JNODE(node);
25058 +               JF_CLR(node, JNODE_RIP);
25059 +       }
25060 +       return result;
25061 +}
25062 +
25063 +/* jdelete() -- Delete jnode from the tree and file system; returns 0 if deleted, non-zero if busy */
25064 +reiser4_internal int
25065 +jdelete(jnode * node /* jnode to finish with */)
25066 +{
25067 +       struct page *page;
25068 +       int result;
25069 +       reiser4_tree *tree;
25070 +       jnode_type    jtype;
25071 +
25072 +       trace_stamp(TRACE_ZNODES);
25073 +       assert("nikita-467", node != NULL);
25074 +       assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
25075 +       /* jnode cannot be eflushed at this point, because emergency flush
25076 +        * acquired additional reference counter. */
25077 +       assert("nikita-2917", !JF_ISSET(node, JNODE_EFLUSH));
25078 +
25079 +       ON_TRACE(TRACE_PCACHE, "delete node: %p\n", node);
25080 +
25081 +       jtype = jnode_get_type(node);
25082 +
25083 +       page = jnode_lock_page(node);
25084 +       assert("nikita-2402", spin_jnode_is_locked(node));
25085 +
25086 +       tree = jnode_get_tree(node);
25087 +
25088 +       WLOCK_TREE(tree);
25089 +       /* re-check ->x_count (and ->c_count for znodes) under tree lock. */
25090 +       result = jnode_is_busy(node, jtype);
25091 +       if (likely(!result)) {
25092 +               assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
25093 +               assert("jmacd-511", atomic_read(&node->d_count) == 0);
25094 +
25095 +               /* detach page */
25096 +               if (page != NULL) {
25097 +                       /*
25098 +                        * FIXME this is racy against jnode_extent_write().
25099 +                        */
25100 +                       page_clear_jnode(page, node);
25101 +               }
25102 +               UNLOCK_JNODE(node);
25103 +               /* goodbye */
25104 +               jnode_delete(node, jtype, tree);
25105 +               WUNLOCK_TREE(tree);
25106 +               jnode_free(node, jtype);
25107 +               /* @node is no longer valid pointer */
25108 +               if (page != NULL)
25109 +                       drop_page(page);
25110 +       } else {
25111 +               /* busy check failed: reference was acquired by concurrent
25112 +                * thread. Undo JNODE_RIP and release everything taken above. */
25113 +               JF_CLR(node, JNODE_RIP);
25114 +               WUNLOCK_TREE(tree);
25115 +               UNLOCK_JNODE(node);
25116 +               if (page != NULL)
25117 +                       unlock_page(page);
25118 +       }
25119 +       return result;
25120 +}
25121 +
25122 +/* drop jnode on the floor.
25123 +
25124 +   Return value:
25125 +
25126 +    non-zero: failed to drop jnode, because jnode_is_busy() found it in use
25127 +
25128 +    0:        successfully dropped jnode
25129 +
25130 +*/
25131 +static int
25132 +jdrop_in_tree(jnode * node, reiser4_tree * tree)
25133 +{
25134 +       struct page *page;
25135 +       jnode_type    jtype;
25136 +       int result;
25137 +
25138 +       assert("zam-602", node != NULL);
25139 +       assert("nikita-2362", rw_tree_is_not_locked(tree));
25140 +       assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
25141 +       // assert( "nikita-2532", JF_ISSET( node, JNODE_RIP ) );
25142 +
25143 +       ON_TRACE(TRACE_PCACHE, "drop node: %p\n", node);
25144 +
25145 +       jtype = jnode_get_type(node);
25146 +
25147 +       page = jnode_lock_page(node);
25148 +       assert("nikita-2405", spin_jnode_is_locked(node));
25149 +
25150 +       WLOCK_TREE(tree);
25151 +
25152 +       /* re-check ->x_count (and ->c_count for znodes) under tree lock. */
25153 +       result = jnode_is_busy(node, jtype);
25154 +       if (!result) {
25155 +               assert("nikita-2488", page == jnode_page(node));
25156 +               assert("nikita-2533", atomic_read(&node->d_count) == 0);
25157 +               if (page != NULL) {
25158 +                       assert("nikita-2126", !PageDirty(page));
25159 +                       assert("nikita-2127", PageUptodate(page));
25160 +                       assert("nikita-2181", PageLocked(page));
25161 +                       page_clear_jnode(page, node);
25162 +               }
25163 +               UNLOCK_JNODE(node);
25164 +               jnode_remove(node, jtype, tree);
25165 +               WUNLOCK_TREE(tree);
25166 +               jnode_free(node, jtype);
25167 +               if (page != NULL) {
25168 +                       drop_page(page);
25169 +               }
25170 +       } else {
25171 +               /* busy check failed: reference was acquired by concurrent
25172 +                * thread. JNODE_RIP is cleared and all locks are released. */
25173 +               JF_CLR(node, JNODE_RIP);
25174 +               WUNLOCK_TREE(tree);
25175 +               UNLOCK_JNODE(node);
25176 +               if (page != NULL)
25177 +                       unlock_page(page);
25178 +       }
25179 +       return result;
25180 +}
25181 +
25182 +/* This function frees jnode "if possible". In particular, [dcx]_count has to
25183 +   be 0 (where applicable). The result of jdrop_in_tree() is discarded. */
25184 +reiser4_internal void
25185 +jdrop(jnode * node)
25186 +{
25187 +       jdrop_in_tree(node, jnode_get_tree(node));
25188 +}
25189 +
25190 +
25191 +/* IO head jnode implementation; The io heads are simple j-nodes with limited
25192 +   functionality (these j-nodes are not in any hash table) just for reading
25193 +   from and writing to disk. */
25194 +
25195 +reiser4_internal jnode *
25196 +alloc_io_head(const reiser4_block_nr * block)
25197 +{
25198 +       jnode *jal = jalloc();
25199 +
25200 +       /* returns a referenced io head for @block, or NULL if jalloc() failed */
25201 +       if (jal != NULL) {
25202 +               jnode_init(jal, current_tree, JNODE_IO_HEAD);
25203 +               jnode_set_block(jal, block);
25204 +               jref(jal);      /* takes an ->x_count reference; must not see NULL */
25205 +       }
25206 +
25207 +       return jal;
25208 +}
25209 +
25210 +reiser4_internal void
25211 +drop_io_head(jnode * node)
25212 +{
25213 +       assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);       /* io heads only */
25214 +
25215 +       jput(node);     /* presumably pairs with the jref() in alloc_io_head() */
25216 +       jdrop(node);    /* then free the jnode if nothing else holds it */
25217 +}
25218 +
25219 +/* keep jnode data from reiser4_releasepage(): page_cache_get() on jnode's page */
25220 +reiser4_internal void
25221 +pin_jnode_data(jnode * node)
25222 +{
25223 +       assert("zam-671", jnode_page(node) != NULL);    /* a page must be attached */
25224 +       page_cache_get(jnode_page(node));
25225 +}
25226 +
25227 +/* make jnode data free-able again: page_cache_release() on jnode's page */
25228 +reiser4_internal void
25229 +unpin_jnode_data(jnode * node)
25230 +{
25231 +       assert("zam-672", jnode_page(node) != NULL);    /* a page must be attached */
25232 +       page_cache_release(jnode_page(node));
25233 +}
25234 +
25235 +reiser4_internal struct address_space *
25236 +jnode_get_mapping(const jnode * node)
25237 +{
25238 +       assert("nikita-3162", node != NULL);
25239 +       return jnode_ops(node)->mapping(node);  /* NOTE(review): ->mapping is NULL in the JNODE_INODE plugin */
25240 +}
25241 +
25242 +#if REISER4_DEBUG_NODE_INVARIANT
25243 +/* debugging aid: jnode invariant; on failure *msg is the text of the failed check */
25244 +reiser4_internal int
25245 +jnode_invariant_f(const jnode * node,
25246 +                 char const **msg)
25247 +{
25248 +#define _ergo(ant, con)                                                \
25249 +       ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
25250 +#define _check(exp) ((*msg) = #exp, (exp))
25251 +
25252 +       return
25253 +               _check(node != NULL) &&
25254 +
25255 +               /* [jnode-queued] */
25256 +
25257 +               /* only relocated node can be queued, except that when znode
25258 +                * is being deleted, its JNODE_RELOC bit is cleared */
25259 +               _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
25260 +                     JF_ISSET(node, JNODE_RELOC) ||
25261 +                     JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
25262 +
25263 +               _check(node->jnodes.prev != NULL) &&
25264 +               _check(node->jnodes.next != NULL) &&
25265 +
25266 +               /* [jnode-dirty] invariant */
25267 +
25268 +               /* dirty jnode is part of atom */
25269 +               _ergo(jnode_is_dirty(node), node->atom != NULL) &&
25270 +
25271 +               /* [jnode-oid] invariant */
25272 +
25273 +               /* for unformatted node ->objectid and ->mapping fields are
25274 +                * consistent */
25275 +               _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
25276 +                     node->key.j.objectid == get_inode_oid(node->key.j.mapping->host)) &&
25277 +               /* [jnode-atom-valid] invariant */
25278 +
25279 +               /* node atom has valid state */
25280 +               _ergo(node->atom != NULL,
25281 +                     node->atom->stage != ASTAGE_INVALID) &&
25282 +
25283 +               /* [jnode-page-binding] invariant */
25284 +
25285 +               /* if node points to page, it points back to node */
25286 +               _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
25287 +
25288 +               /* [jnode-refs] invariant */
25289 +
25290 +               /* only referenced jnode can be loaded: ->x_count >= ->d_count */
25291 +               _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
25292 +
25293 +}
25294 +
25295 +/* debugging aid: check jnode invariant; warn (no panic) and return 0 if it doesn't hold */
25296 +int
25297 +jnode_invariant(const jnode * node, int tlocked, int jlocked)
25298 +{
25299 +       char const *failed_msg;
25300 +       int result;
25301 +       reiser4_tree *tree;
25302 +
25303 +       /* validate @node before it is handed to jnode_get_tree() */
25304 +       assert("umka-063312", node != NULL);
25305 +       tree = jnode_get_tree(node);
25306 +       assert("umka-064321", tree != NULL);
25307 +
25308 +       if (!jlocked && !tlocked)       /* caller holds neither lock */
25309 +               LOCK_JNODE((jnode *) node);
25310 +       if (!tlocked)                   /* caller does not hold the tree lock */
25311 +               RLOCK_TREE(tree);
25312 +       result = jnode_invariant_f(node, &failed_msg);
25313 +       if (!result) {
25314 +               info_jnode("corrupted node", node);
25315 +               warning("jmacd-555", "Condition %s failed", failed_msg);
25316 +       }
25317 +       if (!tlocked)
25318 +               RUNLOCK_TREE(tree);
25319 +       if (!jlocked && !tlocked)
25320 +               UNLOCK_JNODE((jnode *) node);
25321 +       return result;
25322 +}
25323 +
25324 +/* REISER4_DEBUG_NODE_INVARIANT */
25325 +#endif
25326 +
25327 +#if REISER4_STATS
25328 +void reiser4_stat_inc_at_level_jput(const jnode * node)
25329 +{      /* out-of-line wrapper: bump the jnode.jput counter at @node's level */
25330 +       reiser4_stat_inc_at_level(jnode_get_level(node), jnode.jput);
25331 +}
25332 +
25333 +void reiser4_stat_inc_at_level_jputlast(const jnode * node)
25334 +{      /* out-of-line wrapper: bump the jnode.jputlast counter at @node's level */
25335 +       reiser4_stat_inc_at_level(jnode_get_level(node), jnode.jputlast);
25336 +}
25337 +/* REISER4_STATS */
25338 +#endif
25339 +
25340 +#if REISER4_DEBUG_OUTPUT
25341 +
25342 +reiser4_internal const char *
25343 +jnode_type_name(jnode_type type)
25344 +{
25345 +       /* static scratch buffer, used only for values not named below */
25346 +       static char unknown[30];
25347 +
25348 +       switch (type) {
25349 +       case JNODE_UNFORMATTED_BLOCK:
25350 +               return "unformatted";
25351 +       case JNODE_FORMATTED_BLOCK:
25352 +               return "formatted";
25353 +       case JNODE_BITMAP:
25354 +               return "bitmap";
25355 +       case JNODE_IO_HEAD:
25356 +               return "io head";
25357 +       case JNODE_INODE:
25358 +               return "inode";
25359 +       case LAST_JNODE_TYPE:
25360 +               return "last";
25361 +       default:
25362 +               sprintf(unknown, "unknown %i", type);
25363 +               return unknown;
25364 +       }
25365 +}
25366 +
25367 +#define jnode_state_name( node, flag ) /* "NAME|" if @flag is set, else "" */ \
25368 +       ( JF_ISSET( ( node ), ( flag ) ) ? ((#flag "|")+6) : "" ) /* +6 skips "JNODE_" */
25369 +
25370 +/* debugging aid: output human readable information about @node */
25371 +reiser4_internal void
25372 +info_jnode(const char *prefix /* prefix to print */ ,
25373 +          const jnode * node /* node to print */ )
25374 +{
25375 +       assert("umka-068", prefix != NULL);
25376 +
25377 +       if (node == NULL) {
25378 +               printk("%s: null\n", prefix);
25379 +               return;
25380 +       }
25381 +
25382 +       printk("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
25383 +              " block: %s, d_count: %d, x_count: %d, "
25384 +              "pg: %p, atom: %p, lock: %i:%i, type: %s, ",
25385 +              prefix, node, node->state,
25386 +              jnode_state_name(node, JNODE_PARSED),
25387 +              jnode_state_name(node, JNODE_HEARD_BANSHEE),
25388 +              jnode_state_name(node, JNODE_LEFT_CONNECTED),
25389 +              jnode_state_name(node, JNODE_RIGHT_CONNECTED),
25390 +              jnode_state_name(node, JNODE_ORPHAN),
25391 +              jnode_state_name(node, JNODE_CREATED),
25392 +              jnode_state_name(node, JNODE_RELOC),
25393 +              jnode_state_name(node, JNODE_OVRWR),
25394 +              jnode_state_name(node, JNODE_DIRTY),
25395 +              jnode_state_name(node, JNODE_IS_DYING),
25396 +              jnode_state_name(node, JNODE_EFLUSH),
25397 +              jnode_state_name(node, JNODE_FLUSH_QUEUED),
25398 +              jnode_state_name(node, JNODE_RIP),
25399 +              jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
25400 +              jnode_state_name(node, JNODE_WRITEBACK),
25401 +              jnode_state_name(node, JNODE_NEW),
25402 +              jnode_state_name(node, JNODE_DKSET),
25403 +              jnode_state_name(node, JNODE_EPROTECTED),
25404 +              jnode_state_name(node, JNODE_REPACK),
25405 +              jnode_state_name(node, JNODE_CLUSTER_PAGE),
25406 +              jnode_get_level(node), sprint_address(jnode_get_block(node)),
25407 +              atomic_read(&node->d_count), atomic_read(&node->x_count),
25408 +              jnode_page(node), node->atom,
25409 +#if REISER4_LOCKPROF && REISER4_LOCKPROF_OBJECTS
25410 +              node->guard.held, node->guard.trying,
25411 +#else
25412 +              0, 0,
25413 +#endif
25414 +              jnode_type_name(jnode_get_type(node)));
25415 +       if (jnode_is_unformatted(node)) {
25416 +               printk("inode: %llu, index: %lu, ",
25417 +                      node->key.j.objectid, node->key.j.index);
25418 +       }
25419 +}
25420 +
25421 +/* debugging aid: print @node, going through print_znode() when it is a znode */
25422 +reiser4_internal void
25423 +print_jnode(const char *prefix /* prefix to print */ ,
25424 +           const jnode * node /* node to print */)
25425 +{
25426 +       if (!jnode_is_znode(node))
25427 +               info_jnode(prefix, node);
25428 +       else
25429 +               print_znode(prefix, JZNODE(node));
25430 +}
25431 +
25432 +/* info_jnode() every jnode in @tree's jhash_table; cut-n-paste replica of print_znodes() */
25433 +reiser4_internal void
25434 +print_jnodes(const char *prefix, reiser4_tree * tree)
25435 +{
25436 +       jnode *node;
25437 +       jnode *next;
25438 +       j_hash_table *htable;
25439 +       int tree_lock_taken;
25440 +
25441 +       if (tree == NULL)
25442 +               tree = current_tree;    /* NULL selects the current tree */
25443 +
25444 +       /* this is a debugging function. It can be called by reiser4_panic()
25445 +          with tree spin-lock already held. Trylock is not exactly what we
25446 +          want here, but it is passable: the table is walked either way.
25447 +       */
25448 +       tree_lock_taken = write_trylock_tree(tree);
25449 +       htable = &tree->jhash_table;
25450 +
25451 +       for_all_in_htable(htable, j, node, next) {
25452 +               info_jnode(prefix, node);
25453 +               printk("\n");   /* info_jnode() leaves the line unterminated */
25454 +       }
25455 +       if (tree_lock_taken)
25456 +               WUNLOCK_TREE(tree);
25457 +}
25458 +
25459 +/* REISER4_DEBUG_OUTPUT */
25460 +#endif
25461 +
25462 +/* this is only used to create jnode during capture copy */
25463 +reiser4_internal jnode *jclone(jnode *node)
25464 +{
25465 +       jnode *copy;
25466 +
25467 +       assert("vs-1429", jnode_ops(node)->clone);
25468 +       copy = jnode_ops(node)->clone(node);
25469 +       if (!IS_ERR(copy)) {
25470 +               /* hand the copy out referenced and flagged */
25471 +               jref(copy);
25472 +               JF_SET(copy, JNODE_HEARD_BANSHEE);
25473 +               JF_SET(copy, JNODE_CC);
25474 +       }
25475 +       return copy;
25476 +}
25477 +
25478 +
25479 +/* Make Linus happy.
25480 +   Local variables:
25481 +   c-indentation-style: "K&R"
25482 +   mode-name: "LC"
25483 +   c-basic-offset: 8
25484 +   tab-width: 8
25485 +   fill-column: 80
25486 +   End:
25487 +*/
25488 diff -rupN linux-2.6.8-rc3/fs/reiser4/jnode.h linux-2.6.8-rc3-a/fs/reiser4/jnode.h
25489 --- linux-2.6.8-rc3/fs/reiser4/jnode.h  1970-01-01 03:00:00.000000000 +0300
25490 +++ linux-2.6.8-rc3-a/fs/reiser4/jnode.h        2004-08-05 21:20:53.115653680 +0400
25491 @@ -0,0 +1,790 @@
25492 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
25493 + * reiser4/README */
25494 +
25495 +/* Declaration of jnode. See jnode.c for details. */
25496 +
25497 +#ifndef __JNODE_H__
25498 +#define __JNODE_H__
25499 +
25500 +#include "forward.h"
25501 +#include "type_safe_hash.h"
25502 +#include "type_safe_list.h"
25503 +#include "txnmgr.h"
25504 +#include "key.h"
25505 +#include "debug.h"
25506 +#include "dformat.h"
25507 +#include "spin_macros.h"
25508 +#include "emergency_flush.h"
25509 +
25510 +#include "plugin/plugin.h"
25511 +
25512 +#include <linux/fs.h>
25513 +#include <linux/mm.h>
25514 +#include <linux/spinlock.h>
25515 +#include <asm/atomic.h>
25516 +#include <asm/bitops.h>
25517 +#include <linux/list.h>
25518 +#include <linux/rcupdate.h>
25519 +
25520 +/* declare hash table of jnodes (jnodes proper, that is, unformatted
25521 +   nodes)  */
25522 +TYPE_SAFE_HASH_DECLARE(j, jnode);
25523 +
25524 +/* declare hash table of znodes */
25525 +TYPE_SAFE_HASH_DECLARE(z, znode);
25526 +
25527 +typedef struct {       /* key of an unformatted jnode (member ->key.j of struct jnode) */
25528 +       __u64 objectid;         /* [jnode-oid]: equals get_inode_oid(mapping->host) */
25529 +       unsigned long index;    /* presumably page index within the file -- TODO confirm */
25530 +       struct address_space *mapping;  /* may be NULL (the invariant checks for that) */
25531 +} jnode_key_t;
25532 +
25533 +/*
25534 +   Jnode is the "base class" of other nodes in reiser4. It also happens to
25535 +   be exactly the node we use for unformatted tree nodes.
25536 +
25537 +   Jnode provides following basic functionality:
25538 +
25539 +   . reference counting and indexing.
25540 +
25541 +   . integration with page cache. Jnode has ->pg reference to which page can
25542 +   be attached.
25543 +
25544 +   . interface to transaction manager. It is jnode that is kept in transaction
25545 +   manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
25546 +   means, there should be special type of jnode for inode.)
25547 +
25548 +   Locking:
25549 +
25550 +   Spin lock: the following fields are protected by the per-jnode spin lock:
25551 +
25552 +    ->state
25553 +    ->atom
25554 +    ->capture_link
25555 +
25556 +   Following fields are protected by the global tree lock:
25557 +
25558 +    ->link
25559 +    ->key.z (content of ->key.z is only changed in znode_rehash())
25560 +    ->key.j
25561 +
25562 +   Atomic counters
25563 +
25564 +    ->x_count
25565 +    ->d_count
25566 +
25567 +    ->pg, and ->data are protected by spin lock for unused jnode and are
25568 +    immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
25569 +    is false).
25570 +
25571 +    ->tree is immutable after creation
25572 +
25573 +   Unclear
25574 +
25575 +    ->blocknr: should be under jnode spin-lock, but current interface is based
25576 +    on passing of block address.
25577 +
25578 +   If you ever need to spin lock two nodes at once, do this in "natural"
25579 +   memory order: lock znode with lower address first. (See lock_two_nodes().)
25580 +
25581 +   Invariants involving this data-type:
25582 +
25583 +      [jnode-dirty]
25584 +      [jnode-refs]
25585 +      [jnode-oid]
25586 +      [jnode-queued]
25587 +      [jnode-atom-valid]
25588 +      [jnode-page-binding]
25589 +*/
25590 +
25591 +struct jnode {
25592 +#if REISER4_DEBUG
25593 +#define JMAGIC 0x52654973 /* "ReIs" */
25594 +       int magic;      /* checked against JMAGIC by the JF_*() accessors */
25595 +#endif
25596 +       /* FIRST CACHE LINE (16 bytes): data used by jload */
25597 +
25598 +       /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
25599 +       /*   0 */ unsigned long state;
25600 +
25601 +       /* lock, protecting jnode's fields. */
25602 +       /*   4 */ reiser4_spin_data load;
25603 +
25604 +       /* counter of references to jnode itself. Increased on jref().
25605 +          Decreased on jput().
25606 +       */
25607 +       /*   8 */ atomic_t x_count;
25608 +
25609 +       /* counter of references to jnode's data. Pin data page(s) in
25610 +          memory while this is greater than 0. Increased on jload().
25611 +          Decreased on jrelse().
25612 +       */
25613 +       /*   12 */ atomic_t d_count;
25614 +
25615 +       /* SECOND CACHE LINE: data used by hash table lookups */
25616 +
25617 +       /*   16 */ union {
25618 +               /* znodes are hashed by block number */
25619 +               reiser4_block_nr z;
25620 +               /* unformatted nodes are hashed by mapping plus offset */
25621 +               jnode_key_t j;
25622 +       } key;
25623 +
25624 +       /* THIRD CACHE LINE */
25625 +
25626 +       /*   32 */ union {
25627 +               /* pointers to maintain hash-table */
25628 +               z_hash_link z;
25629 +               j_hash_link j;
25630 +       } link;
25631 +
25632 +       /* pointer to jnode page.  */
25633 +       /*   36 */ struct page *pg;
25634 +       /* pointer to node itself. This is page_address(node->pg) when page is
25635 +          attached to the jnode
25636 +       */
25637 +       /*   40 */ void *data;
25638 +
25639 +       /*   44 */ reiser4_tree *tree; /* immutable after creation */
25640 +
25641 +       /* FOURTH CACHE LINE: atom related fields */
25642 +
25643 +       /*   48 */ reiser4_spin_data guard; /* presumably the per-jnode spin lock (LOCK_JNODE) -- TODO confirm */
25644 +
25645 +       /* atom the block is in, if any */
25646 +       /*   52 */ txn_atom *atom;
25647 +
25648 +       /* capture list */
25649 +       /*   56 */ capture_list_link capture_link;
25650 +
25651 +       /* FIFTH CACHE LINE */
25652 +
25653 +       /*   64 */ struct rcu_head rcu; /* crosses cache line */
25654 +
25655 +       /* SIXTH CACHE LINE */
25656 +
25657 +       /* the real blocknr (where io is going to/from) */
25658 +       /*   80 */ reiser4_block_nr blocknr;
25659 +       /* Parent item type, unformatted and CRC need it for offset => key conversion.  */
25660 +       /* NOTE: this parent_item_id looks like jnode type. */
25661 +       /*   88 */ reiser4_plugin_id parent_item_id;
25662 +       /*   92 */
25663 +#if REISER4_DEBUG
25664 +       /* list of all jnodes for debugging purposes. */
25665 +       struct list_head jnodes;
25666 +       /* how many times this jnode was written in one transaction */
25667 +       int      written;
25668 +       /* this indicates which atom's list the jnode is on */
25669 +        atom_list list1;
25670 +       /* for debugging jnodes of one inode are attached to inode via this list */
25671 +       inode_jnodes_list_link inode_link;
25672 +#endif
25673 +} __attribute__((aligned(16)));
25674 +
25675 +
25676 +/*
25677 + * jnode types. LAST_JNODE_TYPE is not a type: it terminates the enumeration.
25678 + */
25679 +typedef enum {
25680 +       JNODE_UNFORMATTED_BLOCK, /* unformatted block */
25681 +       JNODE_FORMATTED_BLOCK,   /* formatted block, znode */
25682 +       JNODE_BITMAP,            /* bitmap */
25683 +       JNODE_IO_HEAD,           /* jnode representing a block in the
25684 +                                 * wandering log */
25685 +       JNODE_INODE,             /* jnode embedded into inode */
25686 +       LAST_JNODE_TYPE
25687 +} jnode_type;
25688 +
25689 +TYPE_SAFE_LIST_DEFINE(capture, jnode, capture_link); /* list linked via jnode->capture_link */
25690 +#if REISER4_DEBUG
25691 +TYPE_SAFE_LIST_DEFINE(inode_jnodes, jnode, inode_link); /* debug list via jnode->inode_link */
25692 +#endif
25693 +
25694 +/* jnode states */
25695 +typedef enum {
25696 +       /* jnode's page is loaded and data checked */
25697 +       JNODE_PARSED = 0,
25698 +       /* node was deleted, not all locks on it were released. This
25699 +          node is empty and is going to be removed from the tree
25700 +          shortly. */
25701 +       JNODE_HEARD_BANSHEE = 1,
25702 +       /* left sibling pointer is valid */
25703 +       JNODE_LEFT_CONNECTED = 2,
25704 +       /* right sibling pointer is valid */
25705 +       JNODE_RIGHT_CONNECTED = 3,
25706 +
25707 +       /* znode was just created and doesn't yet have a pointer from
25708 +          its parent */
25709 +       JNODE_ORPHAN = 4,
25710 +
25711 +       /* this node was created by its transaction and has not been assigned
25712 +         a block address. */
25713 +       JNODE_CREATED = 5,
25714 +
25715 +       /* this node is currently relocated */
25716 +       JNODE_RELOC = 6,
25717 +       /* this node is currently wandered */
25718 +       JNODE_OVRWR = 7,
25719 +
25720 +       /* this znode has been modified */
25721 +       JNODE_DIRTY = 8,
25722 +
25723 +       /* znode lock is being invalidated */
25724 +       JNODE_IS_DYING = 9,
25725 +
25726 +       /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
25727 +
25728 +       JNODE_EFLUSH = 11,
25729 +
25730 +       /* jnode is queued for flushing. */
25731 +       JNODE_FLUSH_QUEUED = 12,
25732 +
25733 +       /* In the following bits jnode type is encoded. */
25734 +       JNODE_TYPE_1 = 13,
25735 +       JNODE_TYPE_2 = 14,
25736 +       JNODE_TYPE_3 = 15,
25737 +
25738 +       /* jnode is being destroyed */
25739 +       JNODE_RIP = 16,
25740 +
25741 +       /* znode was not captured during locking (it might so be because
25742 +         ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
25743 +       JNODE_MISSED_IN_CAPTURE = 17,
25744 +
25745 +       /* write is in progress */
25746 +       JNODE_WRITEBACK = 18,
25747 +
25748 +       /* FIXME: now it is used by crypto-compress plugin only */
25749 +       JNODE_NEW = 19,
25750 +
25751 +       /* delimiting keys are already set for this znode. */
25752 +       JNODE_DKSET = 20,
25753 +       /* if page was dirtied through mmap, we don't want to lose data, even
25754 +        * though page and jnode may be clean. Mark jnode with JNODE_KEEPME so
25755 +        * that ->releasepage() can tell. As this is used only for
25756 +        * unformatted, we can share bit with DKSET which is only meaningful
25757 +        * for formatted. */
25758 +       JNODE_KEEPME = 20,
25759 +
25760 +       /* cheap and effective protection of jnode from emergency flush. This
25761 +        * bit can only be set by thread that holds long term lock on jnode
25762 +        * parent node (twig node, where extent unit lives). */
25763 +       JNODE_EPROTECTED = 21,
25764 +       JNODE_CLUSTER_PAGE = 22,
25765 +       /* Jnode is marked for repacking, that means the reiser4 flush and the
25766 +        * block allocator should process this node special way  */
25767 +       JNODE_REPACK = 23,
25768 +       /* enable node squeezing */
25769 +       JNODE_SQUEEZABLE = 24,
25770 +
25771 +       JNODE_SCANNED = 25,
25772 +       JNODE_JLOADED_BY_GET_OVERWRITE_SET = 26,
25773 +       /* capture copy jnode */
25774 +       JNODE_CC = 27,
25775 +       /* this jnode is copy of coced original */
25776 +       JNODE_CCED = 28,
25777 +       /*
25778 +        * When jnode is dirtied for the first time in given transaction,
25779 +        * do_jnode_make_dirty() checks whether this jnode can possibly become
25780 +        * member of overwrite set. If so, this bit is set, and one block is
25781 +        * reserved in the ->flush_reserved space of atom.
25782 +        *
25783 +        * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
25784 +        *
25785 +        *     (1) flush decides that we want this block to go into relocate
25786 +        *     set after all.
25787 +        *
25788 +        *     (2) wandering log is allocated (by log writer)
25789 +        *
25790 +        *     (3) extent is allocated
25791 +        *
25792 +        */
25793 +       JNODE_FLUSH_RESERVED = 29
25794 +} reiser4_jnode_state;
25795 +
25796 +/* Inline helpers for accessing the jnode state bits. */
25797 +
25798 +static inline void
25799 +JF_CLR(jnode * j /* jnode to update */ , int f /* state bit to clear */ )
25800 +{
25801 +       assert("unknown-1", j->magic == JMAGIC);
25802 +       clear_bit(f, &j->state);
25803 +}
25804 +static inline int
25805 +JF_ISSET(const jnode * j /* jnode to query */ , int f /* state bit to test */ )
25806 +{
25807 +       assert("unknown-2", j->magic == JMAGIC);
25808 +       return test_bit(f, &((jnode *) j)->state);      /* cast drops const for test_bit() */
25809 +}
25810 +static inline void
25811 +JF_SET(jnode * j /* jnode to update */ , int f /* state bit to set */ )
25812 +{
25813 +       assert("unknown-3", j->magic == JMAGIC);
25814 +       set_bit(f, &j->state);
25815 +}
25816 +
25817 +static inline int
25818 +JF_TEST_AND_SET(jnode * j /* jnode to update */ , int f /* state bit to set */ )
25819 +{
25820 +       assert("unknown-4", j->magic == JMAGIC);
25821 +       return test_and_set_bit(f, &j->state);  /* result of test_and_set_bit() is passed through */
25822 +}
25823 +
25824 +/* ordering constraint for jnode spin lock: the predicate holds only when
25825 +   no tree lock, txnh lock, zlock or dk lock is counted as held */
25826 +#define spin_ordering_pred_jnode( node )                                       \
25827 +       ( ( lock_counters() -> rw_locked_tree == 0 ) &&                 \
25828 +         ( lock_counters() -> spin_locked_txnh == 0 ) &&                       \
25829 +         ( lock_counters() -> rw_locked_zlock == 0 ) &&                      \
25830 +         ( lock_counters() -> rw_locked_dk == 0 )   &&                       \
25831 +         /*                                                                    \
25832 +            in addition spin_locked_jnode must be below 2. NOTE(review):      \
25833 +            confirm this allows at most one jnode spin lock at a time.         \
25834 +         */                                                                   \
25835 +         ( lock_counters() -> spin_locked_jnode < 2 ) )
25836 +
25837 +/* Define spin_lock_jnode, spin_unlock_jnode, and spin_jnode_is_locked.
25838 +   Take and release short-term spinlocks.  Don't hold these across
25839 +   io.
25840 +*/
25841 +SPIN_LOCK_FUNCTIONS(jnode, jnode, guard);
25842 +
25843 +#define spin_ordering_pred_jload(node) (1)
25844 +
25845 +SPIN_LOCK_FUNCTIONS(jload, jnode, load);
25846 +
25847 +static inline int
25848 +jnode_is_in_deleteset(const jnode * node /* jnode to query */ )
25849 +{
25850 +       return JF_ISSET(node, JNODE_RELOC);     /* decided by the JNODE_RELOC bit alone */
25851 +}
25852 +
25853 +
25854 +extern int jnode_init_static(void);
25855 +extern int jnode_done_static(void);
25856 +
25857 +/* Jnode routines */
25858 +extern jnode *jalloc(void);
25859 +extern void jfree(jnode * node) NONNULL;
25860 +extern jnode *jclone(jnode *);
25861 +extern jnode *jlookup(reiser4_tree * tree,
25862 +                     oid_t objectid, unsigned long ind) NONNULL;
25863 +extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
25864 +extern jnode *jnode_by_page(struct page *pg) NONNULL;
25865 +extern jnode *jnode_of_page(struct page *pg) NONNULL;
25866 +void jnode_attach_page(jnode * node, struct page *pg);
25867 +jnode *find_get_jnode(reiser4_tree * tree,
25868 +                     struct address_space *mapping, oid_t oid,
25869 +                     unsigned long index);
25870 +
25871 +void unhash_unformatted_jnode(jnode *);
25872 +struct page *jnode_get_page_locked(jnode *, int gfp_flags);
25873 +extern jnode *page_next_jnode(jnode * node) NONNULL;
25874 +extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL;
25875 +extern void jnode_make_dirty(jnode * node) NONNULL;
25876 +extern void jnode_make_clean(jnode * node) NONNULL;
25877 +extern void jnode_make_wander_nolock(jnode * node) NONNULL;
25878 +extern void jnode_make_wander(jnode*) NONNULL;
25879 +extern void znode_make_reloc(znode*, flush_queue_t*) NONNULL;
25880 +extern void unformatted_make_reloc(jnode*, flush_queue_t*) NONNULL;
25881 +
25882 +extern void jnode_set_block(jnode * node,
25883 +                           const reiser4_block_nr * blocknr) NONNULL;
25884 +extern struct page *jnode_lock_page(jnode *) NONNULL;
25885 +extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
25886 +
25887 +/* block number of node: returns pointer to the ->blocknr field of @node */
25888 +static inline const reiser4_block_nr *
25889 +jnode_get_block(const jnode * node /* jnode to query */)
25890 +{
25891 +       assert("nikita-528", node != NULL);
25892 +
25893 +       return &node->blocknr;
25894 +}
25895 +
25896 +/* block number for IO: result of eflush_get() when JNODE_EFLUSH is set on
25897 + * the jnode, otherwise the same as jnode_get_block(). Asserts that the jnode
25898 + * spin lock is held. */
25899 +static inline const reiser4_block_nr *
25900 +jnode_get_io_block(const jnode * node)
25901 +{
25902 +       assert("nikita-2768", node != NULL);
25903 +       assert("nikita-2769", spin_jnode_is_locked(node));
25904 +
25905 +       if (unlikely(JF_ISSET(node, JNODE_EFLUSH)))
25906 +               return eflush_get(node);
25907 +       else
25908 +               return jnode_get_block(node);
25909 +}
25910 +
25911 +/* Jnode flush interface. */
25912 +extern reiser4_blocknr_hint *pos_hint(flush_pos_t * pos);
25913 +extern int pos_leaf_relocate(flush_pos_t * pos);
25914 +extern flush_queue_t * pos_fq(flush_pos_t * pos);
25915 +
25916 +/* FIXME-VS: these are used in plugin/item/extent.c */
25917 +
25918 +/* does extent_get_block have to be called */
25919 +#define jnode_mapped(node)     JF_ISSET (node, JNODE_MAPPED)
25920 +#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
25921 +/* pointer to this block was just created (either by appending or by plugging a
25922 +   hole), or zinit_new was called */
25923 +#define jnode_created(node)        JF_ISSET (node, JNODE_CREATED)
25924 +#define jnode_set_created(node)    JF_SET (node, JNODE_CREATED)
25925 +
25926 +/* the node should be squeezed during flush squalloc phase */
25927 +#define jnode_squeezable(node)        JF_ISSET (node, JNODE_SQUEEZABLE)
25928 +#define jnode_set_squeezable(node)    JF_SET (node, JNODE_SQUEEZABLE)
25929 +
25930 +/* Macros to convert from jnode to znode, znode to jnode.  These are macros
25931 +   because C doesn't allow overloading of const prototypes. */
25932 +#define ZJNODE(x) (& (x) -> zjnode)
25933 +#define JZNODE(x)                                              \
25934 +({                                                             \
25935 +       typeof (x) __tmp_x;                                     \
25936 +                                                               \
25937 +       __tmp_x = (x);                                          \
25938 +       assert ("jmacd-1300", jnode_is_znode (__tmp_x));        \
25939 +       (znode*) __tmp_x;                                       \
25940 +})
25941 +
25942 +extern int jnodes_tree_init(reiser4_tree * tree);
25943 +extern int jnodes_tree_done(reiser4_tree * tree);
25944 +
25945 +#if REISER4_DEBUG
25946 +extern int znode_is_any_locked(const znode * node);
25947 +extern void jnode_list_remove(jnode * node);
25948 +#else
25949 +#define jnode_list_remove(node) noop
25950 +#endif
25951 +
25952 +#if REISER4_DEBUG_NODE_INVARIANT
25953 +extern int jnode_invariant(const jnode * node, int tlocked, int jlocked);
25954 +#else
25955 +#define jnode_invariant(n, t, j) (1)
25956 +#endif
25957 +
25958 +#if REISER4_DEBUG_OUTPUT
25959 +extern void info_jnode(const char *prefix, const jnode * node);
25960 +extern void print_jnode(const char *prefix, const jnode * node);
25961 +extern void print_jnodes(const char *prefix, reiser4_tree * tree);
25962 +#else
25963 +#define info_jnode(p, n) noop
25964 +#define print_jnodes(p, t) noop
25965 +#define print_jnode(p, n) noop
25966 +#endif
25967 +
25968 +int znode_is_root(const znode * node) NONNULL;
25969 +
25970 +/* bump reference counter (x_count) on @node and account it with LOCK_CNT_INC(x_refs) */
25971 +static inline void
25972 +add_x_ref(jnode * node /* node to increase x_count of */ )
25973 +{
25974 +       assert("nikita-1911", node != NULL);
25975 +
25976 +       atomic_inc(&node->x_count);
25977 +       LOCK_CNT_INC(x_refs);
25978 +}
25979 +
25980 +static inline void
25981 +dec_x_ref(jnode * node /* node to decrease x_count of */ )
25982 +{
25983 +       assert("nikita-3215", node != NULL);
25984 +       assert("nikita-3216", atomic_read(&node->x_count) > 0);
25985 +
25986 +       atomic_dec(&node->x_count);
25987 +       assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
25988 +       LOCK_CNT_DEC(x_refs);   /* mirrors LOCK_CNT_INC(x_refs) in add_x_ref() */
25989 +}
25990 +
25991 +/* jref() - increase counter of references to jnode/znode (x_count) and return @node */
25992 +static inline jnode *
25993 +jref(jnode * node /* must be neither NULL nor an error pointer */ )
25994 +{
25995 +       assert("jmacd-508", (node != NULL) && !IS_ERR(node));
25996 +       add_x_ref(node);
25997 +       return node;
25998 +}
25999 +
26000 +extern int jdelete(jnode * node) NONNULL;
26001 +
26002 +/* get the page of jnode (its ->pg field); @node is not checked for NULL */
26003 +static inline struct page *
26004 +jnode_page(const jnode * node)
26005 +{
26006 +       return node->pg;
26007 +}
26008 +
26009 +/* return pointer to jnode data (its ->data field); asserts that jnode_page() is not NULL */
26010 +static inline char *
26011 +jdata(const jnode * node)
26012 +{
26013 +       assert("nikita-1415", node != NULL);
26014 +       assert("nikita-3198", jnode_page(node) != NULL);
26015 +       return node->data;
26016 +}
26017 +
26018 +static inline int
26019 +jnode_is_loaded(const jnode * node /* jnode to query */ )
26020 +{
26021 +       assert("zam-506", node != NULL);
26022 +       return atomic_read(&node->d_count) > 0; /* loaded while d_count is positive */
26023 +}
26024 +
26025 +extern void page_detach_jnode(struct page *page,
26026 +                             struct address_space *mapping,
26027 +                             unsigned long index) NONNULL;
26028 +extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
26029 +
26030 +static inline void
26031 +jnode_set_reloc(jnode * node /* jnode to mark with JNODE_RELOC */ )
26032 +{
26033 +       assert("nikita-2431", node != NULL);
26034 +       assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));    /* must not already be marked JNODE_OVRWR */
26035 +       JF_SET(node, JNODE_RELOC);
26036 +}
26037 +
26038 +/* bump data counter (d_count) on @node and account it with LOCK_CNT_INC(d_refs) */
26039 +static inline void add_d_ref(jnode * node /* node to increase d_count of */ )
26040 +{
26041 +       assert("nikita-1962", node != NULL);
26042 +
26043 +       atomic_inc(&node->d_count);
26044 +       LOCK_CNT_INC(d_refs);
26045 +}
26046 +
26047 +
26048 +/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */
26049 +
26050 +extern int jload_gfp(jnode * node, int gfp, int do_kmap) NONNULL;
26051 +
26052 +static inline int jload(jnode * node /* jnode to load */ )
26053 +{
26054 +       return jload_gfp(node, GFP_KERNEL, 1);  /* jload_gfp() with gfp == GFP_KERNEL and do_kmap == 1 */
26055 +}
26056 +
26057 +extern int jinit_new(jnode * node, int gfp_flags) NONNULL;
26058 +extern int jstartio(jnode * node) NONNULL;
26059 +
26060 +extern void jdrop(jnode * node) NONNULL;
26061 +extern int jwait_io(jnode * node, int rw) NONNULL;
26062 +
26063 +extern void jload_prefetch(const jnode * node);
26064 +
26065 +extern jnode *alloc_io_head(const reiser4_block_nr * block) NONNULL;
26066 +extern void drop_io_head(jnode * node) NONNULL;
26067 +
26068 +static inline reiser4_tree *
26069 +jnode_get_tree(const jnode * node /* jnode to query */ )
26070 +{
26071 +       assert("nikita-2691", node != NULL);
26072 +       return node->tree;      /* value of the ->tree field of @node */
26073 +}
26074 +
26075 +extern void pin_jnode_data(jnode *);
26076 +extern void unpin_jnode_data(jnode *);
26077 +
26078 +static inline jnode_type
26079 +jnode_get_type(const jnode * node /* jnode whose type bits are decoded */ )
26080 +{
26081 +       static const unsigned long state_mask =
26082 +           (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3);
26083 +
26084 +       static const jnode_type mask_to_type[] = {      /* read-only lookup table */
26085 +               /*  JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */
26086 +
26087 +               /* 000 */
26088 +               [0] = JNODE_FORMATTED_BLOCK,
26089 +               /* 001 */
26090 +               [1] = JNODE_UNFORMATTED_BLOCK,
26091 +               /* 010 */
26092 +               [2] = JNODE_BITMAP,
26093 +               /* 011 */
26094 +               [3] = LAST_JNODE_TYPE,  /*invalid */
26095 +               /* 100 */
26096 +               [4] = JNODE_INODE,
26097 +               /* 101 */
26098 +               [5] = LAST_JNODE_TYPE,  /* invalid */
26099 +               /* 110 */
26100 +               [6] = JNODE_IO_HEAD,
26101 +               /* 111 */
26102 +               [7] = LAST_JNODE_TYPE,  /* invalid */
26103 +       };
26104 +
26105 +       return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];        /* 3 type bits index the table */
26106 +}
26107 +
26108 +/* returns true if node is a znode, that is, its type is JNODE_FORMATTED_BLOCK */
26109 +static inline int
26110 +jnode_is_znode(const jnode * node)
26111 +{
26112 +       return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
26113 +}
26114 +
26115 +/* return true if "node" is dirty (JNODE_DIRTY is set). Asserts that the jnode spin lock is held or that znode_is_any_locked() is true for a znode */
26116 +static inline int
26117 +jnode_is_dirty(const jnode * node)
26118 +{
26119 +       assert("nikita-782", node != NULL);
26120 +       assert("jmacd-1800", spin_jnode_is_locked(node) || (jnode_is_znode(node) && znode_is_any_locked(JZNODE(node))));
26121 +       return JF_ISSET(node, JNODE_DIRTY);
26122 +}
26123 +
26124 +/* return true if "node" is dirty; asserts that node is not spin locked and evaluates jnode_is_dirty() through UNDER_SPIN() */
26125 +static inline int
26126 +jnode_check_dirty(jnode * node)
26127 +{
26128 +       assert("jmacd-7798", node != NULL);
26129 +       assert("jmacd-7799", spin_jnode_is_not_locked(node));
26130 +       return UNDER_SPIN(jnode, node, jnode_is_dirty(node));
26131 +}
26132 +
26133 +static inline int
26134 +jnode_is_flushprepped(const jnode * node /* jnode to query, asserted to be spin locked */ )
26135 +{
26136 +       assert("jmacd-78212", node != NULL);
26137 +       assert("jmacd-71276", spin_jnode_is_locked(node));
26138 +       return !jnode_is_dirty(node) || JF_ISSET(node, JNODE_RELOC)     /* not dirty, or JNODE_RELOC set, ... */
26139 +           || JF_ISSET(node, JNODE_OVRWR);                             /* ... or JNODE_OVRWR set */
26140 +}
26141 +
26142 +/* Return true if @node has already been processed by the squeeze and allocate
26143 +   process.  This implies the block address has been finalized for the
26144 +   duration of this atom (or it is clean and will remain in place).  If this
26145 +   returns true you may use the block number as a hint.  Asserts that @node is not spin locked on entry. */
26146 +static inline int
26147 +jnode_check_flushprepped(jnode * node)
26148 +{
26149 +       /* It must be clean or relocated or wandered.  New allocations are set to relocate. */
26150 +       assert("jmacd-71275", spin_jnode_is_not_locked(node));
26151 +       return UNDER_SPIN(jnode, node, jnode_is_flushprepped(node));
26152 +}
26153 +
26154 +/* returns true if node is unformatted, that is, its type is JNODE_UNFORMATTED_BLOCK */
26155 +static inline int
26156 +jnode_is_unformatted(const jnode * node)
26157 +{
26158 +       assert("jmacd-0123", node != NULL);
26159 +       return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
26160 +}
26161 +
26162 +/* returns true if node represents a cluster cache page (JNODE_CLUSTER_PAGE bit is set) */
26163 +static inline int
26164 +jnode_is_cluster_page(const jnode * node)
26165 +{
26166 +       assert("edward-50", node != NULL);
26167 +       return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
26168 +}
26169 +
26170 +/* returns true if node is builtin inode's jnode, that is, its type is JNODE_INODE */
26171 +static inline int
26172 +jnode_is_inode(const jnode * node)
26173 +{
26174 +       assert("vs-1240", node != NULL);
26175 +       return jnode_get_type(node) == JNODE_INODE;
26176 +}
26177 +
26178 +static inline jnode_plugin *
26179 +jnode_ops_of(const jnode_type type /* jnode type, asserted to be below LAST_JNODE_TYPE */ )
26180 +{
26181 +       assert("nikita-2367", type < LAST_JNODE_TYPE);
26182 +       return jnode_plugin_by_id((reiser4_plugin_id) type);    /* type value doubles as plugin id */
26183 +}
26184 +
26185 +static inline jnode_plugin *
26186 +jnode_ops(const jnode * node /* jnode whose plugin is looked up by its type */ )
26187 +{
26188 +       assert("nikita-2366", node != NULL);
26189 +
26190 +       return jnode_ops_of(jnode_get_type(node));
26191 +}
26192 +
26193 +/* Get the index of a block, as reported by the ->index() method of the jnode plugin. */
26194 +static inline unsigned long
26195 +jnode_get_index(jnode * node)
26196 +{
26197 +       return jnode_ops(node)->index(node);
26198 +}
26199 +
26200 +/* return true if "node" is the root: a znode for which znode_is_root() is true */
26201 +static inline int
26202 +jnode_is_root(const jnode * node)
26203 +{
26204 +       return jnode_is_znode(node) && znode_is_root(JZNODE(node));
26205 +}
26206 +
26207 +extern struct address_space * mapping_jnode(const jnode * node);
26208 +extern unsigned long index_jnode(const jnode * node);
26209 +
26210 +extern int jnode_try_drop(jnode * node);
26211 +
26212 +static inline void jput(jnode * node);
26213 +extern void jput_final(jnode * node);
26214 +
26215 +#if REISER4_STATS
26216 +extern void reiser4_stat_inc_at_level_jput(const jnode * node);
26217 +extern void reiser4_stat_inc_at_level_jputlast(const jnode * node);
26218 +#else
26219 +#define reiser4_stat_inc_at_level_jput(node) noop
26220 +#define reiser4_stat_inc_at_level_jputlast(node) noop
26221 +#endif
26222 +
26223 +/* jput() - decrement x_count reference counter on jnode/znode; jput_final() is called when it reaches 0.
26224 +
26225 +   Count may drop to 0, jnode stays in cache until memory pressure causes the
26226 +   eviction of its page. The c_count variable also ensures that children are
26227 +   pressured out of memory before the parent. The jnode remains hashed as
26228 +   long as the VM allows its page to stay in memory.
26229 +*/
26230 +static inline void
26231 +jput(jnode * node)
26232 +{
26233 +       trace_stamp(TRACE_ZNODES);
26234 +
26235 +       assert("jmacd-509", node != NULL);
26236 +       assert("jmacd-510", atomic_read(&node->x_count) > 0);
26237 +       assert("nikita-3065", spin_jnode_is_not_locked(node));
26238 +       assert("zam-926", schedulable());
26239 +       LOCK_CNT_DEC(x_refs);
26240 +
26241 +       reiser4_stat_inc_at_level_jput(node);
26242 +       rcu_read_lock();
26243 +       /* we don't need any kind of lock here--jput_final() uses RCU.
26244 +        * NOTE(review): the last-reference branch does not call rcu_read_unlock() here; presumably jput_final() does.
26245 +        */
26246 +       if (unlikely(atomic_dec_and_test(&node->x_count))) {
26247 +               reiser4_stat_inc_at_level_jputlast(node);
26248 +               jput_final(node);
26249 +       } else
26250 +               rcu_read_unlock();
26251 +       assert("nikita-3473", schedulable());
26252 +}
26253 +
26254 +extern void jrelse(jnode * node);
26255 +extern void jrelse_tail(jnode * node);
26256 +
26257 +extern jnode *jnode_rip_sync(reiser4_tree *t, jnode * node);
26258 +
26259 +/* resolve race with jput: returns result of jnode_rip_sync() when JNODE_RIP is set, @node itself otherwise */
26260 +static inline jnode *
26261 +jnode_rip_check(reiser4_tree *tree, jnode * node)
26262 +{
26263 +       if (unlikely(JF_ISSET(node, JNODE_RIP)))
26264 +               node = jnode_rip_sync(tree, node);
26265 +       return node;
26266 +}
26267 +
26268 +extern reiser4_key * jnode_build_key(const jnode * node, reiser4_key * key);
26269 +
26270 +/* __JNODE_H__ */
26271 +#endif
26272 +
26273 +/* Make Linus happy.
26274 +   Local variables:
26275 +   c-indentation-style: "K&R"
26276 +   mode-name: "LC"
26277 +   c-basic-offset: 8
26278 +   tab-width: 8
26279 +   fill-column: 120
26280 +   End:
26281 +*/
26282 diff -rupN linux-2.6.8-rc3/fs/reiser4/kassign.c linux-2.6.8-rc3-a/fs/reiser4/kassign.c
26283 --- linux-2.6.8-rc3/fs/reiser4/kassign.c        1970-01-01 03:00:00.000000000 +0300
26284 +++ linux-2.6.8-rc3-a/fs/reiser4/kassign.c      2004-08-05 21:20:53.039669707 +0400
26285 @@ -0,0 +1,781 @@
26286 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
26287 + * reiser4/README */
26288 +
26289 +/* Key assignment policy implementation */
26290 +
26291 +/*
26292 + * In reiser4 every piece of file system data and meta-data has a key. Keys
26293 + * are used to store information in and retrieve it from reiser4 internal
26294 + * tree. In addition to this, keys define _ordering_ of all file system
26295 + * information: things having close keys are placed into the same or
26296 + * neighboring (in the tree order) nodes of the tree. As our block allocator
26297 + * tries to respect tree order (see flush.c), keys also define order in which
26298 + * things are laid out on the disk, and hence, affect performance directly.
26299 + *
26300 + * Obviously, assignment of keys to data and meta-data should be consistent
26301 + * across whole file system. Algorithm that calculates a key for a given piece
26302 + * of data or meta-data is referred to as "key assignment".
26303 + *
26304 + * Key assignment is too expensive to be implemented as a plugin (that is,
26305 + * with an ability to support different key assignment schemas in the same
26306 + * compiled kernel image). As a compromise, all key-assignment functions and
26307 + * data-structures are collected in this single file, so that modifications to
26308 + * key assignment algorithm can be localized. Additional changes may be
26309 + * required in key.[ch].
26310 + *
26311 + * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
26312 + * may guess, there is "Plan B" too.
26313 + *
26314 + */
26315 +
26316 +/*
26317 + * Additional complication with key assignment implementation is a requirement
26318 + * to support different key length.
26319 + */
26320 +
26321 +/*
26322 + *                   KEY ASSIGNMENT: PLAN A, LONG KEYS.
26323 + *
26324 + * DIRECTORY ITEMS
26325 + *
26326 + *  |       60     | 4 | 7 |1|   56        |        64        |        64       |
26327 + *  +--------------+---+---+-+-------------+------------------+-----------------+
26328 + *  |    dirid     | 0 | F |H|  prefix-1   |    prefix-2      |  prefix-3/hash  |
26329 + *  +--------------+---+---+-+-------------+------------------+-----------------+
26330 + *  |                  |                   |                  |                 |
26331 + *  |    8 bytes       |      8 bytes      |     8 bytes      |     8 bytes     |
26332 + *
26333 + * dirid         objectid of directory this item is for
26334 + *
26335 + * F             fibration, see fs/reiser4/plugin/fibration.[ch]
26336 + *
26337 + * H             1 if last 8 bytes of the key contain hash,
26338 + *               0 if last 8 bytes of the key contain prefix-3
26339 + *
26340 + * prefix-1      first 7 characters of file name.
26341 + *               Padded by zeroes if name is not long enough.
26342 + *
26343 + * prefix-2      next 8 characters of the file name.
26344 + *
26345 + * prefix-3      next 8 characters of the file name.
26346 + *
26347 + * hash          hash of the rest of file name (i.e., portion of file
26348 + *               name not included into prefix-1 and prefix-2).
26349 + *
26350 + * File names of at most 23 (== 7 + 8 + 8) characters are completely encoded
26351 + * in the key. Such file names are called "short". They are distinguished by H
26352 + * bit set to 0 in the key.
26353 + *
26354 + * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
26355 + * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
26356 + * key. Last 8 bytes of the key are occupied by hash of the remaining
26357 + * characters of the name.
26358 + *
26359 + * This key assignment reaches following important goals:
26360 + *
26361 + *     (1) directory entries are sorted in approximately lexicographical
26362 + *     order.
26363 + *
26364 + *     (2) collisions (when multiple directory items have the same key), while
26365 + *     principally unavoidable in a tree with fixed length keys, are rare.
26366 + *
26367 + * STAT DATA
26368 + *
26369 + *  |       60     | 4 |       64        | 4 |     60       |        64       |
26370 + *  +--------------+---+-----------------+---+--------------+-----------------+
26371 + *  |  locality id | 1 |    ordering     | 0 |  objectid    |        0        |
26372 + *  +--------------+---+-----------------+---+--------------+-----------------+
26373 + *  |                  |                 |                  |                 |
26374 + *  |    8 bytes       |    8 bytes      |     8 bytes      |     8 bytes     |
26375 + *
26376 + * locality id     object id of a directory where first name was created for
26377 + *                 the object
26378 + *
26379 + * ordering        copy of second 8-byte portion of the key of directory
26380 + *                 entry for the first name of this object. Ordering has a form
26381 + *                         {
26382 + *                                 fibration :7;
26383 + *                                 h         :1;
26384 + *                                 prefix1   :56;
26385 + *                         }
26386 + *                 see description of key for directory entry above.
26387 + *
26388 + * objectid        object id for this object
26389 + *
26390 + * This key assignment policy is designed to keep stat-data in the same order
26391 + * as corresponding directory items, thus speeding up readdir/stat types of
26392 + * workload.
26393 + *
26394 + * FILE BODY
26395 + *
26396 + *  |       60     | 4 |       64        | 4 |     60       |        64       |
26397 + *  +--------------+---+-----------------+---+--------------+-----------------+
26398 + *  |  locality id | 4 |    ordering     | 0 |  objectid    |      offset     |
26399 + *  +--------------+---+-----------------+---+--------------+-----------------+
26400 + *  |                  |                 |                  |                 |
26401 + *  |    8 bytes       |    8 bytes      |     8 bytes      |     8 bytes     |
26402 + *
26403 + * locality id     object id of a directory where first name was created for
26404 + *                 the object
26405 + *
26406 + * ordering        the same as in the key of stat-data for this object
26407 + *
26408 + * objectid        object id for this object
26409 + *
26410 + * offset          logical offset from the beginning of this file.
26411 + *                 Measured in bytes.
26412 + *
26413 + *
26414 + *                   KEY ASSIGNMENT: PLAN A, SHORT KEYS.
26415 + *
26416 + * DIRECTORY ITEMS
26417 + *
26418 + *  |       60     | 4 | 7 |1|   56        |        64       |
26419 + *  +--------------+---+---+-+-------------+-----------------+
26420 + *  |    dirid     | 0 | F |H|  prefix-1   |  prefix-2/hash  |
26421 + *  +--------------+---+---+-+-------------+-----------------+
26422 + *  |                  |                   |                 |
26423 + *  |    8 bytes       |      8 bytes      |     8 bytes     |
26424 + *
26425 + * dirid         objectid of directory this item is for
26426 + *
26427 + * F             fibration, see fs/reiser4/plugin/fibration.[ch]
26428 + *
26429 + * H             1 if last 8 bytes of the key contain hash,
26430 + *               0 if last 8 bytes of the key contain prefix-2
26431 + *
26432 + * prefix-1      first 7 characters of file name.
26433 + *               Padded by zeroes if name is not long enough.
26434 + *
26435 + * prefix-2      next 8 characters of the file name.
26436 + *
26437 + * hash          hash of the rest of file name (i.e., portion of file
26438 + *               name not included into prefix-1).
26439 + *
26440 + * File names of at most 15 (== 7 + 8) characters are completely encoded in
26441 + * the key. Such file names are called "short". They are distinguished by H
26442 + * bit set to 0 in the key.
26443 + *
26444 + * Other file names are "long". For long name, H bit is 1, and first 7
26445 + * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
26446 + * key are occupied by hash of the remaining characters of the name.
26447 + *
26448 + * STAT DATA
26449 + *
26450 + *  |       60     | 4 | 4 |     60       |        64       |
26451 + *  +--------------+---+---+--------------+-----------------+
26452 + *  |  locality id | 1 | 0 |  objectid    |        0        |
26453 + *  +--------------+---+---+--------------+-----------------+
26454 + *  |                  |                  |                 |
26455 + *  |    8 bytes       |     8 bytes      |     8 bytes     |
26456 + *
26457 + * locality id     object id of a directory where first name was created for
26458 + *                 the object
26459 + *
26460 + * objectid        object id for this object
26461 + *
26462 + * FILE BODY
26463 + *
26464 + *  |       60     | 4 | 4 |     60       |        64       |
26465 + *  +--------------+---+---+--------------+-----------------+
26466 + *  |  locality id | 4 | 0 |  objectid    |      offset     |
26467 + *  +--------------+---+---+--------------+-----------------+
26468 + *  |                  |                  |                 |
26469 + *  |    8 bytes       |     8 bytes      |     8 bytes     |
26470 + *
26471 + * locality id     object id of a directory where first name was created for
26472 + *                 the object
26473 + *
26474 + * objectid        object id for this object
26475 + *
26476 + * offset          logical offset from the beginning of this file.
26477 + *                 Measured in bytes.
26478 + *
26479 + *
26480 + */
26481 +
26482 +#include "debug.h"
26483 +#include "key.h"
26484 +#include "kassign.h"
26485 +#include "vfs_ops.h"
26486 +#include "inode.h"
26487 +#include "super.h"
26488 +#include "dscale.h"
26489 +
26490 +#include <linux/types.h>       /* for __u??  */
26491 +#include <linux/fs.h>          /* for struct super_block, etc  */
26492 +
26493 +#if REISER4_LARGE_KEY
26494 +#define ORDERING_CHARS (sizeof(__u64) - 1)
26495 +#define OID_CHARS (sizeof(__u64))
26496 +#else
26497 +#define ORDERING_CHARS (0)
26498 +#define OID_CHARS (sizeof(__u64) - 1)
26499 +#endif
26500 +
26501 +#define OFFSET_CHARS (sizeof(__u64))
26502 +
26503 +#define INLINE_CHARS (ORDERING_CHARS + OID_CHARS)
26504 +
26505 +/* bitmask for H bit (see comment at the beginning of this file) */
26506 +static const __u64 longname_mark =  0x0100000000000000ull;
26507 +/* bitmask for F and H portions of the key. */
26508 +static const __u64 fibration_mask = 0xff00000000000000ull;
26509 +
26510 +/* return 1 if H bit (longname_mark) is set in @key, i.e., name is not completely encoded in @key; 0 otherwise */
26511 +reiser4_internal int
26512 +is_longname_key(const reiser4_key *key)
26513 +{
26514 +       __u64 highpart; /* key element that carries the H bit */
26515 +
26516 +       assert("nikita-2863", key != NULL);
26517 +       if (get_key_type(key) != KEY_FILE_NAME_MINOR)
26518 +               print_key("oops", key); /* print the key before the assertion below */
26519 +       assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
26520 +
26521 +       if (REISER4_LARGE_KEY)
26522 +               highpart = get_key_ordering(key);
26523 +       else
26524 +               highpart = get_key_objectid(key);
26525 +
26526 +       return (highpart & longname_mark) ? 1 : 0;
26527 +}
26528 +
26529 +/* return true if @name is too long to be completely encoded in the key: @len exceeds ORDERING_CHARS + OID_CHARS + OFFSET_CHARS */
26530 +reiser4_internal int
26531 +is_longname(const char *name UNUSED_ARG, int len)
26532 +{
26533 +       return len > ORDERING_CHARS + OID_CHARS + OFFSET_CHARS; /* NOTE(review): sizeof makes this unsigned; negative @len compares as huge */
26534 +}
26535 +
26536 +/* code ascii string into __u64.
26537 +
26538 +   Put characters of @name into result (@str) one after another starting
26539 +   from @start_idx-th highest (arithmetically) byte. This produces
26540 +   endian-safe encoding. memcpy(3) will not do. Encoding stops at the first
26541 +   NUL in @name or when the result is full; unused low bytes are zero.
26542 +*/
26543 +static __u64
26544 +pack_string(const char *name /* string to encode */ ,
26545 +           int start_idx       /* highest byte in result from
26546 +                                * which to start encoding */ )
26547 +{
26548 +       unsigned i;
26549 +       __u64 str = 0;
26550 +
26551 +       for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
26552 +               str <<= 8;
26553 +               str |= (unsigned char) name[i];
26554 +       }
26555 +       if (i != 0)     /* nothing packed: @str is 0 and a shift by 64 bits would be undefined */
26556 +               str <<= (sizeof str - i - start_idx) << 3;
26557 +       return str;
26558 +}
26559 +
26560 +#if !REISER4_DEBUG_OUTPUT
26561 +static
26562 +#endif
26563 +/* opposite to pack_string(). Takes value produced by pack_string(), restores string encoded in it and stores result,
26564 + * NUL-terminated, in @buf. Returns pointer to the terminating NUL, so that calls can be chained. */
26565 +reiser4_internal char *
26566 +unpack_string(__u64 value, char *buf)
26567 +{
26568 +       do {
26569 +               *buf = value >> (64 - 8); /* take the highest byte */
26570 +               if (*buf)               /* zero bytes are not kept: the next store overwrites them */
26571 +                       ++ buf;
26572 +               value <<= 8;
26573 +       } while(value != 0);            /* stop once no non-zero bytes remain */
26574 +       *buf = 0;
26575 +       return buf;
26576 +}
26577 +
26578 +/* obtain name encoded in @key and store it, NUL-terminated, in @buf. Returns @buf. @key must not be a long-name key. */
26579 +reiser4_internal char *
26580 +extract_name_from_key(const reiser4_key *key, char *buf)
26581 +{
26582 +       char *c;
26583 +
26584 +       assert("nikita-2868", !is_longname_key(key));
26585 +       /* F/H byte is masked out of the first name-bearing element; elements are then unpacked in key order */
26586 +       c = buf;
26587 +       if (REISER4_LARGE_KEY) {
26588 +               c = unpack_string(get_key_ordering(key) & ~fibration_mask, c);
26589 +               c = unpack_string(get_key_fulloid(key), c);
26590 +       } else
26591 +               c = unpack_string(get_key_fulloid(key) & ~fibration_mask, c);
26592 +       unpack_string(get_key_offset(key), c);
26593 +       return buf;
26594 +}
26595 +
26596 +/* build key for directory entry.
26597 +   ->build_entry_key() for directory plugin */
26598 +reiser4_internal void
26599 +build_entry_key_common(const struct inode *dir /* directory where entry is
26600 +                                                * (or will be) in.*/ ,
26601 +                      const struct qstr *qname /* name of file referenced
26602 +                                                * by this entry */ ,
26603 +                      reiser4_key * result     /* resulting key of directory
26604 +                                                * entry */ )
26605 +{
26606 +       __u64 ordering;
26607 +       __u64 objectid;
26608 +       __u64 offset;
26609 +       const char *name;
26610 +       int len;
26611 +
26612 +#if REISER4_LARGE_KEY
26613 +#define second_el ordering
26614 +#else
26615 +#define second_el objectid
26616 +#endif
26617 +
26618 +       assert("nikita-1139", dir != NULL);
26619 +       assert("nikita-1140", qname != NULL);
26620 +       assert("nikita-1141", qname->name != NULL);
26621 +       assert("nikita-1142", result != NULL);
26622 +
26623 +       name = qname->name;
26624 +       len  = qname->len;
26625 +
26626 +       assert("nikita-2867", strlen(name) == len);
26627 +
26628 +       key_init(result);
26629 +       /* locality of directory entry's key is objectid of parent
26630 +          directory */
26631 +       set_key_locality(result, get_inode_oid(dir));
26632 +       /* minor packing locality is constant */
26633 +       set_key_type(result, KEY_FILE_NAME_MINOR);
26634 +       /* dot is special case---we always want it to be first entry in
26635 +          a directory. Actually, we just want to have smallest
26636 +          directory entry.
26637 +       */
26638 +       if (len == 1 && name[0] == '.')
26639 +               return;
26640 +
26641 +       /* This is our brand new proposed key allocation algorithm for
26642 +          directory entries:
26643 +
26644 +          If name is shorter than 7 + 8 = 15 characters, put first 7
26645 +          characters into objectid field and remaining characters (if
26646 +          any) into offset field. Dream long dreamt came true: file
26647 +          name as a key!
26648 +
26649 +          If file name is longer than 15 characters, put first 7
26650 +          characters into objectid and hash of remaining characters
26651 +          into offset field.
26652 +
26653 +          To distinguish above cases, in latter set up unused high bit
26654 +          in objectid field.
26655 +
26656 +
26657 +          With large keys (REISER4_LARGE_KEY) algorithm is updated
26658 +          appropriately.
26659 +       */
26660 +
26661 +       /* objectid of key is composed of seven first characters of
26662 +          file's name. This imposes global ordering on directory
26663 +          entries.
26664 +       */
26665 +       second_el = pack_string(name, 1);
26666 +       if (REISER4_LARGE_KEY) {
26667 +               if (len > ORDERING_CHARS)
26668 +                       objectid = pack_string(name + ORDERING_CHARS, 0);
26669 +               else
26670 +                       objectid = 0ull;
26671 +       }
26672 +
26673 +       if (!is_longname(name, len)) {
26674 +               if (len > INLINE_CHARS)
26675 +                       offset = pack_string(name + INLINE_CHARS, 0);
26676 +               else
26677 +                       offset = 0ull;
26678 +       } else {
26679 +               /* note in a key the fact that offset contains hash. */
26680 +               second_el |= longname_mark;
26681 +
26682 +               /* offset is the hash of the file name. */
26683 +               offset = inode_hash_plugin(dir)->hash(name + INLINE_CHARS,
26684 +                                                     len - INLINE_CHARS);
26685 +       }
26686 +
26687 +       assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
26688 +       second_el |= inode_fibration_plugin(dir)->fibre(dir, name, len);
26689 +
26690 +       if (REISER4_LARGE_KEY) {
26691 +               set_key_ordering(result, ordering);
26692 +               set_key_fulloid(result, objectid);
26693 +       } else {
26694 +               /* objectid is 60 bits */
26695 +               assert("nikita-1405", !(objectid & ~KEY_OBJECTID_MASK));
26696 +               set_key_objectid(result, objectid);
26697 +       }
26698 +       set_key_offset(result, offset);
26699 +       return;
26700 +}
26701 +
26702 +/* build key for directory entry.
26703 +   ->build_entry_key() for directory plugin
26704 +
26705 +   This is for directories where we want repeatable and restartable readdir()
26706 +   even in case of 32bit user level struct dirent (readdir(3)): the name is represented by 31 bits of its hash only.
26707 +*/
26708 +reiser4_internal void
26709 +build_entry_key_stable_entry(const struct inode *dir   /* directory where
26710 +                                                        * entry is (or
26711 +                                                        * will be) in. */ ,
26712 +                            const struct qstr *name    /* name of file
26713 +                                                        * referenced by
26714 +                                                        * this entry */ ,
26715 +                            reiser4_key * result       /* resulting key of
26716 +                                                        * directory entry */ )
26717 +{
26718 +       oid_t objectid;
26719 +
26720 +       assert("nikita-2283", dir != NULL);
26721 +       assert("nikita-2284", name != NULL);
26722 +       assert("nikita-2285", name->name != NULL);
26723 +       assert("nikita-2286", result != NULL);
26724 +
26725 +       key_init(result);
26726 +       /* locality of directory entry's key is objectid of parent
26727 +          directory */
26728 +       set_key_locality(result, get_inode_oid(dir));
26729 +       /* minor packing locality is constant */
26730 +       set_key_type(result, KEY_FILE_NAME_MINOR);
26731 +       /* dot is special case---we always want it to be first entry in
26732 +          a directory. Actually, we just want to have smallest
26733 +          directory entry.
26734 +       */
26735 +       if ((name->len == 1) && (name->name[0] == '.'))
26736 +               return;
26737 +
26738 +       /* objectid of key is 31 lowest bits of hash. */
26739 +       objectid = inode_hash_plugin(dir)->hash(name->name, (int) name->len) & 0x7fffffff;
26740 +
26741 +       assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK));
26742 +       set_key_objectid(result, objectid);
26743 +
26744 +       /* offset is always 0. */
26745 +       set_key_offset(result, (__u64) 0);
26746 +       return;
26747 +}
26748 +
26749 +/* build key to be used by ->readdir() method.
26750 +
26751 +   See reiser4_readdir() for more detailed comment.
26752 +   Common implementation of dir plugin's method build_readdir_key: the key is restored from the readdir position kept
26753 +   in private data of @dir. Returns 0, or error code if that private data can not be obtained. */
26754 +reiser4_internal int
26755 +build_readdir_key_common(struct file *dir /* directory being read */ ,
26756 +                        reiser4_key * result /* where to store key */ )
26757 +{
26758 +       reiser4_file_fsdata *fdata;
26759 +       struct inode *inode;
26760 +
26761 +       assert("nikita-1361", dir != NULL);
26762 +       assert("nikita-1362", result != NULL);
26763 +       assert("nikita-1363", dir->f_dentry != NULL);
26764 +       inode = dir->f_dentry->d_inode;
26765 +       assert("nikita-1373", inode != NULL);
26766 +
26767 +       fdata = reiser4_get_file_fsdata(dir);
26768 +       if (IS_ERR(fdata))
26769 +               return PTR_ERR(fdata);
26770 +       assert("nikita-1364", fdata != NULL);
26771 +       return extract_key_from_de_id(get_inode_oid(inode), &fdata->dir.readdir.position.dir_entry_key, result);
26772 +
26773 +}
26774 +
26775 +/* true, if @key is the key of ".", that is, if ordering, objectid and offset of @key are all zero */
26776 +reiser4_internal int
26777 +is_dot_key(const reiser4_key * key /* key to check */ )
26778 +{
26779 +       assert("nikita-1717", key != NULL);
26780 +       assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
26781 +       return
26782 +               (get_key_ordering(key) == 0ull) &&
26783 +               (get_key_objectid(key) == 0ull) &&
26784 +               (get_key_offset(key) == 0ull);
26785 +}
26786 +
26787 +/* build key for stat-data.
26788 +
26789 +   return key of stat-data of this object. This should become sd plugin
26790 +   method in the future. For now, let it be here.
26791 +   @result is filled from locality, ordering and oid of @target, with type KEY_SD_MINOR and zero offset.
26792 +*/
26793 +reiser4_internal reiser4_key *
26794 +build_sd_key(const struct inode * target /* inode of an object */ ,
26795 +            reiser4_key * result       /* resulting key of @target
26796 +                                          stat-data */ )
26797 +{
26798 +       assert("nikita-261", result != NULL);
26799 +
26800 +       key_init(result);
26801 +       set_key_locality(result, reiser4_inode_data(target)->locality_id);
26802 +       set_key_ordering(result, get_inode_ordering(target));
26803 +       set_key_objectid(result, get_inode_oid(target));
26804 +       set_key_type(result, KEY_SD_MINOR);
26805 +       set_key_offset(result, (__u64) 0);
26806 +       return result;
26807 +}
26808 +
26809 +/* encode part of key into &obj_key_id
26810 +
26811 +   This encodes into @id part of @key sufficient to restore @key later,
26812 +   given that latter is key of object (key of stat-data).
26813 +   The leading sizeof(*id) bytes of @key are copied verbatim. Always returns 0.
26814 +   See &obj_key_id
26815 +*/
26816 +reiser4_internal int
26817 +build_obj_key_id(const reiser4_key * key /* key to encode */ ,
26818 +                obj_key_id * id /* id where key is encoded in */ )
26819 +{
26820 +       assert("nikita-1151", key != NULL);
26821 +       assert("nikita-1152", id != NULL);
26822 +
26823 +       xmemcpy(id, key, sizeof *id);
26824 +       return 0;
26825 +}
26826 +
26827 +/* encode reference to @obj in @id.
26828 +
26829 +   This is like build_obj_key_id() above, but takes inode as parameter: stat-data key of @obj is built first. Returns 0. */
26830 +reiser4_internal int
26831 +build_inode_key_id(const struct inode *obj /* object to build key of */ ,
26832 +                  obj_key_id * id /* result */ )
26833 +{
26834 +       reiser4_key sdkey;
26835 +
26836 +       assert("nikita-1166", obj != NULL);
26837 +       assert("nikita-1167", id != NULL);
26838 +
26839 +       build_sd_key(obj, &sdkey);
26840 +       build_obj_key_id(&sdkey, id);
26841 +       return 0;
26842 +}
26843 +
26844 +/* decode @id back into @key
26845 +
26846 +   Restore key of object stat-data from @id. This is dual to
26847 +   build_obj_key_id() above. Part of @key not covered by @id is left as key_init() set it. Always returns 0.
26848 +*/
26849 +reiser4_internal int
26850 +extract_key_from_id(const obj_key_id * id      /* object key id to extract key
26851 +                                                * from */ ,
26852 +                   reiser4_key * key /* result */ )
26853 +{
26854 +       assert("nikita-1153", id != NULL);
26855 +       assert("nikita-1154", key != NULL);
26856 +
26857 +       key_init(key);
26858 +       xmemcpy(key, id, sizeof *id);
26859 +       return 0;
26860 +}
26861 +
26862 +/* extract objectid of directory from key of directory entry within said
26863 +   directory: it is the locality of the entry's key.
26864 +   */
26865 +reiser4_internal oid_t
26866 +extract_dir_id_from_key(const reiser4_key * de_key     /* key of
26867 +                                                        * directory
26868 +                                                        * entry */ )
26869 +{
26870 +       assert("nikita-1314", de_key != NULL);
26871 +       return get_key_locality(de_key);
26872 +}
26873 +
26874 +/* encode into @id key of directory entry.
26875 +
26876 +   Encode into @id information sufficient to later distinguish directory
26877 +   entries within the same directory. This is not whole key, because all
26878 +   directory entries within directory item share locality which is equal
26879 +   to objectid of their directory.
26880 +   Full key is built by ->build_entry_key() of directory plugin of @dir, then reduced by build_de_id_by_key().
26881 +*/
26882 +reiser4_internal int
26883 +build_de_id(const struct inode *dir /* inode of directory */ ,
26884 +           const struct qstr *name     /* name stored in the
26885 +                                        * directory entry being
26886 +                                        * constructed */ ,
26887 +           de_id * id /* short key of directory entry */ )
26888 +{
26889 +       reiser4_key key;
26890 +
26891 +       assert("nikita-1290", dir != NULL);
26892 +       assert("nikita-1292", id != NULL);
26893 +
26894 +       /* NOTE-NIKITA this is suboptimal. */
26895 +       inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
26896 +       return build_de_id_by_key(&key, id);
26897 +}
26898 +
26899 +/* encode into @id key of directory entry.
26900 +
26901 +   Encode into @id information sufficient to later distinguish directory
26902 +   entries within the same directory. This is not whole key, because all
26903 +   directory entries within directory item share locality which is equal
26904 +   to objectid of their directory.
26905 +   sizeof(*id) bytes following the first 64 bit element of @entry_key are copied into @id. Always returns 0.
26906 +*/
26907 +reiser4_internal int
26908 +build_de_id_by_key(const reiser4_key * entry_key       /* full key of directory
26909 +                                                        * entry */ ,
26910 +                  de_id * id /* short key of directory entry */ )
26911 +{
26912 +       xmemcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
26913 +       return 0;
26914 +}
26915 +
26916 +/* restore from @id key of directory entry.
26917 +
26918 +   Function dual to build_de_id(): given @id and locality, build full
26919 +   key of directory entry within directory item.
26920 +   Type of the resulting key is set to KEY_FILE_NAME_MINOR. Always returns 0.
26921 +*/
26922 +reiser4_internal int
26923 +extract_key_from_de_id(const oid_t locality    /* locality of directory
26924 +                                                * entry */ ,
26925 +                      const de_id * id /* directory entry id */ ,
26926 +                      reiser4_key * key /* result */ )
26927 +{
26928 +       /* no need to initialise key here: all fields are overwritten (NOTE(review): first element only via setters) */
26929 +       xmemcpy(((__u64 *) key) + 1, id, sizeof *id);
26930 +       set_key_locality(key, locality);
26931 +       set_key_type(key, KEY_FILE_NAME_MINOR);
26932 +       return 0;
26933 +}
26934 +
26935 +/* compare two &obj_key_id: both are expanded by extract_key_from_id() and the resulting keys compared by keycmp() */
26936 +reiser4_internal cmp_t
26937 +key_id_cmp(const obj_key_id * i1 /* first object key id to compare */ ,
26938 +          const obj_key_id * i2 /* second object key id to compare */ )
26939 +{
26940 +       reiser4_key k1;
26941 +       reiser4_key k2;
26942 +
26943 +       extract_key_from_id(i1, &k1);
26944 +       extract_key_from_id(i2, &k2);
26945 +       return keycmp(&k1, &k2);
26946 +}
26947 +
26948 +/* compare &obj_key_id with full key: @id is expanded by extract_key_from_id() and compared with @key by keycmp() */
26949 +reiser4_internal cmp_t
26950 +key_id_key_cmp(const obj_key_id * id /* object key id to compare */ ,
26951 +              const reiser4_key * key /* key to compare */ )
26952 +{
26953 +       reiser4_key k1;
26954 +
26955 +       extract_key_from_id(id, &k1);
26956 +       return keycmp(&k1, key);
26957 +}
26958 +
26959 +/* compare two &de_id's: both are expanded into full keys with the same (zero) locality and compared by keycmp() */
26960 +reiser4_internal cmp_t
26961 +de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
26962 +         const de_id * id2 /* second &de_id to compare */ )
26963 +{
26964 +       /* NOTE-NIKITA ugly implementation */
26965 +       reiser4_key k1;
26966 +       reiser4_key k2;
26967 +
26968 +       extract_key_from_de_id((oid_t) 0, id1, &k1);
26969 +       extract_key_from_de_id((oid_t) 0, id2, &k2);
26970 +       return keycmp(&k1, &k2);
26971 +}
26972 +
26973 +/* compare &de_id with key. Element 0 of @key is not compared: @id has no counterpart for it. */
26974 +reiser4_internal cmp_t
26975 +de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
26976 +             const reiser4_key * key /* key to compare */ )
26977 +{
26978 +       cmp_t        result;
26979 +       reiser4_key *k1;
26980 +       /* fake key pointer placed one element before @id, so that element 1 of *k1 is the start of @id; no copy is made */
26981 +       k1 = (reiser4_key *)(((unsigned long)id) - sizeof key->el[0]);
26982 +       result = KEY_DIFF_EL(k1, key, 1); /* only elements 1..3 are ever passed to KEY_DIFF_EL() with k1 */
26983 +       if (result == EQUAL_TO) {
26984 +               result = KEY_DIFF_EL(k1, key, 2);
26985 +               if (REISER4_LARGE_KEY && result == EQUAL_TO) {
26986 +                       result = KEY_DIFF_EL(k1, key, 3);
26987 +               }
26988 +       }
26989 +       return result;
26990 +}
26991 +
26992 +/* true if @key equals the key returned by ->root_dir_key() of the disk format plugin of @super; 0 if there is none */
26993 +reiser4_internal int
26994 +is_root_dir_key(const struct super_block *super /* super block to check */ ,
26995 +               const reiser4_key * key /* key to check */ )
26996 +{
26997 +       assert("nikita-1819", super != NULL);
26998 +       assert("nikita-1820", key != NULL);
26999 +       /* call disk plugin's root_dir_key method if it exists */
27000 +       if (get_super_private(super)->df_plug && get_super_private(super)->df_plug->root_dir_key)
27001 +               return keyeq(key, get_super_private(super)->df_plug->root_dir_key(super));
27002 +       return 0;
27003 +}
27004 +
27005 +/*
27006 + * return number of bytes necessary to encode @inode identity: dscale-d oid and locality, plus raw ordering if present.
27007 + */
27008 +int inode_onwire_size(const struct inode *inode)
27009 +{
27010 +       int result;
27011 +
27012 +       result  = dscale_bytes(get_inode_oid(inode));
27013 +       result += dscale_bytes(get_inode_locality(inode));
27014 +
27015 +       /*
27016 +        * ordering is large (it usually has highest bits set), so it makes
27017 +        * little sense to dscale it.
27018 +        */
27019 +       if (REISER4_LARGE_KEY)
27020 +               result += sizeof(get_inode_ordering(inode)); /* size of the returned type; the call is not evaluated */
27021 +       return result;
27022 +}
27023 +
27024 +/*
27025 + * encode @inode identity at @start: locality, oid, then (large keys) ordering. Returns address past the encoded data.
27026 + */
27027 +char *build_inode_onwire(const struct inode *inode, char *start)
27028 +{
27029 +       start += dscale_write(start, get_inode_locality(inode));
27030 +       start += dscale_write(start, get_inode_oid(inode));
27031 +
27032 +       if (REISER4_LARGE_KEY) {
27033 +               cputod64(get_inode_ordering(inode), (d64 *)start); /* stored whole, not dscale-d */
27034 +               start += sizeof(get_inode_ordering(inode));
27035 +       }
27036 +       return start;
27037 +}
27038 +
27039 +/*
27040 + * extract key that was previously encoded by build_inode_onwire() at @addr. Returns address past the consumed data.
27041 + */
27042 +char *extract_obj_key_id_from_onwire(char *addr, obj_key_id *key_id)
27043 +{
27044 +       __u64 val;
27045 +
27046 +       addr += dscale_read(addr, &val);
27047 +       val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR; /* locality combined with stat-data minor in one element */
27048 +       cputod64(val, (d64 *)key_id->locality);
27049 +       addr += dscale_read(addr, &val);
27050 +       cputod64(val, (d64 *)key_id->objectid);
27051 +#if REISER4_LARGE_KEY
27052 +       memcpy(&key_id->ordering, addr, sizeof key_id->ordering); /* written by cputod64() on encode: copied as is */
27053 +       addr += sizeof key_id->ordering;
27054 +#endif
27055 +       return addr;
27056 +}
27057 +
27058 +/* Make Linus happy.
27059 +   Local variables:
27060 +   c-indentation-style: "K&R"
27061 +   mode-name: "LC"
27062 +   c-basic-offset: 8
27063 +   tab-width: 8
27064 +   fill-column: 120
27065 +   End:
27066 +*/
27067 diff -rupN linux-2.6.8-rc3/fs/reiser4/kassign.h linux-2.6.8-rc3-a/fs/reiser4/kassign.h
27068 --- linux-2.6.8-rc3/fs/reiser4/kassign.h        1970-01-01 03:00:00.000000000 +0300
27069 +++ linux-2.6.8-rc3-a/fs/reiser4/kassign.h      2004-08-05 21:20:53.077661694 +0400
27070 @@ -0,0 +1,100 @@
27071 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
27072 + * reiser4/README */
27073 +
27074 +/* Key assignment policy interface. See kassign.c for details. */
27075 +
27076 +#if !defined( __KASSIGN_H__ )
27077 +#define __KASSIGN_H__
27078 +
27079 +#include "forward.h"
27080 +#include "key.h"
27081 +#include "dformat.h"
27082 +
27083 +#include <linux/types.h>       /* for __u??  */
27084 +#include <linux/fs.h>          /* for struct super_block, etc  */
27085 +#include <linux/dcache.h>      /* for struct qstr */
27086 +
27087 +/* key assignment functions */
27088 +
27089 +/* Information from which key of file stat-data can be uniquely
27090 +   restored. This depends on key assignment policy for
27091 +   stat-data. Currently it's enough to store object id and locality id
27092 +   (60+60==120 bits), because minor packing locality and offset of
27093 +   stat-data key are always known constants: KEY_SD_MINOR and 0
27094 +   respectively. For simplicity 4 bits are wasted in each id, and just
27095 +   two 64 bit integers are stored (three with large keys, which add ordering).
27096 +
27097 +   This field has to be byte-aligned, because we don't want to waste
27098 +   space in directory entries. There is another side of the coin, of
27099 +   course: we waste CPU and bus bandwidth instead, by copying data back
27100 +   and forth.
27101 +
27102 +   Next optimization: &obj_key_id is mainly used to address stat data from
27103 +   directory entries. Under the assumption that majority of files have
27104 +   only one name (one hard link) from *the* parent directory it seems reasonable
27105 +   to only store objectid of stat data and take its locality from key of
27106 +   directory item.
27107 +
27108 +   This requires some flag to be added to the &obj_key_id to distinguish
27109 +   between these two cases. Remaining bits in flag byte are then asking to be
27110 +   used to store file type.
27111 +
27112 +   This optimization requires changes in directory item handling code.
27113 +
27114 +*/
27115 +typedef struct obj_key_id {
27116 +       d8 locality[sizeof (__u64)];
27117 +       ON_LARGE_KEY(d8 ordering[sizeof (__u64)];)
27118 +       d8 objectid[sizeof (__u64)];
27119 +} obj_key_id;
27120 +
27121 +/* Information sufficient to uniquely identify directory entry within
27122 +   compressed directory item: everything in the entry's key after its first element.
27123 +
27124 +   For alignment issues see &obj_key_id above.
27125 +*/
27126 +typedef struct de_id {
27127 +       ON_LARGE_KEY(d8 ordering[sizeof (__u64)];)
27128 +       d8 objectid[sizeof (__u64)];
27129 +       d8 offset[sizeof (__u64)];
27130 +} de_id;
27131 +/* encoding of inode identity into a byte stream and back */
27132 +extern int inode_onwire_size(const struct inode *obj);
27133 +extern char *build_inode_onwire(const struct inode *obj, char *area);
27134 +extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
27135 +/* compact ids of stat-data keys (&obj_key_id) and of directory entry keys (&de_id): build, restore, compare */
27136 +extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
27137 +extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
27138 +extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
27139 +extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
27140 +extern int build_de_id(const struct inode *dir, const struct qstr *name, de_id * id);
27141 +extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
27142 +extern int extract_key_from_de_id(const oid_t locality, const de_id * id, reiser4_key * key);
27143 +extern cmp_t key_id_cmp(const obj_key_id * i1, const obj_key_id * i2);
27144 +extern cmp_t key_id_key_cmp(const obj_key_id * id, const reiser4_key * key);
27145 +extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
27146 +extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
27147 +/* construction and recognition of particular keys */
27148 +extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
27149 +extern void build_entry_key_common(const struct inode *dir, const struct qstr *name, reiser4_key * result);
27150 +extern void build_entry_key_stable_entry(const struct inode *dir, const struct qstr *name, reiser4_key * result);
27151 +extern int is_dot_key(const reiser4_key * key);
27152 +extern reiser4_key *build_sd_key(const struct inode *target, reiser4_key * result);
27153 +extern int is_root_dir_key(const struct super_block *super, const reiser4_key * key);
27154 +/* file names stored inside directory entry keys */
27155 +extern int is_longname_key(const reiser4_key *key);
27156 +extern int is_longname(const char *name, int len);
27157 +extern char *extract_name_from_key(const reiser4_key *key, char *buf);
27158 +
27159 +/* __KASSIGN_H__ */
27160 +#endif
27161 +
27162 +/* Make Linus happy.
27163 +   Local variables:
27164 +   c-indentation-style: "K&R"
27165 +   mode-name: "LC"
27166 +   c-basic-offset: 8
27167 +   tab-width: 8
27168 +   fill-column: 120
27169 +   End:
27170 +*/
27171 diff -rupN linux-2.6.8-rc3/fs/reiser4/kattr.c linux-2.6.8-rc3-a/fs/reiser4/kattr.c
27172 --- linux-2.6.8-rc3/fs/reiser4/kattr.c  1970-01-01 03:00:00.000000000 +0300
27173 +++ linux-2.6.8-rc3-a/fs/reiser4/kattr.c        2004-08-05 21:20:53.063664646 +0400
27174 @@ -0,0 +1,641 @@
27175 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
27176 + * reiser4/README */
27177 +
27178 +/* Interface to sysfs' attributes */
27179 +
27180 +/*
27181 + * Reiser4 exports some of its internal data through sysfs.
27182 + *
27183 + * For details on sysfs see fs/sysfs, include/linux/sysfs.h,
27184 + * include/linux/kobject.h. Roughly speaking, one embeds struct kobject into
27185 + * some kernel data type. Objects of this type will be represented as
27186 + * _directories_ somewhere below /sys. Attributes can be registered for
27187 + * kobject and they will be visible as files within corresponding
27188 + * directory. Each attribute is represented by struct kattr. How given
27189 + * attribute reacts to read and write is determined by ->show and ->store
27190 + * operations that are properties of its parent kobject.
27191 + *
27192 + * Reiser4 exports following stuff through sysfs:
27193 + *
27194 + *    path                                              kobject or attribute
27195 + *
27196 + * /sys/fs/reiser4/
27197 + *                 <dev>/                               sbinfo->kobj
27198 + *                       sb-fields                      def_attrs[]
27199 + *                       stats/                         sbinfo->stats_kobj
27200 + *                             stat-cnts                reiser4_stat_defs[]
27201 + *                             level-NN/                sbinfo->level[].kobj
27202 + *                                      stat-cnts       reiser4_stat_level_defs[]
27203 + *
27204 + * (For some reason we also add /sys/fs and /sys/fs/reiser4 manually, although
27205 + * this is supposed to be done by the core.)
27206 + *
27207 + * Our kattr.[ch] code depends on some additional functionality missing in the
27208 + * core kernel. This functionality is added in kobject-umount-race.patch from
27209 + * our core-patches repository. As is obvious from its name, this patch adds
27210 + * protection against races between /sys/fs/reiser4/<dev>/ * accesses and
27211 + * concurrent umount of <dev>. See commentary in this patch for more details.
27212 + *
27213 + * Shouldn't struct kobject be renamed to struct knobject?
27214 + *
27215 + */
27216 +
27217 +#include "debug.h"
27218 +#include "super.h"
27219 +#include "kattr.h"
27220 +#include "prof.h"
27221 +
27222 +#include <linux/kobject.h>     /* struct kobject */
27223 +#include <linux/fs.h>          /* struct super_block */
27224 +
27225 +#if REISER4_USE_SYSFS
27226 +
27227 +/*
27228 + * Super-block fields exporting.
27229 + *
27230 + * Many fields of reiser4-private part of super-block
27231 + * (fs/reiser4/super.h:reiser4_super_info_data) are exported through
27232 + * sysfs. Code below tries to minimize code duplication for this common case.
27233 + *
27234 + * Specifically, all fields that are "scalars" (i.e., basically integers) of
27235 + * 32 or 64 bits are handled by the same ->show() and ->store()
27236 + * functions. Each such field is represented by two pieces of data:
27237 + *
27238 + *     1. super_field_cookie, and
27239 + *
27240 + *     2. reiser4_kattr.
27241 + *
27242 + * super_field_cookie contains "field description":
27243 + *
27244 + *     1. field offset in bytes from the beginning of reiser4-specific portion
27245 + *     of super block, and
27246 + *
27247 + *     2. printf(3) format to show field content in ->show() function.
27248 + *
27249 + * reiser4_kattr is the standard object we use to embed struct fs_kattr
27250 + * in. It stores a pointer to the corresponding super_field_cookie. Also
27251 + * reiser4_kattr contains ->store and ->show function pointers that are set,
27252 + * according to field width and desired access rights, to
27253 + * {show,store}_{ro,rw}_{32,64}().
27254 + *
27255 + * These functions use super_field_cookie (stored in ->cookie field of
27256 + * reiser4_kattr) to obtain/store value of field involved and format it
27257 + * properly.
27258 + *
27259 + */
27260 +
27261 +/* description of one exported super-block field: where it is stored and how it is printed */
27262 +typedef struct {
27263 +       /* offset in bytes to the super-block field from the beginning of
27264 +        * reiser4_super_info_data */
27265 +       ptrdiff_t   offset;
27266 +       /* desired printf(3) format for ->show() method. */
27267 +       const char *format;
27268 +} super_field_cookie;
27269 +
27270 +/*
27271 + * This macro defines super_field_cookie and reiser4_kattr for given super-block field. @asize is not expanded here:
27272 + * the DEFINE_SUPER_RO() and DEFINE_SUPER_RW() wrappers paste it into the names of the show and store methods.
27273 + */
27274 +#define DEFINE_SUPER_F(aname /* unique identifier used to generate variable \
27275 +                             * names */,                               \
27276 +         afield /* name of super-block field */,                       \
27277 +         aformat /* desired ->show() format, "\n" is appended */,      \
27278 +         asize /* field width in bits (32 or 64) */,                   \
27279 +         ashow /* show method */,                                      \
27280 +         astore /* store method */,                                    \
27281 +         amode /* access mode (permission bits) */)                    \
27282 +static super_field_cookie __cookie_ ## aname = {                       \
27283 +       .offset = offsetof(reiser4_super_info_data, afield),            \
27284 +       .format = aformat "\n"                                          \
27285 +};                                                                     \
27286 +                                                                       \
27287 +static reiser4_kattr kattr_super_ ## aname = {                         \
27288 +       .attr = {                                                       \
27289 +               .kattr = {                                              \
27290 +                       .name = (char *) #afield,                       \
27291 +                       .mode = amode                                   \
27292 +               },                                                      \
27293 +               .show = ashow,                                          \
27294 +               .store = astore                                         \
27295 +       },                                                              \
27296 +       .cookie = &__cookie_ ## aname                                   \
27297 +}
27298 +
27299 +/*
27300 + * Specialized version of DEFINE_SUPER_F() used to generate description of
27301 + * read-only fields: show_ro_<asize>() as ->show, no ->store, mode 0440
27302 + */
27303 +#define DEFINE_SUPER_RO(aname, afield, aformat, asize)                 \
27304 +       DEFINE_SUPER_F(aname,                                           \
27305 +                      afield, aformat, asize, show_ro_ ## asize, NULL, 0440)
27306 +
27307 +/*
27308 + * Specialized version of DEFINE_SUPER_F() used to generate description of
27309 + * read-write fields: show_ro_<asize>() and store_rw_<asize>(), mode 0660.
27310 + */
27311 +#define DEFINE_SUPER_RW(aname, afield, aformat, asize)                 \
27312 +       DEFINE_SUPER_F(aname,                                           \
27313 +                      afield, aformat, asize, show_ro_ ## asize,       \
27314 +                      store_rw_ ## asize, 0660)
27315 +
27316 +/* helper macro: the field of type @type located @offset bytes past @ptr.
27317 + * NOTE(review): expansion is not parenthesized; use it as a whole operand. */
27318 +#define getat(ptr, offset, type) *(type *)(((char *)(ptr)) + (offset))
27319 +
27320 +/* helper macro: modify value of field to @val. See getat() above for the
27321 + * meaning of other arguments */
27322 +#define setat(ptr, offset, type, val)                  \
27323 +       ({ *(type *)(((char *)(ptr)) + (offset)) = (val); })
27324 +
27325 +/* map @attr back to the reiser4_kattr embedding it and fetch its ->cookie */
27326 +static inline void *getcookie(struct fs_kattr *attr)
27327 +{
27328 +       reiser4_kattr *outer = container_of(attr, reiser4_kattr, attr);
27329 +       return outer->cookie;
27330 +}
27331 +
27332 +/*
27333 + * ->show method for 32bit scalar super block fields. Returns (p - buf).
27334 + */
27335 +static ssize_t
27336 +show_ro_32(struct super_block * s /* super-block field belongs to */,
27337 +          struct fs_kobject *o /* object @kattr is attribute of (unused) */,
27338 +          struct fs_kattr * kattr /* file-system attribute that is
27339 +                                   * exported */,
27340 +          char * buf /* buffer to store field representation into */)
27341 +{
27342 +       char *p;
27343 +       super_field_cookie *cookie;
27344 +       __u32 val;
27345 +
27346 +       cookie = getcookie(kattr);
27347 +       /* obtain field value from super-block, ... */
27348 +       val = getat(get_super_private(s), cookie->offset, __u32);
27349 +       p = buf;
27350 +       /* and print it according to the format string specified in the
27351 +        * cookie; value is widened as formats use "ll" conversions */
27352 +       KATTR_PRINT(p, buf, cookie->format, (unsigned long long)val);
27353 +       return (p - buf);
27354 +}
27355 +
27356 +/*
27357 + * ->store method for read-write 32bit scalar super-block fields. Returns
27358 + * @size when a value was parsed and stored, RETERR(-EINVAL) otherwise. */
27359 +static ssize_t
27360 +store_rw_32(struct super_block * s /* super-block field belongs to */,
27361 +           struct fs_kobject *o /* object @kattr is attribute of (unused) */,
27362 +           struct fs_kattr * kattr /* file-system attribute that is
27363 +                                   * exported */,
27364 +           const char * buf /* buffer to read field value from */,
27365 +           size_t size /* buffer size */)
27366 +{
27367 +       super_field_cookie *cookie;
27368 +       __u32 val;
27369 +
27370 +       cookie = getcookie(kattr);
27371 +       /* read value from the buffer */
27372 +       if (sscanf(buf, "%i", &val) == 1)
27373 +               /* if buffer contains well-formed value, update super-block
27374 +                * field. */
27375 +               setat(get_super_private(s), cookie->offset, __u32, val);
27376 +       else
27377 +               size = RETERR(-EINVAL);
27378 +       return size;
27379 +}
27380 +
27381 +/*
27382 + * ->show method for 64bit scalar super block fields.
27383 + *
27384 + * It's exactly like show_ro_32, except that the field is read as __u64.
27385 + */
27386 +static ssize_t show_ro_64(struct super_block * s, struct fs_kobject *o,
27387 +                         struct fs_kattr * kattr, char * buf)
27388 +{
27389 +       char *p;
27390 +       super_field_cookie *cookie;
27391 +       __u64 val;
27392 +
27393 +       cookie = getcookie(kattr);
27394 +       val = getat(get_super_private(s), cookie->offset, __u64);
27395 +       p = buf;
27396 +       KATTR_PRINT(p, buf, cookie->format, (unsigned long long)val);
27397 +       return (p - buf);
27398 +}
27399 +
27400 +#if 0
27401 +/* We don't have writable 64bit attributes yet. Kept like store_rw_32(). */
27402 +static ssize_t
27403 +store_rw_64(struct super_block * s,
27404 +           struct fs_kobject *o, struct fs_kattr * kattr,
27405 +           const char * buf, size_t size)
27406 +{
27407 +       super_field_cookie *cookie;
27408 +       __u64 val;
27409 +
27410 +       cookie = getcookie(kattr);
27411 +       if (sscanf(buf, "%lli", &val) == 1)
27412 +               setat(get_super_private(s), cookie->offset, __u64, val);
27413 +       else
27414 +               size = RETERR(-EINVAL); /* nothing parsed: report error */
27415 +       return size;
27416 +}
27417 +#endif
27418 +
27419 +#undef getat
27420 +#undef setat
27421 +
27422 +/*
27423 + * Exporting reiser4 compilation options.
27424 + *
27425 + * reiser4 compilation options are exported through
27426 + * /sys/fs/<dev>/options. Read-only for now. :)
27427 + *
27428 + */
27429 +
27430 +#define SHOW_OPTION(p, buf, option)                    \
27431 +       if (option)                                     \
27432 +               KATTR_PRINT((p), (buf), #option "\n")
27433 +
27434 +static ssize_t
27435 +show_options(struct super_block * s,
27436 +            struct fs_kobject *o, struct fs_kattr * kattr, char * buf)
27437 +{
27438 +       char *p;
27439 +
27440 +       p = buf;
27441 +
27442 +       /*
27443 +        * PLEASE update this when adding new compilation option. Only options that are non-zero are printed.
27444 +        */
27445 +
27446 +       SHOW_OPTION(p, buf, REISER4_DEBUG);
27447 +       SHOW_OPTION(p, buf, REISER4_DEBUG_MODIFY);
27448 +       SHOW_OPTION(p, buf, REISER4_DEBUG_MEMCPY);
27449 +       SHOW_OPTION(p, buf, REISER4_DEBUG_NODE);
27450 +       SHOW_OPTION(p, buf, REISER4_ZERO_NEW_NODE);
27451 +       SHOW_OPTION(p, buf, REISER4_TRACE);
27452 +       SHOW_OPTION(p, buf, REISER4_LOG);
27453 +       SHOW_OPTION(p, buf, REISER4_STATS);
27454 +       SHOW_OPTION(p, buf, REISER4_DEBUG_OUTPUT);
27455 +       SHOW_OPTION(p, buf, REISER4_LOCKPROF);
27456 +       SHOW_OPTION(p, buf, REISER4_LARGE_KEY);
27457 +       SHOW_OPTION(p, buf, REISER4_PROF);
27458 +       SHOW_OPTION(p, buf, REISER4_COPY_ON_CAPTURE);
27459 +       SHOW_OPTION(p, buf, REISER4_ALL_IN_ONE);
27460 +       SHOW_OPTION(p, buf, REISER4_DEBUG_NODE_INVARIANT);
27461 +       SHOW_OPTION(p, buf, REISER4_DEBUG_SPIN_LOCKS);
27462 +       SHOW_OPTION(p, buf, REISER4_DEBUG_CONTEXTS);
27463 +       SHOW_OPTION(p, buf, REISER4_DEBUG_SIBLING_LIST);
27464 +
27465 +       return (p - buf); /* distance @p was advanced from @buf */
27466 +}
27467 +
27468 +static reiser4_kattr compile_options = { /* the "options" attribute */
27469 +       .attr = {
27470 +               .kattr = {
27471 +                        .name = (char *) "options",
27472 +                        .mode = 0444   /* r--r--r-- */
27473 +                },
27474 +               .show = show_options, /* ->store is left unset */
27475 +       },
27476 +       .cookie = NULL /* show_options() does not use a cookie */
27477 +};
27478 +
27479 +/*
27480 + * show the number (->s_dev, printed in decimal) of the device on top of
27481 + * which reiser4 file system exists in /sys/fs/reiser4/<dev>/device.
27482 + */
27483 +
27484 +static ssize_t
27485 +show_device(struct super_block * s,
27486 +           struct fs_kobject *o, struct fs_kattr * kattr, char * buf)
27487 +{
27488 +       char *p;
27489 +
27490 +       p = buf;
27491 +       KATTR_PRINT(p, buf, "%lu\n", (unsigned long)s->s_dev);
27492 +       return (p - buf);
27493 +}
27494 +
27495 +static reiser4_kattr device = { /* the "device" attribute */
27496 +       .attr = {
27497 +               .kattr = {
27498 +                        .name = (char *) "device",
27499 +                        .mode = 0444   /* r--r--r-- */
27500 +                },
27501 +               .show = show_device, /* ->store is left unset */
27502 +       },
27503 +       .cookie = NULL /* show_device() does not use a cookie */
27504 +};
27505 +
27506 +#if REISER4_DEBUG
27507 +
27508 +/*
27509 + * debugging code: break into debugger (DEBUGON(1)) on each write into this
27510 + * file. Useful when event of importance can be detected in the user space,
27511 + * but not in the kernel. Written data are ignored, @size is returned.
27512 + */
27513 +
27514 +ssize_t store_bugme(struct super_block * s, struct fs_kobject *o,
27515 +                   struct fs_kattr *ka, const char *buf, size_t size)
27516 +{
27517 +       DEBUGON(1);
27518 +       return size;
27519 +}
27520 +
27521 +static reiser4_kattr bugme = { /* the write-only "bugme" attribute */
27522 +       .attr = {
27523 +               .kattr = {
27524 +                        .name = (char *) "bugme",
27525 +                        .mode = 0222   /* -w--w--w- */
27526 +                },
27527 +               .store = store_bugme, /* ->show is left unset */
27528 +       },
27529 +       .cookie = NULL
27530 +};
27531 +
27532 +/* REISER4_DEBUG */
27533 +#endif
27534 +
27535 +/*
27536 + * Declare all super-block fields we want to export. Each number must be
27537 + * unique; it is what ATTR_NO() uses to refer to the generated attribute. */
27538 +
27539 +DEFINE_SUPER_RO(01, mkfs_id, "%#llx", 32);
27540 +DEFINE_SUPER_RO(02, block_count, "%llu", 64);
27541 +DEFINE_SUPER_RO(03, blocks_used, "%llu", 64);
27542 +DEFINE_SUPER_RO(04, blocks_free_committed, "%llu", 64);
27543 +DEFINE_SUPER_RO(05, blocks_grabbed, "%llu", 64);
27544 +DEFINE_SUPER_RO(06, blocks_fake_allocated_unformatted, "%llu", 64);
27545 +DEFINE_SUPER_RO(07, blocks_fake_allocated, "%llu", 64);
27546 +DEFINE_SUPER_RO(08, blocks_flush_reserved, "%llu", 64);
27547 +DEFINE_SUPER_RO(09, fsuid, "%#llx", 32);
27548 +#if REISER4_DEBUG
27549 +DEFINE_SUPER_RO(10, eflushed, "%llu", 32);
27550 +#endif
27551 +DEFINE_SUPER_RO(11, blocknr_hint_default, "%lli", 64);
27552 +DEFINE_SUPER_RO(12, nr_files_committed, "%llu", 64);
27553 +DEFINE_SUPER_RO(13, tmgr.atom_count, "%llu", 32);
27554 +DEFINE_SUPER_RO(14, tmgr.id_count, "%llu", 32);
27555 +DEFINE_SUPER_RO(15, tmgr.atom_max_size, "%llu", 32);
27556 +DEFINE_SUPER_RO(16, tmgr.atom_max_age, "%llu", 32);
27557 +
27558 +/* tree fields */
27559 +DEFINE_SUPER_RO(17, tree.root_block, "%llu", 64);
27560 +DEFINE_SUPER_RO(18, tree.height, "%llu", 32);
27561 +DEFINE_SUPER_RO(19, tree.znode_epoch, "%llu", 64);
27562 +DEFINE_SUPER_RO(20, tree.carry.new_node_flags, "%#llx", 32);
27563 +DEFINE_SUPER_RO(21, tree.carry.new_extent_flags, "%#llx", 32);
27564 +DEFINE_SUPER_RO(22, tree.carry.paste_flags, "%#llx", 32);
27565 +DEFINE_SUPER_RO(23, tree.carry.insert_flags, "%#llx", 32);
27566 +
27567 +/* not very good. Should be done by the plugin instead */
27568 +DEFINE_SUPER_RO(24, next_to_use, "%llu", 64);
27569 +DEFINE_SUPER_RO(25, oids_in_use, "%llu", 64);
27570 +
27571 +DEFINE_SUPER_RO(26, entd.flushers, "%llu", 32);
27572 +
27573 +DEFINE_SUPER_RW(27, trace_flags, "%#llx", 32);
27574 +DEFINE_SUPER_RW(28, log_flags, "%#llx", 32);
27575 +
27576 +#define ATTR_NO(n) &kattr_super_ ## n .attr.kattr /* attribute number @n */
27577 +
27578 +static struct attribute * kattr_def_attrs[] = { /* NULL-terminated list */
27579 +       ATTR_NO(01),
27580 +       ATTR_NO(02),
27581 +       ATTR_NO(03),
27582 +       ATTR_NO(04),
27583 +       ATTR_NO(05),
27584 +       ATTR_NO(06),
27585 +       ATTR_NO(07),
27586 +       ATTR_NO(08),
27587 +       ATTR_NO(09),
27588 +#if REISER4_DEBUG
27589 +       ATTR_NO(10),
27590 +#endif
27591 +       ATTR_NO(11),
27592 +       ATTR_NO(12),
27593 +       ATTR_NO(13),
27594 +       ATTR_NO(14),
27595 +       ATTR_NO(15),
27596 +       ATTR_NO(16),
27597 +       ATTR_NO(17),
27598 +       ATTR_NO(18),
27599 +       ATTR_NO(19),
27600 +       ATTR_NO(20),
27601 +       ATTR_NO(21),
27602 +       ATTR_NO(22),
27603 +       ATTR_NO(23),
27604 +       ATTR_NO(24),
27605 +       ATTR_NO(25),
27606 +       ATTR_NO(26),
27607 +       ATTR_NO(27),
27608 +       ATTR_NO(28),
27609 +/* disabled: no attributes numbered 29 and 30 are defined above
27610 +       ATTR_NO(29),
27611 +       ATTR_NO(30),
27612 +*/
27613 +       &compile_options.attr.kattr,
27614 +       &device.attr.kattr,
27615 +#if REISER4_DEBUG
27616 +       &bugme.attr.kattr,
27617 +#endif
27618 +       NULL
27619 +};
27620 +
27621 +struct kobj_type ktype_reiser4 = { /* object type used by reiser4_subsys */
27622 +       .sysfs_ops      = &fs_attr_ops,
27623 +       .default_attrs  = kattr_def_attrs, /* attributes listed above */
27624 +       .release        = NULL
27625 +};
27626 +
27627 +#if REISER4_STATS
27628 +
27629 +/*
27630 + * Statistical counters exporting.
27631 + *
27632 + * When REISER4_STATS mode is on, reiser4 collects a lot of statistics. See
27633 + * stats.[ch] for more details. All these stat-counters are exported through
27634 + * sysfs in /sys/fs/reiser4/<dev>/stats/ directory. This directory contains
27635 + * "global" stat-counters and also level-* sub-directories for per-level
27636 + * counters (that is counters collected for specific levels of reiser4
27637 + * internal tree).
27638 + *
27639 + */
27640 +
27641 +static struct kobj_type ktype_noattr = { /* type without default attributes */
27642 +       .sysfs_ops      = &fs_attr_ops,
27643 +       .default_attrs  = NULL,
27644 +       .release        = NULL
27645 +};
27646 +
27647 +/*
27648 + * register stat-counters for the level @i with sysfs. This is called during
27649 + * mount. Returns 0 on success, -EBUSY if the parent reference is NULL. */
27650 +static int register_level_attrs(reiser4_super_info_data *sbinfo, int i)
27651 +{
27652 +       struct fs_kobject *level   /* file system kobject representing @i-th
27653 +                                   * level*/;
27654 +       struct fs_kobject *parent; /* its parent in sysfs tree */
27655 +       int result;
27656 +
27657 +       /* first, setup @level */
27658 +       parent = &sbinfo->stats_kobj;
27659 +       sbinfo->level[i].level = i;
27660 +       level = &sbinfo->level[i].kobj;
27661 +       level->kobj.parent = kobject_get(&parent->kobj);
27662 +       if (level->kobj.parent != NULL) {
27663 +               snprintf(level->kobj.name, KOBJ_NAME_LEN, "level-%2.2i", i);
27664 +               level->kobj.ktype = &ktype_noattr;
27665 +               /* register @level with sysfs */
27666 +               result = fs_kobject_register(sbinfo->tree.super, level);
27667 +               if (result == 0)
27668 +                       /* and ultimately populate it with attributes, that
27669 +                        * is, stat-counters */
27670 +                       result = reiser4_populate_kattr_level_dir(&level->kobj);
27671 +       } else
27672 +               result = RETERR(-EBUSY);
27673 +       return result;
27674 +}
27676 +#endif
27677 +
27678 +static decl_subsys(fs, NULL, NULL); /* fs_subsys: /sys/fs */
27679 +decl_subsys(reiser4, &ktype_reiser4, NULL); /* reiser4_subsys: /sys/fs/reiser4 */
27680 +
27681 +/*
27682 + * initialization function called once during kernel boot-up, or reiser4
27683 + * module loading. Returns 0 or the error code of the first step that failed.
27684 + * NOTE(review): subsystems registered before a failing step stay registered. */
27685 +reiser4_internal int
27686 +reiser4_sysfs_init_once(void)
27687 +{
27688 +       int result;
27689 +
27690 +       /* add /sys/fs */
27691 +       result = subsystem_register(&fs_subsys);
27692 +       if (result == 0) {
27693 +               kset_set_kset_s(&reiser4_subsys, fs_subsys);
27694 +               /* add /sys/fs/reiser4 */
27695 +               result = subsystem_register(&reiser4_subsys);
27696 +               if (result == 0)
27697 +                       result = init_prof_kobject();
27698 +       }
27699 +       return result;
27700 +}
27701 +
27702 +/*
27703 + * shutdown function dual to reiser4_sysfs_init_once(). Called during module
27704 + * unload. Undoes the initialization steps in the reverse order.
27705 + */
27706 +reiser4_internal void
27707 +reiser4_sysfs_done_once(void)
27708 +{
27709 +       done_prof_kobject();
27710 +       subsystem_unregister(&reiser4_subsys);
27711 +       subsystem_unregister(&fs_subsys);
27712 +}
27713 +
27714 +/*
27715 + * initialization function called during mount of @super. Returns 0 or error.
27716 + * NOTE(review): error paths do not undo earlier registrations - confirm the caller cleans up. */
27717 +reiser4_internal int
27718 +reiser4_sysfs_init(struct super_block *super)
27719 +{
27720 +       reiser4_super_info_data *sbinfo;
27721 +       struct fs_kobject *kobj;
27722 +       int result;
27723 +       ON_STATS(struct fs_kobject *stats_kobj);
27724 +
27725 +       sbinfo = get_super_private(super);
27726 +
27727 +       kobj = &sbinfo->kobj;
27728 +
27729 +       /*
27730 +        * setup and register /sys/fs/reiser4/<dev> object, named after ->s_id
27731 +        */
27732 +       snprintf(kobj->kobj.name, KOBJ_NAME_LEN, "%s", super->s_id);
27733 +       kobj_set_kset_s(&sbinfo->kobj, reiser4_subsys);
27734 +       result = fs_kobject_register(super, kobj);
27735 +       if (result != 0)
27736 +               return result;
27737 +#if REISER4_STATS
27738 +       /* add "stats" sub-directory with attributes representing statistical counters */
27739 +       stats_kobj = &sbinfo->stats_kobj;
27740 +       stats_kobj->kobj.parent = kobject_get(&kobj->kobj);
27741 +       snprintf(stats_kobj->kobj.name, KOBJ_NAME_LEN, "stats");
27742 +       stats_kobj->kobj.ktype = &ktype_noattr;
27743 +       result = fs_kobject_register(super, stats_kobj);
27744 +       if (result != 0)
27745 +               return result;
27746 +       result = reiser4_populate_kattr_dir(&stats_kobj->kobj);
27747 +       if (result == 0) {
27748 +               int i;
27749 +
27750 +               for (i = 0; i < sizeof_array(sbinfo->level); ++i) { /* one level-NN directory per entry */
27751 +                       result = register_level_attrs(sbinfo, i);
27752 +                       if (result != 0)
27753 +                               break;
27754 +               }
27755 +       }
27756 +#else
27757 +       result = reiser4_populate_kattr_dir(&kobj->kobj);
27758 +#endif
27759 +
27760 +       return result;
27761 +}
27762 +
27763 +reiser4_internal void /* unregisters what reiser4_sysfs_init() registered */
27764 +reiser4_sysfs_done(struct super_block *super)
27765 +{
27766 +       reiser4_super_info_data *sbinfo;
27767 +       ON_STATS(int i);
27768 +
27769 +       sbinfo = get_super_private(super);
27770 +#if REISER4_STATS
27771 +       for (i = 0; i < sizeof_array(sbinfo->level); ++i) /* level-NN first, */
27772 +               fs_kobject_unregister(&sbinfo->level[i].kobj);
27773 +       fs_kobject_unregister(&sbinfo->stats_kobj); /* then their "stats" parent, */
27774 +#endif
27775 +       fs_kobject_unregister(&sbinfo->kobj); /* and the per-super-block object last */
27776 +}
27777 +
27778 +/* REISER4_USE_SYSFS */
27779 +#else
27780 +
27781 +/*
27782 + * Below are stubs for !REISER4_USE_SYSFS case. They do nothing; the init stubs return 0.
27783 + */
27784 +
27785 +reiser4_internal int
27786 +reiser4_sysfs_init(struct super_block *super)
27787 +{
27788 +       return 0;
27789 +}
27790 +
27791 +reiser4_internal void
27792 +reiser4_sysfs_done(struct super_block *super)
27793 +{}
27794 +
27795 +reiser4_internal int
27796 +reiser4_sysfs_init_once(void)
27797 +{
27798 +       return 0;
27799 +}
27800 +
27801 +reiser4_internal void
27802 +reiser4_sysfs_done_once(void)
27803 +{}
27804 +
27805 +#endif
27806 +
27807 +/* Make Linus happy.
27808 +   Local variables:
27809 +   c-indentation-style: "K&R"
27810 +   mode-name: "LC"
27811 +   c-basic-offset: 8
27812 +   tab-width: 8
27813 +   fill-column: 120
27814 +   End:
27815 +*/
27816 diff -rupN linux-2.6.8-rc3/fs/reiser4/kattr.h linux-2.6.8-rc3-a/fs/reiser4/kattr.h
27817 --- linux-2.6.8-rc3/fs/reiser4/kattr.h  1970-01-01 03:00:00.000000000 +0300
27818 +++ linux-2.6.8-rc3-a/fs/reiser4/kattr.h        2004-08-05 21:20:53.314611715 +0400
27819 @@ -0,0 +1,69 @@
27820 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
27821 + * reiser4/README */
27822 +
27823 +/* Interface to sysfs' attributes. See kattr.c for comments */
27824 +
27825 +#if !defined( __REISER4_KATTR_H__ )
27826 +#define __REISER4_KATTR_H__
27827 +
27828 +#include <linux/types.h>
27829 +#include <linux/list.h>
27830 +#include <linux/sysfs.h>
27831 +#include <linux/fs.h>
27832 +
27833 +/* fixme: access to sysfs files may cause deadlock. Do not turn on for now */
27834 +#define REISER4_USE_SYSFS (0)
27835 +
27836 +#if REISER4_USE_SYSFS
27837 +
27838 +/* helper macros used by kattr code to output information into buffer without
27839 + * caring about overflow checking. */
27840 +#define KATTR_LEFT(p, buf) (PAGE_SIZE - (p - buf) - 1)
27841 +#define KATTR_PRINT(p, buf, ...)                               \
27842 +({                                                             \
27843 +       p += snprintf(p, KATTR_LEFT(p, buf) , ## __VA_ARGS__);  \
27844 +})
27845 +
27846 +struct super_block;
27847 +struct reiser4_kattr;
27848 +typedef struct reiser4_kattr reiser4_kattr;
27849 +
27850 +/*
27851 + * reiser4_kattr represents a sysfs-exported attribute of reiser4 file system.
27852 + */
27853 +struct reiser4_kattr {
27854 +       struct fs_kattr attr; /* file-system attribute used to interact with
27855 +                              * sysfs */
27856 +       void  *cookie;        /* per-attribute parameter of ->show/->store, used
27857 +                              * to avoid code duplication. See kattr.c. */
27858 +};
27859 +
27860 +extern struct kobj_type ktype_reiser4;
27861 +
27862 +#else
27863 +
27864 +struct reiser4_kattr { /* empty placeholder for the !REISER4_USE_SYSFS case */
27865 +};
27866 +
27867 +typedef struct reiser4_kattr reiser4_kattr;
27868 +#endif /* REISER4_USE_SYSFS */
27869 +
27870 +extern int reiser4_sysfs_init_once(void); /* once per boot or module load */
27871 +extern void reiser4_sysfs_done_once(void); /* once per module unload */
27872 +
27873 +extern int  reiser4_sysfs_init(struct super_block *super); /* per mount */
27874 +extern void reiser4_sysfs_done(struct super_block *super); /* undoes reiser4_sysfs_init() */
27875 +
27876 +/* __REISER4_KATTR_H__ */
27877 +#endif
27878 +
27879 +/* Make Linus happy.
27880 +   Local variables:
27881 +   c-indentation-style: "K&R"
27882 +   mode-name: "LC"
27883 +   c-basic-offset: 8
27884 +   tab-width: 8
27885 +   fill-column: 120
27886 +   scroll-step: 1
27887 +   End:
27888 +*/
27889 diff -rupN linux-2.6.8-rc3/fs/reiser4/kcond.c linux-2.6.8-rc3-a/fs/reiser4/kcond.c
27890 --- linux-2.6.8-rc3/fs/reiser4/kcond.c  1970-01-01 03:00:00.000000000 +0300
27891 +++ linux-2.6.8-rc3-a/fs/reiser4/kcond.c        2004-08-05 21:20:52.784723482 +0400
27892 @@ -0,0 +1,298 @@
27893 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
27894 +
27895 +/* Kernel condition variables implementation.
27896 +
27897 +   This is simplistic (90 LOC mod comments) condition variable
27898 +   implementation. Condition variable is the most natural "synchronization
27899 +   object" in some circumstances.
27900 +
27901 +   Each CS text-book on multi-threading should discuss condition
27902 +   variables. Also see man/info for:
27903 +
27904 +                   pthread_cond_init(3),
27905 +                   pthread_cond_destroy(3),
27906 +                   pthread_cond_signal(3),
27907 +                   pthread_cond_broadcast(3),
27908 +                   pthread_cond_wait(3),
27909 +                   pthread_cond_timedwait(3).
27910 +
27911 +   See comments in kcond_wait().
27912 +
27913 +   TODO
27914 +
27915 +    1. Add an option (to kcond_init?) to make condition variable async-safe
27916 +    so that signals and broadcasts can be done from interrupt
27917 +    handlers. Requires using spin_lock_irq in kcond_*().
27918 +
27919 +    2. "Predicated" sleeps: add predicate function to the qlink and only wake
27920 +    sleeper if predicate is true. Probably requires additional parameters to
27921 +    the kcond_{signal,broadcast}() to supply cookie to the predicate. Standard
27922 +    wait_queues already have this functionality. Idea is that if one has
27923 +    object behaving like finite state automaton it is possible to use single
27924 +    per-object condition variable to signal all state transitions. Predicates
27925 +    allow waiters to select only transitions they are interested in without
27926 +    going through context switch.
27927 +
27928 +    3. It is relatively easy to add support for sleeping on the several
27929 +    condition variables at once. Does anybody need this?
27930 +
27931 +*/
27932 +
27933 +#include "debug.h"
27934 +#include "kcond.h"
27935 +#include "spin_macros.h"
27936 +
27937 +#include <linux/timer.h>
27938 +#include <linux/spinlock.h>
27939 +
27940 +static void kcond_timeout(unsigned long datum); /* timer callback of kcond_timedwait() */
27941 +static void kcond_remove(kcond_t * cvar, kcond_queue_link_t * link); /* unlink a waiter */
27942 +
27943 +/* initialize condition variable: zero it, set up its spinlock and empty
27944 +   queue, and return @cvar. Initializer for global ones is macro in kcond.h */
27945 +reiser4_internal kcond_t *
27946 +kcond_init(kcond_t * cvar /* cvar to init */ )
27947 +{
27948 +       assert("nikita-1868", cvar != NULL);
27949 +
27950 +       xmemset(cvar, 0, sizeof *cvar);
27951 +       spin_lock_init(&cvar->lock);
27952 +       cvar->queue = NULL;
27953 +       return cvar;
27954 +}
27955 +
27956 +/* destroy condition variable: frees nothing, returns -EBUSY if waiters remain, 0 otherwise. */
27957 +reiser4_internal int
27958 +kcond_destroy(kcond_t * cvar /* cvar to destroy */ )
27959 +{
27960 +       return kcond_are_waiters(cvar) ? -EBUSY : 0;
27961 +}
27962 +
27963 +/* Wait until condition variable is signalled. Call this with @lock locked:
27964 +   it is released for the sleep and re-acquired before return. If @signl is
27965 +   true, the sleep is interruptible by signals; -EINTR is returned if it was
27966 +   interrupted by a signal and 0 otherwise.
27967 +
27968 +   kcond_t is just a queue protected by spinlock. Whenever thread is going to
27969 +   sleep on the kcond_t it does the following:
27970 +
27971 +    (1) prepares "queue link" @qlink which is semaphore constructed locally on
27972 +    the stack of the thread going to sleep.
27973 +
27974 +    (2) takes @cvar spinlock
27975 +
27976 +    (3) adds @qlink to the @cvar queue of waiters
27977 +
27978 +    (4) releases @cvar spinlock
27979 +
27980 +    (5) sleeps on semaphore constructed at step (1)
27981 +
27982 +   When @cvar is signalled or broadcast, semaphores enqueued to the
27983 +   @cvar queue will be upped and kcond_wait() will return.
27984 +
27985 +   By use of local semaphore for each waiter we avoid races between going to
27986 +   sleep and waking up---endemic plague of condition variables.
27987 +
27988 +   For example, should kcond_broadcast() come in between steps (4) and (5) it
27989 +   would call up() on semaphores already in a queue and hence, down() in the
27990 +   step (5) would return immediately.
27991 +
27992 +*/
27993 +reiser4_internal int
27994 +kcond_wait(kcond_t * cvar /* cvar to wait for */ ,
27995 +          spinlock_t * lock /* lock to use */ ,
27996 +          int signl /* if 0, ignore signals during sleep */ )
27997 +{
27998 +       kcond_queue_link_t qlink;
27999 +       int result;
28000 +
28001 +       assert("nikita-1869", cvar != NULL);
28002 +       assert("nikita-1870", lock != NULL);
28003 +       assert("nikita-1871", check_spin_is_locked(lock));
28004 +
28005 +       spin_lock(&cvar->lock);
28006 +       qlink.next = cvar->queue;
28007 +       cvar->queue = &qlink;
28008 +       init_MUTEX_LOCKED(&qlink.wait);
28009 +       spin_unlock(&cvar->lock);
28010 +       spin_unlock(lock);
28011 +
28012 +       result = 0;
28013 +       if (signl)
28014 +               result = down_interruptible(&qlink.wait);
28015 +       else
28016 +               down(&qlink.wait);
28017 +       spin_lock(&cvar->lock);
28018 +       if (result != 0) {
28019 +               /* if thread was woken up by signal, @qlink is probably still
28020 +                  in the queue, remove it. */
28021 +               kcond_remove(cvar, &qlink);
28022 +       }
28023 +       /* if it wasn't woken up by signal, spinlock here is still useful,
28024 +          because we want to wait until kcond_{broadcast|signal}
28025 +          finishes. Otherwise down() could interleave with up() in such a way
28026 +          that kcond_wait() would exit and up() would see garbage in a
28027 +          semaphore.
28028 +       */
28029 +       spin_unlock(&cvar->lock);
28030 +       spin_lock(lock);
28031 +       return result;
28032 +}
28033 +
28034 +typedef struct { /* argument handed to kcond_timeout() through timer.data */
28035 +       kcond_queue_link_t *link; /* queue link whose semaphore is upped on time-out */
28036 +       int *woken_up; /* set to 1 by kcond_timeout() */
28037 +} kcond_timer_arg;
28038 +
28039 +/* like kcond_wait(), but with timeout: returns -ETIMEDOUT if the timer fired */
28040 +reiser4_internal int
28041 +kcond_timedwait(kcond_t * cvar /* cvar to wait for */ ,
28042 +               spinlock_t * lock /* lock to use */ ,
28043 +               signed long timeout /* timeout in jiffies */ ,
28044 +               int signl /* if 0, ignore signals during sleep */ )
28045 +{
28046 +       struct timer_list timer;
28047 +       kcond_queue_link_t qlink;
28048 +       int result;
28049 +       int woken_up;
28050 +       kcond_timer_arg targ;
28051 +
28052 +       assert("nikita-2437", cvar != NULL);
28053 +       assert("nikita-2438", lock != NULL);
28054 +       assert("nikita-2439", check_spin_is_locked(lock));
28055 +
28056 +       spin_lock(&cvar->lock);
28057 +       qlink.next = cvar->queue;
28058 +       cvar->queue = &qlink;
28059 +       init_MUTEX_LOCKED(&qlink.wait);
28060 +       spin_unlock(&cvar->lock);
28061 +       spin_unlock(lock);
28062 +
28063 +       assert("nikita-3011", schedulable());
28064 +
28065 +       /* prepare timer: it expires @timeout jiffies from now */
28066 +       init_timer(&timer);
28067 +       timer.expires = jiffies + timeout;
28068 +       timer.data = (unsigned long) &targ;
28069 +       timer.function = kcond_timeout;
28070 +
28071 +       woken_up = 0;
28072 +
28073 +       targ.link = &qlink;
28074 +       targ.woken_up = &woken_up;
28075 +
28076 +       /* ... and set it up */
28077 +       add_timer(&timer);
28078 +
28079 +       result = 0;
28080 +       if (signl)
28081 +               result = down_interruptible(&qlink.wait);
28082 +       else
28083 +               down(&qlink.wait);
28084 +
28085 +       /* cancel timer before @timer, @targ and @qlink go out of scope */
28086 +       del_timer_sync(&timer);
28087 +
28088 +       if (woken_up)
28089 +               result = -ETIMEDOUT;
28090 +
28091 +       spin_lock(&cvar->lock);
28092 +       if (result != 0) {
28093 +               /* woken by a signal or by time-out: @qlink is probably still queued, remove it.
28094 +                  NOTE(review): if kcond_signal() dequeued it meanwhile, that wakeup looks lost. */
28095 +               kcond_remove(cvar, &qlink);
28096 +       }
28097 +       spin_unlock(&cvar->lock);
28098 +
28099 +       spin_lock(lock);
28100 +       return result;
28101 +}
28102 +
28103 +/* Signal condition variable: wake up the waiter at queue head, if any. Always returns 1. */
28104 +reiser4_internal int
28105 +kcond_signal(kcond_t * cvar /* cvar to signal */ )
28106 +{
28107 +       kcond_queue_link_t *queue_head;
28108 +
28109 +       assert("nikita-1872", cvar != NULL);
28110 +
28111 +       spin_lock(&cvar->lock);
28112 +
28113 +       queue_head = cvar->queue;
28114 +       if (queue_head != NULL) {
28115 +               cvar->queue = queue_head->next; /* dequeue before waking */
28116 +               up(&queue_head->wait);
28117 +       }
28118 +       spin_unlock(&cvar->lock);
28119 +       return 1;
28120 +}
28121 +
28122 +/* Broadcast condition variable: wake up all waiters and empty the queue. Always returns 1. */
28123 +reiser4_internal int
28124 +kcond_broadcast(kcond_t * cvar /* cvar to broadcast */ )
28125 +{
28126 +       kcond_queue_link_t *queue_head;
28127 +
28128 +       assert("nikita-1875", cvar != NULL);
28129 +
28130 +       spin_lock(&cvar->lock);
28131 +
28132 +       for (queue_head = cvar->queue; queue_head != NULL; queue_head = queue_head->next)
28133 +               up(&queue_head->wait);
28134 +
28135 +       cvar->queue = NULL;
28136 +       spin_unlock(&cvar->lock);
28137 +       return 1;
28138 +}
28139 +
28140 +/* true if there are threads sleeping on @cvar. ->queue is read without taking ->lock. */
28141 +reiser4_internal int
28142 +kcond_are_waiters(kcond_t * cvar /* cvar to query */ )
28143 +{
28144 +       assert("nikita-1877", cvar != NULL);
28145 +       return cvar->queue != NULL;
28146 +}
28147 +
28148 +/* timer expiration function used by kcond_timedwait; @datum is a kcond_timer_arg pointer */
28149 +static void
28150 +kcond_timeout(unsigned long datum)
28151 +{
28152 +       kcond_timer_arg *arg;
28153 +
28154 +       arg = (kcond_timer_arg *) datum;
28155 +       *arg->woken_up = 1; /* record the time-out before waking the sleeper */
28156 +       up(&arg->link->wait);
28157 +}
28158 +
28159 +/* unlink @link from the singly-linked waiter queue of @cvar; a no-op when
28160 + * @link is not queued. Must be called with @cvar->lock held. */
28161 +static void
28162 +kcond_remove(kcond_t * cvar /* cvar to operate on */ ,
28163 +            kcond_queue_link_t * link /* link to remove */ )
28164 +{
28165 +       /* address of the pointer that refers to the element being examined:
28166 +        * first the queue head, then successive ->next fields */
28167 +       kcond_queue_link_t **slot;
28168 +
28169 +       assert("nikita-2440", cvar != NULL);
28170 +       assert("nikita-2441", check_spin_is_locked(&cvar->lock));
28171 +
28172 +       for (slot = &cvar->queue; *slot != NULL; slot = &(*slot)->next) {
28173 +               if (*slot == link) {
28174 +                       /* splice @link out by redirecting its predecessor */
28175 +                       *slot = link->next;
28176 +                       break;
28177 +               }
28178 +       }
28179 +}
28180 +
28181 +/* Make Linus happy.
28182 +   Local variables:
28183 +   c-indentation-style: "K&R"
28184 +   mode-name: "LC"
28185 +   c-basic-offset: 8
28186 +   tab-width: 8
28187 +   fill-column: 120
28188 +   scroll-step: 1
28189 +   End:
28190 +*/
28191 diff -rupN linux-2.6.8-rc3/fs/reiser4/kcond.h linux-2.6.8-rc3-a/fs/reiser4/kcond.h
28192 --- linux-2.6.8-rc3/fs/reiser4/kcond.h  1970-01-01 03:00:00.000000000 +0300
28193 +++ linux-2.6.8-rc3-a/fs/reiser4/kcond.h        2004-08-05 21:20:53.494573757 +0400
28194 @@ -0,0 +1,59 @@
28195 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28196 +
28197 +/* Declaration of kernel condition variables and API. See kcond.c for more
28198 +   info. */
28199 +
28200 +#ifndef __KCOND_H__
28201 +#define __KCOND_H__
28202 +
28203 +#include <linux/spinlock.h>
28204 +#include <asm/semaphore.h>
28205 +
28206 +typedef struct kcond_queue_link_s kcond_queue_link_t;
28207 +
28208 +/* condition variable */
28209 +typedef struct kcond_s {
28210 +       /* lock protecting integrity of @queue */
28211 +       spinlock_t lock;
28212 +       /* queue of waiters */
28213 +       kcond_queue_link_t *queue;
28214 +} kcond_t;
28215 +
28216 +/* queue link added to the kcond->queue by each waiter */
28217 +struct kcond_queue_link_s {
28218 +       /* next link in the queue */
28219 +       kcond_queue_link_t *next;
28220 +       /* semaphore to signal on wake up */
28221 +       struct semaphore wait;
28222 +};
28223 +
28224 +extern kcond_t *kcond_init(kcond_t * cvar);
28225 +extern int kcond_destroy(kcond_t * cvar);
28226 +
28227 +extern int kcond_wait(kcond_t * cvar, spinlock_t * lock, int signl);
28228 +extern int kcond_timedwait(kcond_t * cvar, spinlock_t * lock, signed long timeout, int signl);
28229 +extern int kcond_signal(kcond_t * cvar);
28230 +extern int kcond_broadcast(kcond_t * cvar);
28231 +
28232 +extern int kcond_are_waiters(kcond_t * cvar);
28233 +
28234 +extern void kcond_print(kcond_t * cvar);
28235 +
28236 +#define KCOND_STATIC_INIT                      \
28237 +       {                                       \
28238 +               .lock = SPIN_LOCK_UNLOCKED,     \
28239 +               .queue = NULL                   \
28240 +       }
28241 +
28242 +/* __KCOND_H__ */
28243 +#endif
28244 +
28245 +/* Make Linus happy.
28246 +   Local variables:
28247 +   c-indentation-style: "K&R"
28248 +   mode-name: "LC"
28249 +   c-basic-offset: 8
28250 +   tab-width: 8
28251 +   fill-column: 120
28252 +   End:
28253 +*/
28254 diff -rupN linux-2.6.8-rc3/fs/reiser4/key.c linux-2.6.8-rc3-a/fs/reiser4/key.c
28255 --- linux-2.6.8-rc3/fs/reiser4/key.c    1970-01-01 03:00:00.000000000 +0300
28256 +++ linux-2.6.8-rc3-a/fs/reiser4/key.c  2004-08-05 21:20:52.935691639 +0400
28257 @@ -0,0 +1,168 @@
28258 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28259 +
28260 +/* Key manipulations. */
28261 +
28262 +#include "debug.h"
28263 +#include "key.h"
28264 +#include "super.h"
28265 +#include "reiser4.h"
28266 +
28267 +#include <linux/types.h>       /* for __u??  */
28268 +
28269 +/* Minimal possible key: all components are zero. It is presumed that this is
28270 +   independent of key scheme. */
28271 +static const reiser4_key MINIMAL_KEY = {
28272 +       .el = {
28273 +               {0ull},
28274 +               ON_LARGE_KEY({0ull},)
28275 +               {0ull},
28276 +               {0ull}
28277 +       }
28278 +};
28279 +
28280 +/* Maximal possible key: all components are ~0. It is presumed that this is
28281 +   independent of key scheme. */
28282 +static const reiser4_key MAXIMAL_KEY = {
28283 +       .el = {
28284 +               {~0ull},
28285 +               ON_LARGE_KEY({~0ull},)
28286 +               {~0ull},
28287 +               {~0ull}
28288 +       }
28289 +};
28290 +
28291 +/* Initialise key. */
28292 +reiser4_internal void
28293 +key_init(reiser4_key * key /* key to init */ )
28294 +{
28295 +       assert("nikita-1169", key != NULL);
28296 +       xmemset(key, 0, sizeof *key);
28297 +}
28298 +
28299 +/* minimal possible key in the tree. Return pointer to the static storage. */
28300 +reiser4_internal const reiser4_key *
28301 +min_key(void)
28302 +{
28303 +       return &MINIMAL_KEY;
28304 +}
28305 +
28306 +/* maximum possible key in the tree. Return pointer to the static storage. */
28307 +reiser4_internal const reiser4_key *
28308 +max_key(void)
28309 +{
28310 +       return &MAXIMAL_KEY;
28311 +}
28312 +
28313 +#if REISER4_DEBUG_OUTPUT
28314 +/* debugging aid: print symbolic name of key type */
28315 +static const char *
28316 +type_name(unsigned int key_type /* key type */ )
28317 +{
28318 +       switch (key_type) {
28319 +       case KEY_FILE_NAME_MINOR:
28320 +               return "file name";
28321 +       case KEY_SD_MINOR:
28322 +               return "stat data";
28323 +       case KEY_ATTR_NAME_MINOR:
28324 +               return "attr name";
28325 +       case KEY_ATTR_BODY_MINOR:
28326 +               return "attr body";
28327 +       case KEY_BODY_MINOR:
28328 +               return "file body";
28329 +       default:
28330 +               return "unknown";
28331 +       }
28332 +}
28333 +
28334 +extern char *unpack_string(__u64 value, char *buf);
28335 +
28336 +/* debugging aid: print human readable information about key */
28337 +reiser4_internal void
28338 +print_key(const char *prefix /* prefix to print */ ,
28339 +         const reiser4_key * key /* key to print */ )
28340 +{
28341 +       /* turn bold on */
28342 +       /* printf ("\033[1m"); */
28343 +       if (key == NULL)
28344 +               printk("%s: null key\n", prefix);
28345 +       else {
28346 +               if (REISER4_LARGE_KEY)
28347 +                       printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
28348 +                              get_key_locality(key),
28349 +                              get_key_type(key),
28350 +                              get_key_ordering(key),
28351 +                              get_key_band(key),
28352 +                              get_key_objectid(key),
28353 +                              get_key_offset(key));
28354 +               else
28355 +                       printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
28356 +                              get_key_locality(key),
28357 +                              get_key_type(key),
28358 +                              get_key_band(key),
28359 +                              get_key_objectid(key),
28360 +                              get_key_offset(key));
28361 +               /*
28362 +                * if this is a key of directory entry, try to decode part of
28363 +                * a name stored in the key, and output it.
28364 +                */
28365 +               if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
28366 +                       char buf[DE_NAME_BUF_LEN];
28367 +                       char *c;
28368 +
28369 +                       c = buf;
28370 +                       c = unpack_string(get_key_ordering(key), c);
28371 +                       unpack_string(get_key_fulloid(key), c);
28372 +                       printk("[%s", buf);
28373 +                       if (is_longname_key(key))
28374 +                               /*
28375 +                                * only part of the name is stored in the key.
28376 +                                */
28377 +                               printk("...]\n");
28378 +                       else {
28379 +                               /*
28380 +                                * whole name is stored in the key.
28381 +                                */
28382 +                               unpack_string(get_key_offset(key), buf);
28383 +                               printk("%s]\n", buf);
28384 +                       }
28385 +               } else {
28386 +                       printk("[%s]\n", type_name(get_key_type(key)));
28387 +               }
28388 +       }
28389 +       /* turn bold off */
28390 +       /* printf ("\033[m\017"); */
28391 +}
28392 +
28393 +#endif
28394 +
28395 +/* like print_key() but outputs key representation into @buffer. Returns sprintf() result; @buffer size is not checked (see KEY_BUF_LEN in key.h). */
28396 +reiser4_internal int
28397 +sprintf_key(char *buffer /* buffer to print key into */ ,
28398 +           const reiser4_key * key /* key to print */ )
28399 +{
28400 +       if (REISER4_LARGE_KEY)
28401 +               return sprintf(buffer, "(%Lx:%x:%Lx:%Lx:%Lx:%Lx)",
28402 +                              get_key_locality(key),
28403 +                              get_key_type(key),
28404 +                              get_key_ordering(key),
28405 +                              get_key_band(key),
28406 +                              get_key_objectid(key),
28407 +                              get_key_offset(key));
28408 +       else
28409 +               return sprintf(buffer, "(%Lx:%x:%Lx:%Lx:%Lx)",
28410 +                              get_key_locality(key),
28411 +                              get_key_type(key),
28412 +                              get_key_band(key),
28413 +                              get_key_objectid(key),
28414 +                              get_key_offset(key));
28415 +}
28416 +
28417 +/* Make Linus happy.
28418 +   Local variables:
28419 +   c-indentation-style: "K&R"
28420 +   mode-name: "LC"
28421 +   c-basic-offset: 8
28422 +   tab-width: 8
28423 +   fill-column: 120
28424 +   End:
28425 +*/
28426 diff -rupN linux-2.6.8-rc3/fs/reiser4/key.h linux-2.6.8-rc3-a/fs/reiser4/key.h
28427 --- linux-2.6.8-rc3/fs/reiser4/key.h    1970-01-01 03:00:00.000000000 +0300
28428 +++ linux-2.6.8-rc3-a/fs/reiser4/key.h  2004-08-05 21:20:53.274620150 +0400
28429 @@ -0,0 +1,389 @@
28430 +/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28431 +
28432 +/* Declarations of key-related data-structures and operations on keys. */
28433 +
28434 +#if !defined( __REISER4_KEY_H__ )
28435 +#define __REISER4_KEY_H__
28436 +
28437 +#include "dformat.h"
28438 +#include "forward.h"
28439 +#include "debug.h"
28440 +
28441 +#include <linux/types.h>       /* for __u??  */
28442 +
28443 +/* Operations on keys in reiser4 tree */
28444 +
28445 +/* No access to any of these fields shall be done except via a
28446 +   wrapping macro/function, and that wrapping macro/function shall
28447 +   convert to little endian order.  Compare keys will consider cpu byte order. */
28448 +
28449 +/* A storage layer implementation difference between a regular unix file body and its attributes is in the typedef below
28450 +   which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files
28451 +   within that directory, and not near to the file itself.  It is interesting to consider whether this is the wrong
28452 +   approach, and whether there should be no difference at all. For current usage patterns this choice is probably the
28453 +   right one.  */
28454 +
28455 +/* possible values for minor packing locality (4 bits required) */
28456 +typedef enum {
28457 +       /* file name */
28458 +       KEY_FILE_NAME_MINOR = 0,
28459 +       /* stat-data */
28460 +       KEY_SD_MINOR = 1,
28461 +       /* file attribute name */
28462 +       KEY_ATTR_NAME_MINOR = 2,
28463 +       /* file attribute value */
28464 +       KEY_ATTR_BODY_MINOR = 3,
28465 +       /* file body (tail or extent) */
28466 +       KEY_BODY_MINOR = 4,
28467 +} key_minor_locality;
28468 +
28469 +/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key.
28470 +   Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space,
28471 +   and by the repacker.  It is stylistically better to put aggregation information into the key.  Thus, if you want to
28472 +   segregate extents from tails, it is better to give them distinct minor packing localities rather than changing
28473 +   block_alloc.c to check the node type when deciding where to allocate the node.
28474 +
28475 +   The need to randomly displace new directories and large files disturbs this symmetry unfortunately.  However, it
28476 +   should be noted that this is a need that is not clearly established given the existence of a repacker.  Also, in our
28477 +   current implementation tails have a different minor packing locality from extents, and no files have both extents and
28478 +   tails, so maybe symmetry can be had without performance cost after all.  Symmetry is what we ship for now....
28479 +*/
28480 +
28481 +/* Arbitrary major packing localities can be assigned to objects using
28482 +   the reiser4(filenameA/..packing<=some_number) system call.
28483 +
28484 +   In reiser4, the creat() syscall creates a directory
28485 +
28486 +   whose default flow (that which is referred to if the directory is
28487 +   read as a file) is the traditional unix file body.
28488 +
28489 +   whose directory plugin is the 'filedir'
28490 +
28491 +   whose major packing locality is that of the parent of the object created.
28492 +
28493 +   The static_stat item is a particular commonly used directory
28494 +   compression (the one for normal unix files).
28495 +
28496 +   The filedir plugin checks to see if the static_stat item exists.
28497 +   There is a unique key for static_stat.  If yes, then it uses the
28498 +   static_stat item for all of the values that it contains.  The
28499 +   static_stat item contains a flag for each stat it contains which
28500 +   indicates whether one should look outside the static_stat item for its
28501 +   contents.
28502 +*/
28503 +
28504 +/* offset of fields in reiser4_key. Value of each element of this enum
28505 +    is index within key (thought as array of __u64's) where this field
28506 +    is. */
28507 +typedef enum {
28508 +       /* major "locale", aka dirid. Sits in 1st element */
28509 +       KEY_LOCALITY_INDEX = 0,
28510 +       /* minor "locale", aka item type. Sits in 1st element */
28511 +       KEY_TYPE_INDEX = 0,
28512 +       ON_LARGE_KEY(KEY_ORDERING_INDEX,)
28513 +       /* "object band". Sits in 2nd element */
28514 +       KEY_BAND_INDEX,
28515 +       /* objectid. Sits in 2nd element */
28516 +       KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
28517 +       /* full objectid. Sits in 2nd element */
28518 +       KEY_FULLOID_INDEX = KEY_BAND_INDEX,
28519 +       /* Offset. Sits in 3rd element */
28520 +       KEY_OFFSET_INDEX,
28521 +       /* Name hash. Sits in 3rd element */
28522 +       KEY_HASH_INDEX = KEY_OFFSET_INDEX,
28523 +       KEY_CACHELINE_END = KEY_OFFSET_INDEX,
28524 +       KEY_LAST_INDEX
28525 +} reiser4_key_field_index;
28526 +
28527 +/* key in reiser4 internal "balanced" tree. It is just array of KEY_LAST_INDEX
28528 +    64bit integers in disk byte order (little-endian by default). This
28529 +    array is actually indexed by reiser4_key_field_index.  Each __u64 within
28530 +    this array is called "element". Logical key component encoded within
28531 +    elements are called "fields".
28532 +
28533 +    We declare this as union with second component dummy to suppress
28534 +    inconvenient array<->pointer casts implied in C. */
28535 +union reiser4_key {
28536 +       d64 el[KEY_LAST_INDEX];
28537 +       int pad;
28538 +};
28539 +
28540 +/* bitmasks showing where within reiser4_key particular key is
28541 +    stored. */
28542 +typedef enum {
28543 +       /* major locality occupies higher 60 bits of the first element */
28544 +       KEY_LOCALITY_MASK = 0xfffffffffffffff0ull,
28545 +       /* minor locality occupies lower 4 bits of the first element */
28546 +       KEY_TYPE_MASK = 0xfull,
28547 +       /* controversial band occupies higher 4 bits of the 2nd element */
28548 +       KEY_BAND_MASK = 0xf000000000000000ull,
28549 +       /* objectid occupies lower 60 bits of the 2nd element */
28550 +       KEY_OBJECTID_MASK = 0x0fffffffffffffffull,
28551 +       /* full 64bit objectid*/
28552 +       KEY_FULLOID_MASK = 0xffffffffffffffffull,
28553 +       /* offset is just 3rd element itself */
28554 +       KEY_OFFSET_MASK = 0xffffffffffffffffull,
28555 +       /* ordering is whole second element */
28556 +       KEY_ORDERING_MASK = 0xffffffffffffffffull,
28557 +} reiser4_key_field_mask;
28558 +
28559 +/* how many bits a field is shifted to the left within its key element (getters shift right by this amount) */
28560 +typedef enum {
28561 +       KEY_LOCALITY_SHIFT = 4,
28562 +       KEY_TYPE_SHIFT = 0,
28563 +       KEY_BAND_SHIFT = 60,
28564 +       KEY_OBJECTID_SHIFT = 0,
28565 +       KEY_FULLOID_SHIFT = 0,
28566 +       KEY_OFFSET_SHIFT = 0,
28567 +       KEY_ORDERING_SHIFT = 0,
28568 +} reiser4_key_field_shift;
28569 +
28570 +static inline __u64
28571 +get_key_el(const reiser4_key * key, reiser4_key_field_index off)
28572 +{
28573 +       assert("nikita-753", key != NULL);
28574 +       assert("nikita-754", off < KEY_LAST_INDEX);
28575 +       return d64tocpu(&key->el[off]);
28576 +}
28577 +
28578 +static inline void
28579 +set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value)
28580 +{
28581 +       assert("nikita-755", key != NULL);
28582 +       assert("nikita-756", off < KEY_LAST_INDEX);
28583 +       cputod64(value, &key->el[off]);
28584 +}
28585 +
28586 +/* macro to define getter and setter functions for field F with type T */
28587 +#define DEFINE_KEY_FIELD( L, U, T )                                    \
28588 +static inline T get_key_ ## L ( const reiser4_key *key )               \
28589 +{                                                                      \
28590 +       assert( "nikita-750", key != NULL );                            \
28591 +       return ( T ) ( get_key_el( key, KEY_ ## U ## _INDEX ) &         \
28592 +                KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT;           \
28593 +}                                                                      \
28594 +                                                                       \
28595 +static inline void set_key_ ## L ( reiser4_key *key, T loc )           \
28596 +{                                                                      \
28597 +       __u64 el;                                                       \
28598 +                                                                       \
28599 +       assert( "nikita-752", key != NULL );                            \
28600 +                                                                       \
28601 +       el = get_key_el( key, KEY_ ## U ## _INDEX );                    \
28602 +       /* clear field bits in the key */                               \
28603 +       el &= ~KEY_ ## U ## _MASK;                                      \
28604 +       /* actually it should be                                        \
28605 +                                                                       \
28606 +          el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK;   \
28607 +                                                                       \
28608 +          but we trust user to never pass values that wouldn't fit     \
28609 +          into field. Clearing extra bits is one operation, but this   \
28610 +          function is time-critical.                                   \
28611 +          But check this in assertion. */                              \
28612 +       assert( "nikita-759", ( ( loc << KEY_ ## U ## _SHIFT ) &        \
28613 +               ~KEY_ ## U ## _MASK ) == 0 );                           \
28614 +       el |= ( loc << KEY_ ## U ## _SHIFT );                           \
28615 +       set_key_el( key, KEY_ ## U ## _INDEX, el );                     \
28616 +}
28617 +
28618 +typedef __u64 oid_t;
28619 +
28620 +/* define get_key_locality(), set_key_locality() */
28621 +DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
28622 +/* define get_key_type(), set_key_type() */
28623 +DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
28624 +/* define get_key_band(), set_key_band() */
28625 +DEFINE_KEY_FIELD(band, BAND, __u64);
28626 +/* define get_key_objectid(), set_key_objectid() */
28627 +DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
28628 +/* define get_key_fulloid(), set_key_fulloid() */
28629 +DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
28630 +/* define get_key_offset(), set_key_offset() */
28631 +DEFINE_KEY_FIELD(offset, OFFSET, __u64);
28632 +#if (REISER4_LARGE_KEY)
28633 +/* define get_key_ordering(), set_key_ordering() */
28634 +DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
28635 +#else
28636 +static inline __u64 get_key_ordering(const reiser4_key *key)
28637 +{
28638 +       return 0;
28639 +}
28640 +
28641 +static inline void set_key_ordering(reiser4_key *key, __u64 val)
28642 +{
28643 +}
28644 +#endif
28645 +
28646 +/* key comparison result */
28647 +typedef enum { LESS_THAN = -1, /* if first key is less than second */
28648 +       EQUAL_TO = 0,           /* if keys are equal */
28649 +       GREATER_THAN = +1       /* if first key is greater than second */
28650 +} cmp_t;
28651 +
28652 +void key_init(reiser4_key * key);
28653 +
28654 +/* minimal possible key in the tree. Return pointer to the static storage. */
28655 +extern const reiser4_key *min_key(void);
28656 +extern const reiser4_key *max_key(void);
28657 +
28658 +/* helper macro for keycmp() */
28659 +#define KEY_DIFF(k1, k2, field)                                                        \
28660 +({                                                                             \
28661 +       typeof (get_key_ ## field (k1)) f1;                                     \
28662 +       typeof (get_key_ ## field (k2)) f2;                                     \
28663 +                                                                               \
28664 +       f1 = get_key_ ## field (k1);                                            \
28665 +       f2 = get_key_ ## field (k2);                                            \
28666 +                                                                               \
28667 +       (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN);         \
28668 +})
28669 +
28670 +/* helper macro for keycmp() */
28671 +#define KEY_DIFF_EL(k1, k2, off)                                               \
28672 +({                                                                             \
28673 +       __u64 e1;                                                               \
28674 +       __u64 e2;                                                               \
28675 +                                                                               \
28676 +       e1 = get_key_el(k1, off);                                               \
28677 +       e2 = get_key_el(k2, off);                                               \
28678 +                                                                               \
28679 +       (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN);         \
28680 +})
28681 +
28682 +/* compare `k1' and `k2'.  This function is a heart of "key allocation
28683 +    policy". All you need to implement new policy is to add yet another
28684 +    clause here. */
28685 +static inline cmp_t
28686 +keycmp(const reiser4_key * k1 /* first key to compare */ ,
28687 +       const reiser4_key * k2 /* second key to compare */ )
28688 +{
28689 +       cmp_t result;
28690 +
28691 +       /*
28692 +        * This function is the heart of reiser4 tree-routines. Key comparison
28693 +        * is among most heavily used operations in the file system.
28694 +        */
28695 +
28696 +       assert("nikita-439", k1 != NULL);
28697 +       assert("nikita-440", k2 != NULL);
28698 +
28699 +       /* there is no actual branch here: condition is compile time constant
28700 +        * and constant folding and propagation ensures that only one branch
28701 +        * is actually compiled in. */
28702 +
28703 +       if (REISER4_PLANA_KEY_ALLOCATION) {
28704 +               /* if physical order of fields in a key is identical
28705 +                  with logical order, we can implement key comparison
28706 +                  as three 64bit comparisons. */
28707 +               /* logical order of fields in plan-a:
28708 +                  locality->type->objectid->offset. */
28709 +               /* compare locality and type at once */
28710 +               result = KEY_DIFF_EL(k1, k2, 0);
28711 +               if (result == EQUAL_TO) {
28712 +                       /* compare objectid (and band if it's there) */
28713 +                       result = KEY_DIFF_EL(k1, k2, 1);
28714 +                       /* compare offset */
28715 +                       if (result == EQUAL_TO) {
28716 +                               result = KEY_DIFF_EL(k1, k2, 2);
28717 +                               if (REISER4_LARGE_KEY && result == EQUAL_TO) {
28718 +                                       result = KEY_DIFF_EL(k1, k2, 3);
28719 +                               }
28720 +                       }
28721 +               }
28722 +       } else if (REISER4_3_5_KEY_ALLOCATION) {
28723 +               result = KEY_DIFF(k1, k2, locality);
28724 +               if (result == EQUAL_TO) {
28725 +                       result = KEY_DIFF(k1, k2, objectid);
28726 +                       if (result == EQUAL_TO) {
28727 +                               result = KEY_DIFF(k1, k2, type);
28728 +                               if (result == EQUAL_TO)
28729 +                                       result = KEY_DIFF(k1, k2, offset);
28730 +                       }
28731 +               }
28732 +       } else
28733 +               impossible("nikita-441", "Unknown key allocation scheme!");
28734 +       return result;
28735 +}
28736 +
28737 +/* true if @k1 equals @k2. Compares the raw bytes of both keys with memcmp() rather than going through keycmp(). */
28738 +static inline int
28739 +keyeq(const reiser4_key * k1 /* first key to compare */ ,
28740 +      const reiser4_key * k2 /* second key to compare */ )
28741 +{
28742 +       assert("nikita-1879", k1 != NULL);
28743 +       assert("nikita-1880", k2 != NULL);
28744 +       return !memcmp(k1, k2, sizeof *k1);
28745 +}
28746 +
28747 +/* true if @k1 is less than @k2 */
28748 +static inline int
28749 +keylt(const reiser4_key * k1 /* first key to compare */ ,
28750 +      const reiser4_key * k2 /* second key to compare */ )
28751 +{
28752 +       assert("nikita-1952", k1 != NULL);
28753 +       assert("nikita-1953", k2 != NULL);
28754 +       return keycmp(k1, k2) == LESS_THAN;
28755 +}
28756 +
28757 +/* true if @k1 is less than or equal to @k2 */
28758 +static inline int
28759 +keyle(const reiser4_key * k1 /* first key to compare */ ,
28760 +      const reiser4_key * k2 /* second key to compare */ )
28761 +{
28762 +       assert("nikita-1954", k1 != NULL);
28763 +       assert("nikita-1955", k2 != NULL);
28764 +       return keycmp(k1, k2) != GREATER_THAN;
28765 +}
28766 +
28767 +/* true if @k1 is greater than @k2 */
28768 +static inline int
28769 +keygt(const reiser4_key * k1 /* first key to compare */ ,
28770 +      const reiser4_key * k2 /* second key to compare */ )
28771 +{
28772 +       assert("nikita-1959", k1 != NULL);
28773 +       assert("nikita-1960", k2 != NULL);
28774 +       return keycmp(k1, k2) == GREATER_THAN;
28775 +}
28776 +
28777 +/* true if @k1 is greater than or equal to @k2 */
28778 +static inline int
28779 +keyge(const reiser4_key * k1 /* first key to compare */ ,
28780 +      const reiser4_key * k2 /* second key to compare */ )
28781 +{
28782 +       assert("nikita-1956", k1 != NULL);
28783 +       assert("nikita-1957", k2 != NULL);      /* October  4: sputnik launched
28784 +                                                * November 3: Laika */
28785 +       return keycmp(k1, k2) != LESS_THAN;
28786 +}
28787 +
28788 +static inline void
28789 +prefetchkey(reiser4_key *key)
28790 +{
28791 +       prefetch(key);
28792 +       prefetch(&key->el[KEY_CACHELINE_END]);
28793 +}
28794 +
28795 +/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
28796 +           1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
28797 +/* size of a buffer suitable to hold human readable key representation */
28798 +#define KEY_BUF_LEN (80)
28799 +
28800 +extern int sprintf_key(char *buffer, const reiser4_key * key);
28801 +#if REISER4_DEBUG_OUTPUT
28802 +extern void print_key(const char *prefix, const reiser4_key * key);
28803 +#else
28804 +#define print_key(p,k) noop
28805 +#endif
28806 +
28807 +/* __REISER4_KEY_H__ */
28808 +#endif
28809 +
28810 +/* Make Linus happy.
28811 +   Local variables:
28812 +   c-indentation-style: "K&R"
28813 +   mode-name: "LC"
28814 +   c-basic-offset: 8
28815 +   tab-width: 8
28816 +   fill-column: 120
28817 +   End:
28818 +*/
28819 diff -rupN linux-2.6.8-rc3/fs/reiser4/ktxnmgrd.c linux-2.6.8-rc3-a/fs/reiser4/ktxnmgrd.c
28820 --- linux-2.6.8-rc3/fs/reiser4/ktxnmgrd.c       1970-01-01 03:00:00.000000000 +0300
28821 +++ linux-2.6.8-rc3-a/fs/reiser4/ktxnmgrd.c     2004-08-05 21:20:53.269621205 +0400
28822 @@ -0,0 +1,274 @@
28823 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28824 +/* Transaction manager daemon. */
28825 +
28826 +/*
28827 + * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
28828 + * needed/important for the following reasons:
28829 + *
28830 + *     1. in reiser4 atom is not committed immediately when last transaction
28831 + *     handle closes, unless atom is either too old or too large (see
28832 + *     atom_should_commit()). This is done to avoid committing too frequently.
28833 + *     because:
28834 + *
28835 + *     2. sometimes we don't want to commit atom when closing last transaction
28836 + *     handle even if it is old and fat enough. For example, because we are at
28837 + *     this point under directory semaphore, and committing would stall all
28838 + *     accesses to this directory.
28839 + *
28840 + * ktxnmgrd bides its time sleeping on condition variable. When it awakes
28841 + * either due to (tunable) timeout or because it was explicitly woken up by
28842 + * call to ktxnmgrd_kick(), it scans list of all atoms and commits ones
28843 + * eligible.
28844 + *
28845 + */
28846 +
28847 +#include "debug.h"
28848 +#include "kcond.h"
28849 +#include "txnmgr.h"
28850 +#include "tree.h"
28851 +#include "ktxnmgrd.h"
28852 +#include "super.h"
28853 +#include "reiser4.h"
28854 +
28855 +#include <linux/sched.h>       /* for struct task_struct */
28856 +#include <linux/suspend.h>
28857 +#include <linux/kernel.h>
28858 +#include <linux/writeback.h>
28859 +
28860 +static int scan_mgr(txn_mgr * mgr);
28861 +
28862 +reiser4_internal int
28863 +init_ktxnmgrd_context(txn_mgr * mgr /* manager that gets a freshly allocated daemon context; must not have one yet */ )
28864 +{
28865 +       ktxnmgrd_context * ctx;
28866 +
28867 +       assert ("zam-1013", mgr != NULL);
28868 +       assert ("zam-1014", mgr->daemon == NULL);
28869 +
28870 +       ctx = reiser4_kmalloc(sizeof(ktxnmgrd_context), GFP_KERNEL);
28871 +       if (ctx == NULL)
28872 +               return RETERR(-ENOMEM); /* allocation failed; mgr->daemon is left untouched */
28873 +
28874 +       assert("nikita-2442", ctx != NULL);     /* cannot fire: NULL was handled just above */
28875 +
28876 +       xmemset(ctx, 0, sizeof *ctx);
28877 +       init_completion(&ctx->finish);
28878 +       kcond_init(&ctx->startup);      /* broadcast by ktxnmgrd() once the thread has set itself up */
28879 +       kcond_init(&ctx->wait);         /* ktxnmgrd() waits on this with a timeout */
28880 +       spin_lock_init(&ctx->guard);
28881 +       ctx->timeout = REISER4_TXNMGR_TIMEOUT;
28882 +       mgr->daemon = ctx;      /* attach the initialised context to the manager */
28883 +       return 0;       /* success */
28884 +}
28885 +
28886 +/* change current->comm so that ps, top, and friends will see changed
28887 +   state. This serves no useful purpose whatsoever, but also costs
28888 +   nothing. Maybe it will make a lonely system administrator feel less alone
28889 +   at 3 A.M. Expects a variable named `super' to be in scope where it is used.
28890 +*/
28891 +#define set_comm( state )                                              \
28892 +       snprintf( current -> comm, sizeof( current -> comm ),   \
28893 +                 "%s:%s:%s", __FUNCTION__, (super)->s_id, ( state ) )
28894 +
28895 +/* The background transaction manager daemon, run as a kernel thread by
28896 +   start_ktxnmgrd(). @arg is the txn_mgr this thread serves. */
28897 +static int
28898 +ktxnmgrd(void *arg)
28899 +{
28900 +       struct task_struct *me;
28901 +       struct super_block * super;
28902 +       ktxnmgrd_context *ctx;
28903 +       txn_mgr * mgr;
28904 +
28905 +       /* standard kernel thread prologue */
28906 +       me = current;
28907 +       /* reparent_to_init() is done by daemonize() */
28908 +       daemonize(__FUNCTION__);
28909 +
28910 +       /* block all signals */
28911 +       spin_lock_irq(&me->sighand->siglock);
28912 +       siginitsetinv(&me->blocked, 0);
28913 +       recalc_sigpending();
28914 +       spin_unlock_irq(&me->sighand->siglock);
28915 +
28916 +       /* do_fork() just copies task_struct into the new thread.
28917 +          ->journal_info (presumably where the creator's reiser4 context
28918 +          lives -- TODO confirm) shouldn't be inherited of course. This
28919 +          shouldn't be a problem for the rest of the code though. */
28920 +       me->journal_info = NULL;
28921 +
28922 +       mgr = arg;
28923 +       ctx = mgr->daemon;
28924 +       spin_lock(&ctx->guard);
28925 +       ctx->tsk = me;
28926 +       super = container_of(mgr, reiser4_super_info_data, tmgr)->tree.super;
28927 +       kcond_broadcast(&ctx->startup);
28928 +       while (1) {
28929 +               int result;
28930 +
28931 +               /* software suspend support. */
28932 +               if (me->flags & PF_FREEZE) {
28933 +                       spin_unlock(&ctx->guard);
28934 +                       refrigerator(PF_FREEZE/*PF_IOTHREAD*/);
28935 +                       spin_lock(&ctx->guard);
28936 +               }
28937 +
28938 +               set_comm("wait");
28939 +               /* wait for @ctx -> timeout or explicit wake up.
28940 +
28941 +                  kcond_timedwait() is called with last argument 1 enabling
28942 +                  wakeup by signals so that this thread is not counted in
28943 +                  load-average. This doesn't require any special handling,
28944 +                  because all signals were blocked.
28945 +               */
28946 +               result = kcond_timedwait(&ctx->wait,
28947 +                                        &ctx->guard, ctx->timeout, 1);
28948 +
28949 +               if (result != -ETIMEDOUT && result != -EINTR && result != 0) {
28950 +                       /* some other error */
28951 +                       warning("nikita-2443", "Error: %i", result);
28952 +                       continue;
28953 +               }
28954 +
28955 +               /* we are asked to exit */
28956 +               if (ctx->done)
28957 +                       break;
28958 +
28959 +               set_comm(result ? "timed" : "run");
28960 +
28961 +               /* wait timed out or ktxnmgrd was woken up by explicit request
28962 +                  to commit something. Scan list of atoms in txnmgr and look
28963 +                  for too old atoms. scan_mgr() is documented to return with
28964 +                  ->guard released, hence spin_lock() right after it. */
28965 +               do {
28966 +                       ctx->rescan = 0;
28967 +                       scan_mgr(mgr);
28968 +                       spin_lock(&ctx->guard);
28969 +                       if (ctx->rescan) {
28970 +                               /* NOTE(review): comment used to say "repeat
28971 +                                  scanning from the beginning", but this
28972 +                                  break leaves the do-while loop, so no
28973 +                                  rescan happens until the next wakeup */
28974 +                               break;
28975 +                       }
28976 +               } while (ctx->rescan);
28977 +       }
28978 +
28979 +       spin_unlock(&ctx->guard);
28980 +
28981 +       complete_and_exit(&ctx->finish, 0);
28982 +       /* not reached. */
28983 +       return 0;
28984 +}
28985 +
28986 +#undef set_comm
28987 +
28988 +/* wake up ktxnmgrd serving @mgr from its sleep on ->wait */
28989 +reiser4_internal void ktxnmgrd_kick(txn_mgr * mgr)
28990 +{
28991 +       assert("nikita-3234", mgr != NULL);
28992 +       assert("nikita-3235", mgr->daemon != NULL);
28993 +       kcond_signal(&mgr->daemon->wait);
28994 +}
28995 +
28996 +/* true iff calling thread is ktxnmgrd recorded for the current super block */
28997 +reiser4_internal int is_current_ktxnmgrd(void)
28998 +{
28999 +       return current == get_current_super_private()->tmgr.daemon->tsk;
29000 +}
29001 +
29002 +/* scan one transaction manager for old atoms; should be called with ktxnmgrd
29003 + * spinlock held. The lock is expected to be released on return, although no
29004 + * unlock is visible here: presumably commit_some_atoms() does it (TODO
29005 + * confirm). Returns what commit_some_atoms() returned. */
29006 +static int scan_mgr(txn_mgr * mgr)
29007 +{
29008 +       reiser4_tree    *tree;
29009 +       reiser4_context  context;
29010 +       int              result;
29011 +
29012 +       assert("nikita-2454", mgr != NULL);
29013 +
29014 +       /* NOTE-NIKITA this only works for atoms embedded into super blocks. */
29015 +       tree = &container_of(mgr, reiser4_super_info_data, tmgr)->tree;
29016 +       assert("nikita-2455", tree != NULL);
29017 +       assert("nikita-2456", tree->super != NULL);
29018 +
29019 +       /* atoms are committed within reiser4 context of this super block */
29020 +       init_context(&context, tree->super);
29021 +       result = commit_some_atoms(mgr);
29022 +       reiser4_exit_context(&context);
29023 +       return result;
29024 +}
29025 +
29026 +
29027 +/* Start ktxnmgrd for @mgr and wait until it has published itself in
29028 +   mgr->daemon->tsk. Returns 0, or negative error of kernel_thread(). */
29029 +reiser4_internal int start_ktxnmgrd (txn_mgr * mgr)
29030 +{
29031 +       ktxnmgrd_context * ctx;
29032 +       int pid;
29033 +
29034 +       assert("nikita-2448", mgr != NULL);
29035 +       assert("zam-1015", mgr->daemon != NULL);
29036 +       ctx = mgr->daemon;
29037 +
29038 +       spin_lock(&ctx->guard);
29039 +       ctx->rescan = 1;
29040 +       ctx->done = 0;
29041 +       spin_unlock(&ctx->guard);
29042 +
29043 +       /* if the thread could not be created nobody will ever set ->tsk:
29044 +          bail out instead of sleeping on ->startup forever */
29045 +       pid = kernel_thread(ktxnmgrd, mgr, CLONE_KERNEL);
29046 +       if (pid < 0)
29047 +               return RETERR(pid);
29048 +
29049 +       spin_lock(&ctx->guard);
29050 +       /* daemon thread is not yet initialized: wait until it is */
29051 +       if (ctx->tsk == NULL)
29052 +               kcond_wait(&ctx->startup, &ctx->guard, 0);
29053 +       assert("nikita-2452", ctx->tsk != NULL);
29054 +       spin_unlock(&ctx->guard);
29055 +       return 0;
29056 +}
29057 +
29058 +/* ask ktxnmgrd of @mgr to exit and block until its thread has completed */
29059 +reiser4_internal void stop_ktxnmgrd (txn_mgr * mgr)
29060 +{
29061 +       ktxnmgrd_context *daemon;
29062 +
29063 +       assert ("zam-1016", mgr != NULL);
29064 +       assert ("zam-1017", mgr->daemon != NULL);
29065 +       daemon = mgr->daemon;
29066 +
29067 +       spin_lock(&daemon->guard);
29068 +       daemon->done = 1;
29069 +       daemon->tsk = NULL;
29070 +       spin_unlock(&daemon->guard);
29071 +
29072 +       /* kick the daemon out of its sleep so that it notices ->done ... */
29073 +       kcond_signal(&daemon->wait);
29074 +       /* ... and wait until daemon finishes */
29075 +       wait_for_completion(&daemon->finish);
29076 +}
29077 +
29078 +/* release daemon context of @mgr; mgr->daemon is NULL afterwards */
29079 +reiser4_internal void done_ktxnmgrd_context (txn_mgr * mgr)
29080 +{
29081 +       assert ("zam-1011", mgr != NULL);
29082 +       assert ("zam-1012", mgr->daemon != NULL);
29083 +
29084 +       reiser4_kfree(mgr->daemon);
29085 +       mgr->daemon = NULL;
29086 +}
29087 +
29088 +/* Make Linus happy.
29089 +   Local variables:
29090 +   c-indentation-style: "K&R"
29091 +   mode-name: "LC"
29092 +   c-basic-offset: 8
29093 +   tab-width: 8
29094 +   fill-column: 120
29095 +   End:
29096 +*/
29097 diff -rupN linux-2.6.8-rc3/fs/reiser4/ktxnmgrd.h linux-2.6.8-rc3-a/fs/reiser4/ktxnmgrd.h
29098 --- linux-2.6.8-rc3/fs/reiser4/ktxnmgrd.h       1970-01-01 03:00:00.000000000 +0300
29099 +++ linux-2.6.8-rc3-a/fs/reiser4/ktxnmgrd.h     2004-08-05 21:20:53.177640606 +0400
29100 @@ -0,0 +1,63 @@
29101 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
29102 + * reiser4/README */
29103 +
29104 +/* Transaction manager daemon. See ktxnmgrd.c for comments. */
29105 +
29106 +#ifndef __KTXNMGRD_H__
29107 +#define __KTXNMGRD_H__
29108 +
29109 +#include "kcond.h"
29110 +#include "txnmgr.h"
29111 +#include "spin_macros.h"
29112 +
29113 +#include <linux/fs.h>
29114 +#include <linux/completion.h>
29115 +#include <linux/spinlock.h>
29116 +#include <asm/atomic.h>
29117 +#include <linux/sched.h>       /* for struct task_struct */
29118 +
29119 +/* in this structure all data necessary to start up, shut down and communicate
29120 + * with ktxnmgrd are kept. */
29121 +struct ktxnmgrd_context {
29122 +       /* condition variable used to synchronize start up of ktxnmgrd */
29123 +       kcond_t startup;
29124 +       /* completion used to synchronize shut down of ktxnmgrd */
29125 +       struct completion finish;
29126 +       /* condition variable on which ktxnmgrd sleeps */
29127 +       kcond_t wait;
29128 +       /* spin lock protecting all fields of this structure */
29129 +       spinlock_t guard;
29130 +       /* timeout of sleeping on ->wait */
29131 +       signed long timeout;
29132 +       /* kernel thread running ktxnmgrd; NULL while no thread is running */
29133 +       struct task_struct *tsk;
29134 +       /* list of all file systems served by this ktxnmgrd */
29135 +       txn_mgrs_list_head queue;
29136 +       /* is ktxnmgrd being shut down? (unsigned: signed :1 cannot hold 1) */
29137 +       unsigned int done:1;
29138 +       /* should ktxnmgrd repeat scanning of atoms? */
29139 +       unsigned int rescan:1;
29140 +};
29141 +
29142 +extern int  init_ktxnmgrd_context(txn_mgr *);
29143 +extern void done_ktxnmgrd_context(txn_mgr *);
29144 +
29145 +extern int  start_ktxnmgrd(txn_mgr *);
29146 +extern void stop_ktxnmgrd(txn_mgr *);
29147 +
29148 +extern void ktxnmgrd_kick(txn_mgr * mgr);
29149 +
29150 +extern int is_current_ktxnmgrd(void);
29151 +
29152 +/* __KTXNMGRD_H__ */
29153 +#endif
29154 +
29155 +/* Make Linus happy.
29156 +   Local variables:
29157 +   c-indentation-style: "K&R"
29158 +   mode-name: "LC"
29159 +   c-basic-offset: 8
29160 +   tab-width: 8
29161 +   fill-column: 120
29162 +   End:
29163 +*/
29164 diff -rupN linux-2.6.8-rc3/fs/reiser4/lib.h linux-2.6.8-rc3-a/fs/reiser4/lib.h
29165 --- linux-2.6.8-rc3/fs/reiser4/lib.h    1970-01-01 03:00:00.000000000 +0300
29166 +++ linux-2.6.8-rc3-a/fs/reiser4/lib.h  2004-08-05 21:20:52.786723060 +0400
29167 @@ -0,0 +1,75 @@
29168 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
29169 +
29170 +#if !defined (__FS_REISER4_LIB_H__)
29171 +#define __FS_REISER4_LIB_H__
29172 +
29173 +/* These 2 functions of 64 bit numbers division were taken from
29174 +   include/sound/pcm.h */
29175 +
29176 +/* Helper for 64 bit division: *@q = (@high << 32 | @low) / @div, remainder
29177 +   goes to *@r unless @r is NULL. Requires @high < @div (see div64_32()). */
29178 +static inline void
29179 +divl(__u32 high, __u32 low, __u32 div, __u32 * q, __u32 * r)
29180 +{
29181 +       __u64 n = (__u64) high << 32 | low;
29182 +       __u64 d = (__u64) div << 31;
29183 +       __u32 q1 = 0;
29184 +       int c = 32;
29185 +
29186 +       while (n > 0xffffffffU) {
29187 +               q1 <<= 1;
29188 +               if (n >= d) {
29189 +                       n -= d;
29190 +                       q1 |= 1;
29191 +               }
29192 +               d >>= 1;
29193 +               c--;
29194 +       }
29195 +       q1 = (c < 32) ? q1 << c : 0;    /* "<< 32" on __u32 is undefined */
29196 +       if (n) {
29197 +               low = n;
29198 +               *q = q1 | (low / div);
29199 +               if (r)
29200 +                       *r = low % div;
29201 +       } else {
29202 +               if (r)
29203 +                       *r = 0;
29204 +               *q = q1;
29205 +       }
29206 +}
29207 +
29208 +/* Divide 64 bit @n by 32 bit @div. Returns the quotient; the remainder is
29209 +   stored in *@rem unless @rem is NULL. @div must not be 0: it is used as
29210 +   divisor of plain C division below. */
29211 +static inline __u64
29212 +div64_32(__u64 n, __u32 div, __u32 * rem)
29213 +{
29214 +       __u32 hi = n >> 32;
29215 +       __u32 lo = n & 0xffffffff;
29216 +       __u32 quot_lo;
29217 +
29218 +       /* dividend fits into 32 bits: native division is enough */
29219 +       if (hi == 0) {
29220 +               if (rem != NULL)
29221 +                       *rem = lo % div;
29222 +               return lo / div;
29223 +       }
29224 +
29225 +       /* long division: the high word is divided first, its remainder is
29226 +          carried into the division of the low word done by divl() */
29227 +       divl(hi % div, lo, div, &quot_lo, rem);
29228 +       return (__u64) (hi / div) << 32 | quot_lo;
29229 +}
29230 +
29231 +#endif /* __FS_REISER4_LIB_H__ */
29232 +
29233 +/* Make Linus happy.
29234 +   Local variables:
29235 +   c-indentation-style: "K&R"
29236 +   mode-name: "LC"
29237 +   c-basic-offset: 8
29238 +   tab-width: 8
29239 +   fill-column: 120
29240 +   scroll-step: 1
29241 +   End:
29242 +*/
29243 diff -rupN linux-2.6.8-rc3/fs/reiser4/linux-5_reiser4_syscall.patch linux-2.6.8-rc3-a/fs/reiser4/linux-5_reiser4_syscall.patch
29244 --- linux-2.6.8-rc3/fs/reiser4/linux-5_reiser4_syscall.patch    1970-01-01 03:00:00.000000000 +0300
29245 +++ linux-2.6.8-rc3-a/fs/reiser4/linux-5_reiser4_syscall.patch  2004-08-05 21:20:53.285617831 +0400
29246 @@ -0,0 +1,38 @@
29247 +===== arch/um/kernel/sys_call_table.c 1.5 vs edited =====
29248 +--- 1.5/arch/um/kernel/sys_call_table.c        Wed Nov  6 17:36:22 2002
29249 ++++ edited/arch/um/kernel/sys_call_table.c     Fri Dec  6 22:15:35 2002
29250 +@@ -232,6 +232,7 @@
29251 + extern syscall_handler_t sys_io_cancel;
29252 + extern syscall_handler_t sys_exit_group;
29253 + extern syscall_handler_t sys_lookup_dcookie;
29254 ++extern syscall_handler_t sys_reiser4;
29255 +
29256 + #if CONFIG_NFSD
29257 + #define NFSSERVCTL sys_nfsserctl
29258 +@@ -483,6 +484,7 @@
29259 +       [ __NR_free_hugepages ] = sys_ni_syscall,
29260 +       [ __NR_exit_group ] = sys_exit_group,
29261 +       [ __NR_lookup_dcookie ] = sys_lookup_dcookie,
29262 ++      [ __NR_reiser4_sys_call ] = sys_reiser4,
29263 +
29264 +       ARCH_SYSCALLS
29265 +       [ LAST_SYSCALL + 1 ... NR_syscalls ] =
29266 +===== include/asm-i386/unistd.h 1.19 vs edited =====
29267 +--- 1.19/include/asm-i386/unistd.h     Thu Oct 31 18:28:28 2002
29268 ++++ edited/include/asm-i386/unistd.h   Fri Dec  6 22:45:24 2002
29269 +@@ -262,6 +262,7 @@
29270 + #define __NR_sys_epoll_ctl    255
29271 + #define __NR_sys_epoll_wait   256
29272 + #define __NR_remap_file_pages 257
29273 ++#define __NR_reiser4_sys_call 258
29274 +
29275 +
29276 + /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
29277 +@@ -378,6 +379,7 @@
29278 + static inline _syscall1(int,close,int,fd)
29279 + static inline _syscall1(int,_exit,int,exitcode)
29280 + static inline _syscall3(pid_t,waitpid,pid_t,pid,int *,wait_stat,int,options)
29281 ++static inline _syscall1(long,_reiser4_sys_call,char*,p_strIng)
29282 +
29283 + #endif
29284 +
29285 diff -rupN linux-2.6.8-rc3/fs/reiser4/lnode.c linux-2.6.8-rc3-a/fs/reiser4/lnode.c
29286 --- linux-2.6.8-rc3/fs/reiser4/lnode.c  1970-01-01 03:00:00.000000000 +0300
29287 +++ linux-2.6.8-rc3-a/fs/reiser4/lnode.c        2004-08-05 21:20:53.179640184 +0400
29288 @@ -0,0 +1,431 @@
29289 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
29290 +
29291 +/* Lnode manipulation functions. */
29292 +/* Lnode is light-weight node used as common data-structure by both VFS access
29293 +   paths and reiser4() system call processing.
29294 +
29295 +   One of the main targets of reiser4() system call is to allow manipulation
29296 +   on potentially huge number of objects. This makes use of inode in reiser4()
29297 +   impossible. On the other hand there is a need to synchronize reiser4() and
29298 +   VFS access.
29299 +
29300 +   To do this small object (lnode) is allocated (on the stack if possible) for
29301 +   each object involved into reiser4() system call. Such lnode only contains
29302 +   lock, information necessary to link it into global hash table, and
29303 +   condition variable to wake up waiters (see below).
29304 +
29305 +   In other words, lnode is handle that reiser4 keeps for a file system object
29306 +   while object is being actively used. For example, when read is performed by
29307 +   reiser4_read(), lnode exists for inode being read. When reiser4_read()
29308 +   exits lnode is deleted, but inode is still there in the inode cache.
29309 +
29310 +   As lnode only exists while object is being actively manipulated by some
29311 +   threads, it follows that lnodes can always live on the stack of such
29312 +   threads.
29313 +
29314 +   Case-by-case:
29315 +
29316 +     A. access through VFS (reiser4_{read|write|truncate|*}()):
29317 +
29318 +       1. operation starts with inode supplied by VFS.
29319 +
29320 +       2. lget( &local_lnode, LNODE_INODE, inode -> i_ino ) is called. This,
29321 +       if necessary, will wait until sys_reiser4() access to this file is
29322 +       finished, and
29323 +
29324 +       3. add lnode to the per super block hash table.
29325 +
29326 +     B. creation of new inode in reiser4_iget():
29327 +
29328 +       1. create new empty inode (iget(), or icreate())
29329 +
29330 +       2. step A.3. A.2 is not necessary, because we are creating new object
29331 +       and parent is in VFS access (hence sys_reiser4() cannot add/delete
29332 +       objects in parent).
29333 +
29334 +       3. read stat data from disk and initialise inode
29335 +
29336 +     C. sys_reiser4() access:
29337 +
29338 +       1. check for existing inode in a hash-table.
29339 +
29340 +          Rationale: if inode is already here it is advantageous to use it,
29341 +          because it already has information from stat data.
29342 +
29343 +          If inode is found proceed as in case A.
29344 +
29345 +       2. otherwise, lget( &local_lnode, LNODE_LW, oid ) is called.
29346 +
29347 +
29348 +   NOT FINISHED.
29349 +
29350 +
29351 +
29352 +
29353 +
29354 +
29355 +
29356 +   INTERNAL NOTES:
29357 +
29358 +   1. fs/inode.c:inode_lock is not static: we can use it. Good.
29359 +
29360 +   2. but fs/inode.c:find_inode() is. Either write own version, or remove
29361 +   static and EXPORT_SYMBOL-ize it.
29362 +
29363 +
29364 +
29365 +*/
29366 +
29367 +#include "debug.h"
29368 +#include "kcond.h"
29369 +#include "key.h"
29370 +#include "kassign.h"
29371 +#include "plugin/plugin_header.h"
29372 +#include "plugin/plugin_set.h"
29373 +#include "lnode.h"
29374 +#include "super.h"
29375 +#include "reiser4.h"
29376 +
29377 +#include <linux/fs.h>          /* for struct super_block  */
29378 +#include <linux/spinlock.h>
29379 +
29380 +static reiser4_key *lnode_dentry_key(const lnode * node, reiser4_key * result);
29381 +static reiser4_key *lnode_inode_key(const lnode * node, reiser4_key * result);
29382 +static reiser4_key *lnode_lw_key(const lnode * node, reiser4_key * result);
29383 +static int lnode_inode_eq(const lnode * node1, const lnode * node2);
29384 +static int lnode_lw_eq(const lnode * node1, const lnode * node2);
29385 +
29386 +#if REISER4_DEBUG
29387 +static int lnode_valid_type(lnode_type type);
29388 +#endif
29389 +
29390 +/* Common operations for various types of lnodes, indexed by lnode_type.
29391 +   Methods that are NULL are not implemented for that type.
29392 +   NOTE-NIKITA consider making this plugin. */
29393 +static struct {
29394 +       /* get a key of the corresponding file system object into @result */
29395 +       reiser4_key *(*key) (const lnode * node, reiser4_key * result);
29396 +       /* get plugins of the corresponding file system object into @area */
29397 +       int (*get_plugins) (const lnode * node, plugin_set * area);
29398 +       /* set plugins of the corresponding file system object from @area */
29399 +       int (*set_plugins) (lnode * node, const plugin_set * area);
29400 +       /* true if @node1 and @node2 refer to the same object */
29401 +       int (*eq) (const lnode * node1, const lnode * node2);
29402 +} lnode_ops[LNODE_NR_TYPES] = {
29403 +       [LNODE_DENTRY] = {
29404 +               .key = lnode_dentry_key,
29405 +               .get_plugins = NULL,
29406 +               .set_plugins = NULL,
29407 +               .eq = NULL
29408 +       },
29409 +       [LNODE_INODE] = {
29410 +               .key = lnode_inode_key,
29411 +               .get_plugins = NULL,
29412 +               .set_plugins = NULL,
29413 +               .eq = lnode_inode_eq
29414 +       },
29415 +       /*
29416 +       [LNODE_PSEUDO] = {
29417 +               .key = NULL,
29418 +               .get_plugins = NULL,
29419 +               .set_plugins = NULL,
29420 +               .eq = NULL
29421 +       },
29422 +       */
29423 +       [LNODE_LW] = {
29424 +               .key = lnode_lw_key,
29425 +               .get_plugins = NULL,
29426 +               .set_plugins = NULL,
29427 +               .eq = lnode_lw_eq
29428 +       }
29429 +};
29430 +
29431 +/* hash table support */
29432 +
29433 +/* compare two object ids for equality. Used by hash-table macros */
29434 +/* Audited by: green(2002.06.15) */
29435 +static inline int
29436 +oid_eq(const oid_t * o1 /* first oid to compare */ ,
29437 +       const oid_t * o2 /* second oid to compare */ )
29438 +{
29439 +       return *o2 == *o1;
29440 +}
29441 +
29442 +/* Hash lnode by its object id; @table is not used. Used by hash-table macros */
29443 +/* Audited by: green(2002.06.15) */
29444 +static inline __u32
29445 +oid_hash(ln_hash_table *table, const oid_t * o /* oid to hash */ )
29446 +{
29447 +       return (LNODE_HTABLE_BUCKETS - 1) & *o;
29448 +}
29449 +
29450 +/* The hash table definition (presumably generates ln_hash_*() used below) */
29451 +#define KMALLOC(size) kmalloc((size), GFP_KERNEL)
29452 +#define KFREE(ptr, size) kfree(ptr)
29453 +TYPE_SAFE_HASH_DEFINE(ln, lnode, oid_t, h.oid, h.link, oid_hash, oid_eq);
29454 +#undef KFREE
29455 +#undef KMALLOC
29456 +/* the lnode table and the spin lock taken by lget(), lput() and lref() */
29457 +ln_hash_table lnode_htable;
29458 +spinlock_t    lnode_guard = SPIN_LOCK_UNLOCKED;
29459 +
29460 +
29461 +/* true if @required lnode type is compatible with @set lnode type. If lnode
29462 +   types are incompatible, then thread trying to obtain @required type of
29463 +   access will wait until all references (lnodes) of the @set type to the file
29464 +   system object are released.
29465 +
29466 +   Intended example: thread manipulating object through VFS (@required is
29467 +   LNODE_INODE) waits while the object is manipulated through reiser4() call
29468 +   (lnodes of type LNODE_LW exist). NOTE(review): the code below answers
29469 +   "incompatible" only for @set == LNODE_LW with @required != LNODE_INODE,
29470 +   which looks like the opposite of this example -- confirm intent. */
29471 +/* Audited by: green(2002.06.15) */
29472 +reiser4_internal int
29473 +lnode_compatible_type(lnode_type required /* required lnode type */ ,
29474 +                     lnode_type set /* lnode type already set */ )
29475 +{
29476 +       return (set != LNODE_LW) || (required == LNODE_INODE);
29477 +}
29478 +
29479 +/* initialise the global lnode hash table; returns 0 or the error reported
29480 +   by ln_hash_init() */
29481 +/* Audited by: green(2002.06.15) */
29482 +reiser4_internal int
29483 +lnodes_init(void)
29484 +{
29485 +       return ln_hash_init(&lnode_htable, LNODE_HTABLE_BUCKETS, NULL);
29486 +}
29487 +
29488 +/* release lnode hash table set up by lnodes_init(). The table is global,
29489 +   not per super block. Always returns 0. */
29490 +/* Audited by: green(2002.06.15) */
29491 +reiser4_internal int lnodes_done(void)
29492 +{
29493 +       ln_hash_done(&lnode_htable);
29494 +       return 0;
29495 +}
29496 +
29497 +/* Acquire handle to file system object.
29498 +
29499 +   First check whether there is already lnode for this oid in a hash table.
29500 +   If no---allocate new lnode and add it into the hash table. If hash table
29501 +   already contains lnode with such oid, and incompatible type, wait until
29502 +   said lnode is deleted. If compatible lnode is found in the hash table,
29503 +   increase its reference counter and return.
29504 +
29505 +   Returns lnode (to be released by lput()), or ERR_PTR(): error returned by
29506 +   interruptible kcond_wait(), or -ENOMEM if new lnode can't be allocated.
29507 +
29508 +*/
29509 +/* Audited by: green(2002.06.15) */
29510 +reiser4_internal lnode *
29511 +lget(lnode_type type /* lnode type */ , oid_t oid /* objectid */ )
29512 +{
29513 +       lnode *result;
29514 +
29515 +       assert("nikita-1866", lnode_valid_type(type));
29516 +
29517 +       spin_lock(&lnode_guard);
29518 +       /* check hash table */
29519 +       while ((result = ln_hash_find(&lnode_htable, &oid)) != NULL) {
29520 +               int ret;
29521 +
29522 +               if (lnode_compatible_type(type, result->h.type)) {
29523 +                       /* compatible lnode found in the hash table. Just
29524 +                          return it. */
29525 +                       ++result->h.ref;
29526 +                       break;
29527 +               }
29528 +               /* lnode is of incompatible type: wait until all incompatible
29529 +                  users go away, then look the oid up again. For example, if
29530 +                  our @type is LNODE_INODE (VFS access), wait until all
29531 +                  reiser4() manipulations with this object finish. */
29532 +               ret = kcond_wait(&result->h.cvar, &lnode_guard, 1);
29533 +               if (ret != 0) {
29534 +                       result = ERR_PTR(ret);
29535 +                       break;
29536 +               }
29537 +       }
29538 +       if (result == NULL) {
29539 +               /* lnode wasn't found in the hash table: allocate new one and
29540 +                  add it into hash table. GFP_ATOMIC, because lnode_guard
29541 +                  spin lock is held and allocation must not sleep. */
29542 +               result = kmalloc(sizeof *result, GFP_ATOMIC);
29543 +               if (result == NULL) {
29544 +                       result = ERR_PTR(RETERR(-ENOMEM));
29545 +               } else {
29546 +                       xmemset(result, 0, sizeof *result);
29547 +                       result->h.type = type;
29548 +                       result->h.oid = oid;
29549 +                       kcond_init(&result->h.cvar);
29550 +                       result->h.ref = 1;
29551 +                       ln_hash_insert(&lnode_htable, result);
29552 +               }
29553 +       }
29554 +       spin_unlock(&lnode_guard);
29555 +       return result;
29556 +}
29557 +
29558 +/* release reference to file system object; dropping the last reference
29559 +   removes @node from the hash table, wakes up waiters and frees it */
29560 +/* Audited by: green(2002.06.15) */
29561 +reiser4_internal void
29562 +lput(lnode * node /* lnode to release */ )
29563 +{
29564 +       assert("nikita-1864", node != NULL);
29565 +       assert("nikita-1961", lnode_valid_type(node->h.type));  /* man in a space */
29566 +       spin_lock(&lnode_guard);
29567 +       assert("nikita-1878", ln_hash_find(&lnode_htable, &node->h.oid) == node);
29568 +       node->h.ref -= 1;
29569 +       if (node->h.ref == 0) {
29570 +               ln_hash_remove(&lnode_htable, node);
29571 +               kcond_broadcast(&node->h.cvar);
29572 +               kfree(node);
29573 +       }
29574 +       spin_unlock(&lnode_guard);
29575 +}
29576 +
29577 +/* acquire one more reference to @node under lnode_guard; returns @node */
29578 +reiser4_internal lnode *lref(lnode * node)
29579 +{
29580 +       assert("nikita-3241", node != NULL);
29581 +       assert("nikita-3242", lnode_valid_type(node->h.type));
29582 +
29583 +       spin_lock(&lnode_guard);
29584 +       node->h.ref += 1;
29585 +       spin_unlock(&lnode_guard);
29586 +       return node;
29587 +}
29588 +
29589 +/* true if @node1 and @node2 refer to the same object */
29590 +/* Audited by: green(2002.06.15) */
29591 +reiser4_internal int
29592 +lnode_eq(const lnode * node1 /* first node to compare */ ,
29593 +        const lnode * node2 /* second node to compare */ )
29594 +{
29595 +       assert("nikita-1921", node1 != NULL);
29596 +       assert("nikita-1922", node2 != NULL);   /* Finnegans Wake started */
29597 +
29598 +       if (node1->h.oid != node2->h.oid || node1->h.type != node2->h.type)
29599 +               return 0;
29600 +       /* ->eq is NULL for some types (e.g. LNODE_DENTRY): oid and type decide */
29601 +       if (lnode_ops[node1->h.type].eq == NULL)
29602 +               return 1;
29603 +       return lnode_ops[node1->h.type].eq(node1, node2);
29604 +}
29605 +
29606 +/* return key of object behind @node, as produced by ->key() of its type */
29607 +/* Audited by: green(2002.06.15) */
29608 +reiser4_internal reiser4_key *
29609 +lnode_key(const lnode * node /* lnode to query */ ,
29610 +         reiser4_key * result /* result */ )
29611 +{
29612 +       assert("nikita-1849", node != NULL);
29613 +       assert("nikita-1855", lnode_valid_type(node->h.type));
29614 +       return (*lnode_ops[node->h.type].key) (node, result);
29615 +}
29616 +
29617 +/* return plugins of object behind @node; -EINVAL if its type has no method */
29618 +/* Audited by: green(2002.06.15) */
29619 +reiser4_internal int get_lnode_plugins(const lnode * node, plugin_set * area)
29620 +{
29621 +       assert("nikita-1853", node != NULL);
29622 +       assert("nikita-1858", lnode_valid_type(node->h.type));
29623 +       if (lnode_ops[node->h.type].get_plugins == NULL)
29624 +               return RETERR(-EINVAL);
29625 +       return lnode_ops[node->h.type].get_plugins(node, area);
29626 +}
29627 +
29628 +/* set plugins of object behind @node; -EINVAL if its type has no method */
29629 +/* Audited by: green(2002.06.15) */
29630 +reiser4_internal int set_lnode_plugins(lnode * node, const plugin_set * area)
29631 +{
29632 +       assert("nikita-1859", node != NULL);
29633 +       assert("nikita-1860", lnode_valid_type(node->h.type));
29634 +       if (lnode_ops[node->h.type].set_plugins == NULL)
29635 +               return RETERR(-EINVAL);
29636 +       return lnode_ops[node->h.type].set_plugins(node, area);
29637 +}
29638 +
29639 +#if REISER4_DEBUG
29640 +/* true if @type is valid lnode type; debugging aid used by assertions */
29641 +/* Audited by: green(2002.06.15) */
29642 +static int lnode_valid_type(lnode_type type /* would-be lnode type */ )
29643 +{
29644 +       /* valid types are the enum values below LNODE_NR_TYPES */
29645 +       return LNODE_NR_TYPES > type;
29646 +}
29647 +#endif
29648 +
29649 +/* return key (built from ->d_inode) of object behind dentry-based @node */
29650 +static reiser4_key *
29651 +lnode_dentry_key(const lnode * node /* lnode to query */ ,
29652 +               reiser4_key * result /* result */ )
29653 +{
29654 +       return build_sd_key(node->dentry.dentry->d_inode, result);
29655 +}
29656 +
29657 +
29658 +
29659 +/* ->key() method of LNODE_INODE: have build_sd_key() build the key of the
29660 +   inode behind @node in @result; returns what build_sd_key() returns */
29661 +/* Audited by: green(2002.06.15) */
29662 +static reiser4_key *lnode_inode_key(const lnode * node, reiser4_key * result)
29663 +{
29664 +       struct inode *object = node->inode.inode;
29665 +       return build_sd_key(object, result);
29666 +}
29667 +
29668 +/* ->key() method of LNODE_LW: light-weight lnode carries the key itself,
29669 +   copy it into @result and return @result */
29670 +/* Audited by: green(2002.06.15) */
29671 +static reiser4_key *lnode_lw_key(const lnode * node, reiser4_key * result)
29672 +{
29673 +       const reiser4_key *stored = &node->lw.key;
29674 +       *result = *stored;
29675 +       return result;
29676 +}
29677 +
29678 +/* ->eq() method of LNODE_INODE: inode-based lnodes are equal when they
29679 +   point to the very same struct inode */
29680 +/* Audited by: green(2002.06.15) */
29681 +static int
29682 +lnode_inode_eq(const lnode * node1 /* first node to compare */ ,
29683 +              const lnode * node2 /* second node to compare */ )
29684 +{
29685 +       assert("nikita-1923", node1 != NULL);
29686 +       assert("nikita-1924", node2 != NULL);
29687 +       assert("nikita-1927", node1->inode.inode != NULL);
29688 +       assert("nikita-1928", node2->inode.inode != NULL);
29689 +       if (node1->inode.inode != node2->inode.inode)
29690 +               return 0;
29691 +       return 1;
29692 +}
29693 +
29694 +/* ->eq() method of LNODE_LW: lnode_eq() only gets here after oids and
29695 +   types were found equal, and that is all it takes for light-weight
29696 +   lnodes; arguments are only looked at by assertions, hence UNUSED_ARG */
29697 +/* Audited by: green(2002.06.15) */
29698 +static int
29699 +lnode_lw_eq(const lnode * node1 UNUSED_ARG /* first node to compare */ ,
29700 +           const lnode * node2 UNUSED_ARG /* second node to compare */ )
29701 +{
29702 +       assert("nikita-1925", node1 != NULL);
29703 +       assert("nikita-1926", node2 != NULL);
29704 +
29705 +       /* we only get there if oids are equal; keys are expected to agree */
29706 +       assert("nikita-1929", node1->h.oid == node2->h.oid);
29707 +       assert("nikita-1930", keyeq(&node1->lw.key, &node2->lw.key));
29708 +       return 1;
29709 +}
29710 +
29711 +/* Make Linus happy.
29712 +   Local variables:
29713 +   c-indentation-style: "K&R"
29714 +   mode-name: "LC"
29715 +   c-basic-offset: 8
29716 +   tab-width: 8
29717 +   fill-column: 120
29718 +   End:
29719 +*/
29720 diff -rupN linux-2.6.8-rc3/fs/reiser4/lnode.h linux-2.6.8-rc3-a/fs/reiser4/lnode.h
29721 --- linux-2.6.8-rc3/fs/reiser4/lnode.h  1970-01-01 03:00:00.000000000 +0300
29722 +++ linux-2.6.8-rc3-a/fs/reiser4/lnode.h        2004-08-05 21:20:53.278619307 +0400
29723 @@ -0,0 +1,121 @@
29724 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
29725 +
29726 +/* Declaration of lnode (light-weight node). */
29727 +
29728 +#ifndef __LNODE_H__
29729 +#define __LNODE_H__
29730 +
29731 +#include "forward.h"
29732 +#include "dformat.h"
29733 +#include "kcond.h"
29734 +#include "type_safe_hash.h"
29735 +#include "plugin/plugin_header.h"
29736 +#include "plugin/plugin_set.h"
29737 +#include "key.h"
29738 +
29739 +#include <linux/types.h>       /* for __u??  */
29740 +#include <linux/fs.h>          /* for struct super_block, etc.  */
29741 +#include <linux/dcache.h>      /* for struct dentry, etc.  */
29742 +
29743 +typedef enum {          /* kinds of lnode; stored in lnode_header.type */
29744 +       LNODE_DENTRY,           /* names parallel the members of union lnode -- presumably one-to-one, confirm in lnode.c */
29745 +       LNODE_INODE,
29746 +       LNODE_REISER4_INODE,
29747 +       LNODE_LW,
29748 +       LNODE_PSEUDO,
29749 +       LNODE_NR_TYPES          /* last enumerator: equals the number of kinds listed above */
29750 +} lnode_type;
29751 +
29752 +typedef union lnode lnode;
29753 +
29754 +/* declare hash table of lnodes (prefix `ln', element type lnode) */
29755 +TYPE_SAFE_HASH_DECLARE(ln, lnode);
29756 +
29757 +/* common part of various lnode types; it is the first member (`h') of every lnode variant and of union lnode */
29758 +typedef struct lnode_header {
29759 +       /* lnode type. Taken from lnode_type enum. Never changed after
29760 +          initialisation, so needs no locking.  */
29761 +       __u8 type;
29762 +       /* unused. Alignment requires this anyway. */
29763 +       __u8 flags;
29764 +       /* condition variable to wake up waiters */
29765 +       kcond_t cvar;
29766 +       /* hash table linkage. Updated under hash-table spinlock. */
29767 +       ln_hash_link link;
29768 +       /* objectid of underlying file system object. Never changed after
29769 +          initialisation, so needs no locking.  */
29770 +       oid_t oid;
29771 +       /* reference counter. Updated under hash-table spinlock. */
29772 +       int ref;
29773 +} lnode_header;
29774 +
29775 +typedef struct lnode_dentry {   /* lnode variant that refers to a dentry */
29776 +       lnode_header h;         /* common header (first member, as in every variant) */
29777 +       struct dentry *dentry;  /* the dentry this lnode stands for */
29778 +       struct vfsmount *mnt;   /* NOTE(review): presumably the mount @dentry belongs to -- confirm against users */
29779 +} lnode_dentry;
29780 +
29781 +typedef struct lnode_inode {    /* lnode variant that refers to a VFS inode */
29782 +       lnode_header h;         /* common header (first member, as in every variant) */
29783 +       struct inode *inode;    /* the inode this lnode stands for */
29784 +} lnode_inode;
29785 +
29786 +typedef struct lnode_reiser4_inode {    /* lnode variant that refers to a struct reiser4_inode */
29787 +       lnode_header h;                 /* common header (first member, as in every variant) */
29788 +       struct reiser4_inode *inode;    /* the reiser4 inode this lnode stands for */
29789 +} lnode_reiser4_inode;
29790 +
29791 +typedef struct lnode_lw {       /* light-weight lnode variant: identified by a key rather than by an in-memory object pointer */
29792 +       lnode_header h;         /* common header (first member, as in every variant) */
29793 +       struct super_block * lw_sb;     /* NOTE(review): presumably the super block the key belongs to -- confirm */
29794 +       reiser4_key key;        /* key of the object; this is the field lnode_lw_eq() asserts equality on */
29795 +} lnode_lw;
29796 +
29797 +struct assign_result {  /* embedded in lnode_pseudo as `rez' */
29798 +       loff_t len ;            /* NOTE(review): presumably a length produced by an assignment -- confirm against users */
29799 +       int return_code ;       /* NOTE(review): presumably the status of that operation -- confirm against users */
29800 +};
29801 +
29802 +typedef struct lnode_pseudo {   /* lnode variant for LNODE_PSEUDO objects; carries only a result record */
29803 +       lnode_header h;         /* common header (first member, as in every variant) */
29804 +       struct assign_result rez;       /* result record, see struct assign_result */
29805 +
29806 +       //      lnode *host;
29807 +       /* something to identify pseudo file type, like name or plugin */
29808 +} lnode_pseudo;
29809 +
29810 +union lnode {           /* every member begins with an lnode_header, so ->h is usable whichever variant is stored */
29811 +       lnode_header h;         /* header-only view; h.type holds a value taken from lnode_type */
29812 +       lnode_dentry dentry;
29813 +       lnode_inode inode;
29814 +       lnode_reiser4_inode reiser4_inode;
29815 +       lnode_lw lw;
29816 +       lnode_pseudo pseudo;
29817 +};
29818 +
29819 +extern int lnodes_init(void);   /* NOTE(review): implementations are not in this header; notes below are inferred from names -- confirm in lnode.c */
29820 +extern int lnodes_done(void);
29821 +
29822 +extern lnode *lget( lnode_type type, oid_t oid);       /* presumably finds or creates the lnode for @oid and takes a reference */
29823 +extern void lput(lnode * node);                         /* presumably drops a reference (see lnode_header.ref) */
29824 +extern int lnode_eq(const lnode * node1, const lnode * node2);
29825 +extern lnode *lref(lnode * node);                       /* presumably takes an additional reference on @node */
29826 +
29827 +extern struct inode *inode_by_lnode(const lnode * node);
29828 +extern reiser4_key *lnode_key(const lnode * node, reiser4_key * result);
29829 +
29830 +extern int get_lnode_plugins(const lnode * node, plugin_set * area);
29831 +extern int set_lnode_plugins(lnode * node, const plugin_set * area);
29832 +
29833 +/* __LNODE_H__ */
29834 +#endif
29835 +
29836 +/* Make Linus happy.
29837 +   Local variables:
29838 +   c-indentation-style: "K&R"
29839 +   mode-name: "LC"
29840 +   c-basic-offset: 8
29841 +   tab-width: 8
29842 +   fill-column: 120
29843 +   End:
29844 +*/
29845 diff -rupN linux-2.6.8-rc3/fs/reiser4/lock.c linux-2.6.8-rc3-a/fs/reiser4/lock.c
29846 --- linux-2.6.8-rc3/fs/reiser4/lock.c   1970-01-01 03:00:00.000000000 +0300
29847 +++ linux-2.6.8-rc3-a/fs/reiser4/lock.c 2004-08-05 21:20:53.196636599 +0400
29848 @@ -0,0 +1,1505 @@
29849 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
29850 + * reiser4/README */
29851 +
29852 +/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
29853 +   order.  V4 balances the tree from the bottom up, and searches the tree from
29854 +   the top down, and that is really the way we want it, so tradition won't work
29855 +   for us.
29856 +
29857 +   Instead we have two lock orderings, a high priority lock ordering, and a low
29858 +   priority lock ordering.  Each node in the tree has a lock in its znode.
29859 +
29860 +   Suppose we have a set of processes which lock (R/W) tree nodes. Each process
29861 +   has a set (maybe empty) of already locked nodes ("process locked set"). Each
29862 +   process may have a pending lock request to a node locked by another process.
29863 +   Note: we lock and unlock, but do not transfer locks: it is possible
29864 +   transferring locks instead would save some bus locking....
29865 +
29866 +   Deadlock occurs when we have a loop constructed from process locked sets and
29867 +   lock request vectors.
29868 +
29869 +
29870 +   NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
29871 +   memory is extended with "znodes" with which we connect nodes with their left
29872 +   and right neighbors using sibling pointers stored in the znodes.  When we
29873 +   perform balancing operations we often go from left to right and from right to
29874 +   left.
29875 +
29876 +
29877 +   +-P1-+          +-P3-+
29878 +   |+--+|   V1     |+--+|
29879 +   ||N1|| -------> ||N3||
29880 +   |+--+|          |+--+|
29881 +   +----+          +----+
29882 +     ^               |
29883 +     |V2             |V3
29884 +     |               v
29885 +   +---------P2---------+
29886 +   |+--+            +--+|
29887 +   ||N2|  --------  |N4||
29888 +   |+--+            +--+|
29889 +   +--------------------+
29890 +
29891 +   We solve this by ensuring that only low priority processes lock in top to
29892 +   bottom order and from right to left, and high priority processes lock from
29893 +   bottom to top and left to right.
29894 +
29895 +   ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
29896 +   kill those damn busy loops.
29897 +   ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
29898 +   stage) cannot be ordered that way. There are no rules what nodes can belong
29899 +   to the atom and what nodes cannot.  We cannot define what is right or left
29900 +   direction, what is top or bottom.  We can take immediate parent or side
29901 +   neighbor of one node, but nobody guarantees that, say, left neighbor node is
29902 +   not a far right neighbor for other nodes from the same atom.  It breaks
29903 +   deadlock avoidance rules and hi-low priority locking cannot be applied for
29904 +   atom locks.
29905 +
29906 +   How does it help to avoid deadlocks ?
29907 +
29908 +   Suppose we have a deadlock with n processes. Processes from one priority
29909 +   class never deadlock because they take locks in one consistent
29910 +   order.
29911 +
29912 +   So, any possible deadlock loop must have low priority as well as high
29913 +   priority processes.  There are no other lock priority levels except low and
29914 +   high. We know that any deadlock loop contains at least one node locked by a
29915 +   low priority process and requested by a high priority process. If this
29916 +   situation is caught and resolved it is sufficient to avoid deadlocks.
29917 +
29918 +   V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
29919 +
29920 +   The deadlock prevention algorithm is based on comparing
29921 +   priorities of node owners (processes which keep znode locked) and
29922 +   requesters (processes which want to acquire a lock on znode).  We
29923 +   implement a scheme where low-priority owners yield locks to
29924 +   high-priority requesters. We created a signal passing system that
29925 +   is used to ask low-priority processes to yield one or more locked
29926 +   znodes.
29927 +
29928 +   The condition when a znode needs to change its owners is described by the
29929 +   following formula:
29930 +
29931 +   #############################################
29932 +   #                                           #
29933 +   # (number of high-priority requesters) >  0 #
29934 +   #                AND                        #
29935 +   # (number of high-priority owners)     == 0 #
29936 +   #                                           #
29937 +   #############################################
29938 +
29939 +     Note that a low-priority process
29940 +     delays node releasing if another high-priority process owns this node.  So, slightly more strictly speaking, to have a deadlock capable cycle you must have a loop in which a high priority process is waiting on a low priority process to yield a node, which is slightly different from saying a high priority process is waiting on a node owned by a low priority process.
29941 +
29942 +   It is enough to avoid deadlocks if we prevent any low-priority process from
29943 +   falling asleep if its locked set contains a node which satisfies the
29944 +   deadlock condition.
29945 +
29946 +   That condition is implicitly or explicitly checked in all places where new
29947 +   high-priority requests may be added or removed from node request queue or
29948 +   high-priority process takes or releases a lock on node. The main
29949 +   goal of these checks is to never lose the moment when node becomes "has
29950 +   wrong owners" and send "must-yield-this-lock" signals to its low-pri owners
29951 +   at that time.
29952 +
29953 +   The information about received signals is stored in the per-process
29954 +   structure (lock stack) and analyzed before a low-priority process goes to
29955 +   sleep but after a "fast" attempt to lock a node fails. Any signal wakes
29956 +   sleeping process up and forces him to re-check lock status and received
29957 +   signal info. If "must-yield-this-lock" signals were received the locking
29958 +   primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code.
29959 +
29960 +   V4 LOCKING DRAWBACKS
29961 +
29962 +   If we have already balanced on one level, and we are propagating our changes upward to a higher level, it could be
29963 +   very messy to surrender all locks on the lower level because we put so much computational work into it, and reverting
29964 +   them to their state before they were locked might be very complex.  We also don't want to acquire all locks before
29965 +   performing balancing because that would either be almost as much work as the balancing, or it would be too
29966 +   conservative and lock too much.  We want balancing to be done only at high priority.  Yet, we might want to go to the
29967 +   left one node and use some of its empty space... So we make one attempt at getting the node to the left using
29968 +   try_lock, and if it fails we do without it, because we didn't really need it, it was only a nice to have.
29969 +
29970 +   LOCK STRUCTURES DESCRIPTION
29971 +
29972 +   The following data structures are used in the reiser4 locking
29973 +   implementation:
29974 +
29975 +   All fields related to long-term locking are stored in znode->lock.
29976 +
29977 +   The lock stack is a per thread object.  It owns all znodes locked by the
29978 +   thread. One znode may be locked by several threads in case of read lock or
29979 +   one znode may be write locked by one thread several times. The special link
29980 +   objects (lock handles) support n<->m relation between znodes and lock
29981 +   owners.
29982 +
29983 +   <Thread 1>                       <Thread 2>
29984 +
29985 +   +---------+                     +---------+
29986 +   |  LS1    |                     |  LS2    |
29987 +   +---------+                     +---------+
29988 +       ^                                ^
29989 +       |---------------+                +----------+
29990 +       v               v                v          v
29991 +   +---------+      +---------+    +---------+   +---------+
29992 +   |  LH1    |      |   LH2   |    |  LH3    |   |   LH4   |
29993 +   +---------+      +---------+    +---------+   +---------+
29994 +       ^                   ^            ^           ^
29995 +       |                   +------------+           |
29996 +       v                   v                        v
29997 +   +---------+      +---------+                  +---------+
29998 +   |  Z1     |      |   Z2    |                  |  Z3     |
29999 +   +---------+      +---------+                  +---------+
30000 +
30001 +   Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The picture above shows that lock stack LS1 has a
30002 +   list of 2 lock handles LH1 and LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it.  Znode Z1 is
30003 +   locked by only one thread, znode has only one lock handle LH1 on its list, similar situation is for Z3 which is
30004 +   locked by the thread 2 only. Z2 is locked (for read) twice by different threads and two lock handles are on its
30005 +   list. Each lock handle represents a single locking relation between a znode and a thread. Locking a znode means
30006 +   establishing such a relation between the lock stack and the znode by adding a new lock handle both to the lock
30007 +   stack's list and to the znode's list.  The lock stack links all lock handles for all znodes locked by the lock stack.  The znode
30008 +   list groups all lock handles for all lock stacks which locked the znode.
30009 +
30010 +   Yet another relation may exist between znode and lock owners.  If lock
30011 +   procedure cannot immediately take a lock on an object it adds the lock owner
30012 +   to the special `requestors' list which belongs to the znode.  That list represents a
30013 +   queue of pending lock requests.  Because one lock owner may request only
30014 +   one lock object at a time, it is a 1->n relation between lock objects
30015 +   and a lock owner implemented as it is described above. Full information
30016 +   (priority, pointers to lock and link objects) about each lock request is
30017 +   stored in lock owner structure in `request' field.
30018 +
30019 +   SHORT_TERM LOCKING
30020 +
30021 +   This is a list of primitive operations over lock stacks / lock handles /
30022 +   znodes and locking descriptions for them.
30023 +
30024 +   1. locking / unlocking which is done by two list insertion/deletion, one
30025 +      to/from znode's list of lock handles, another one is to/from lock stack's
30026 +      list of lock handles.  The first insertion is protected by
30027 +      znode->lock.guard spinlock.  The list owned by the lock stack can be
30028 +      modified only by thread who owns the lock stack and nobody else can
30029 +      modify/read it. There is nothing to be protected by a spinlock or
30030 +      something else.
30031 +
30032 +   2. adding/removing a lock request to/from znode requesters list. The rule is
30033 +      that znode->lock.guard spinlock should be taken for this.
30034 +
30035 +   3. we can traverse list of lock handles and use references to lock stacks who
30036 +      locked given znode if znode->lock.guard spinlock is taken.
30037 +
30038 +   4. If a lock stack is associated with a znode as a lock requestor or lock
30039 +      owner its existence is guaranteed by znode->lock.guard spinlock.  Some of its
30040 +      (lock stack's) fields should be protected from being accessed in parallel
30041 +      by two or more threads. Please look at  lock_stack structure definition
30042 +      for the info how those fields are protected. */
30043 +
30044 +/* Znode lock and capturing intertwining. */
30045 +/* In current implementation we capture formatted nodes before locking
30046 +   them. Take a look at longterm_lock_znode(): the try_capture() request precedes
30047 +   locking requests.  The longterm_lock_znode function unconditionally captures
30048 +   znode before even checking locking conditions.
30049 +
30050 +   Another variant is to capture znode after locking it.  It was not tested, but
30051 +   at least one deadlock condition is supposed to be there.  One thread has
30052 +   locked a znode (Node-1) and calls try_capture() for it.  Try_capture() sleeps
30053 +   because znode's atom has CAPTURE_WAIT state.  Second thread is a flushing
30054 +   thread, its current atom is the atom Node-1 belongs to. Second thread wants
30055 +   to lock Node-1 and sleeps because Node-1 is locked by the first thread.  The
30056 +   described situation is a deadlock. */
30057 +
30058 +#include "debug.h"
30059 +#include "txnmgr.h"
30060 +#include "znode.h"
30061 +#include "jnode.h"
30062 +#include "tree.h"
30063 +#include "plugin/node/node.h"
30064 +#include "super.h"
30065 +
30066 +#include <linux/spinlock.h>
30067 +
30068 +#if REISER4_DEBUG
30069 +static int request_is_deadlock_safe(znode *, znode_lock_mode,
30070 +                                   znode_lock_request);
30071 +#endif
30072 +
30073 +#define ADDSTAT(node, counter)                                                 \
30074 +       reiser4_stat_inc_at_level(znode_get_level(node), znode.counter) /* bump statistic znode.<counter> at the tree level of @node */
30075 +
30076 +/* Returns the lock owner (lock stack) of the current thread: the `stack' member of the current reiser4 context */
30077 +reiser4_internal lock_stack *
30078 +get_current_lock_stack(void)
30079 +{
30080 +       return &get_current_context()->stack;   /* the lock stack is embedded in the context, not allocated separately */
30081 +}
30082 +
30083 +/* Wakes up all low priority owners informing them about possible deadlock. Every handle on @node's owners list is marked signaled and its owner woken (no priority test is made here). Caller holds node->lock. */
30084 +static void
30085 +wake_up_all_lopri_owners(znode * node)
30086 +{
30087 +       lock_handle *handle;
30088 +
30089 +       assert("nikita-1824", rw_zlock_is_locked(&node->lock));
30090 +       for_all_type_safe_list(owners, &node->lock.owners, handle) {
30091 +               spin_lock_stack(handle->owner);         /* owner's stack lock is held across the signal and the wake-up */
30092 +
30093 +               assert("nikita-1832", handle->node == node);
30094 +               /* count this signal in owner->nr_signaled, at most once per handle */
30095 +               if (!handle->signaled) {
30096 +                       handle->signaled = 1;
30097 +                       atomic_inc(&handle->owner->nr_signaled);
30098 +               }
30099 +               /* Wake up the process that owns this handle */
30100 +               __reiser4_wake_up(handle->owner);
30101 +
30102 +               spin_unlock_stack(handle->owner);
30103 +       }
30104 +}
30105 +
30106 +/* Adds a lock to a lock owner, which means creating a link to the lock and
30107 +   putting the link into the two lists all links are on (the doubly linked list
30108 +   that forms the lock_stack, and the doubly linked list of links attached
30109 +   to a lock).  @handle must be free (owner == NULL), @owner must be the current thread's stack, node->lock must be held.
30110 +*/
30111 +static inline void
30112 +link_object(lock_handle * handle, lock_stack * owner, znode * node)
30113 +{
30114 +       assert("jmacd-810", handle->owner == NULL);
30115 +       assert("nikita-1828", owner == get_current_lock_stack());
30116 +       assert("nikita-1830", rw_zlock_is_locked(&node->lock));
30117 +
30118 +       handle->owner = owner;
30119 +       handle->node = node;
30120 +
30121 +       assert("reiser4-4", ergo(locks_list_empty(&owner->locks), owner->nr_locks == 0));
30122 +       locks_list_push_back(&owner->locks, handle);    /* per-thread list: appended at the tail */
30123 +       owner->nr_locks ++;
30124 +
30125 +       owners_list_push_front(&node->lock.owners, handle);     /* per-node list: newest handle first; recursive() and znode_is_write_locked() look at the front */
30126 +       handle->signaled = 0;   /* a freshly linked handle starts unsignaled */
30127 +}
30128 +
30129 +/* Breaks a relation between a lock and its owner: removes @handle from both lists and marks it free. node->lock must be held. */
30130 +static inline void
30131 +unlink_object(lock_handle * handle)
30132 +{
30133 +       assert("zam-354", handle->owner != NULL);
30134 +       assert("nikita-1608", handle->node != NULL);
30135 +       assert("nikita-1633", rw_zlock_is_locked(&handle->node->lock));
30136 +       assert("nikita-1829", handle->owner == get_current_lock_stack());
30137 +
30138 +       assert("reiser4-5", handle->owner->nr_locks > 0);
30139 +       locks_list_remove_clean(handle);        /* off the owner's (per-thread) list */
30140 +       handle->owner->nr_locks --;
30141 +       assert("reiser4-6", ergo(locks_list_empty(&handle->owner->locks), handle->owner->nr_locks == 0));
30142 +
30143 +       owners_list_remove_clean(handle);       /* off the node's owners list */
30144 +
30145 +       /* indicates that lock handle is free now; note that handle->node is left untouched */
30146 +       handle->owner = NULL;
30147 +}
30148 +
30149 +/* Actually locks an object knowing that we are able to do this: grants the lock described by owner->request (node, mode, handle) to @owner */
30150 +static void
30151 +lock_object(lock_stack * owner)
30152 +{
30153 +       lock_request *request;
30154 +       znode        *node;
30155 +       assert("nikita-1839", owner == get_current_lock_stack());
30156 +
30157 +       request = &owner->request;
30158 +       node    = request->node;
30159 +       assert("nikita-1834", rw_zlock_is_locked(&node->lock));
30160 +       if (request->mode == ZNODE_READ_LOCK) {
30161 +               node->lock.nr_readers++;        /* read locks count upwards ... */
30162 +       } else {
30163 +               /* check that we didn't switch from read to write lock */
30164 +               assert("nikita-1840", node->lock.nr_readers <= 0);
30165 +               /* We allow recursive locking; a node can be locked several
30166 +                  times for write by same process */
30167 +               node->lock.nr_readers--;        /* ... write locks count downwards, so a negative value means write-locked */
30168 +       }
30169 +
30170 +       link_object(request->handle, owner, node);
30171 +
30172 +       if (owner->curpri) {
30173 +               node->lock.nr_hipri_owners++;   /* counted once per handle taken while the owner is high priority */
30174 +       }
30175 +       ON_TRACE(TRACE_LOCKS,
30176 +                "%spri lock: %p node: %p: hipri_owners: %u: nr_readers: %d\n",
30177 +                owner->curpri ? "hi" : "lo", owner, node, node->lock.nr_hipri_owners, node->lock.nr_readers);
30178 +}
30179 +
30180 +/* Check for recursive write locking: returns non-zero iff the first handle on the requested node's owners list belongs to @owner */
30181 +static int
30182 +recursive(lock_stack * owner)
30183 +{
30184 +       int ret;
30185 +       znode *node;
30186 +
30187 +       node = owner->request.node;
30188 +
30189 +       /* Owners list is not empty for a locked node */
30190 +       assert("zam-314", !owners_list_empty(&node->lock.owners));
30191 +       assert("nikita-1841", owner == get_current_lock_stack());
30192 +       assert("nikita-1848", rw_zlock_is_locked(&node->lock));
30193 +
30194 +       ret = (owners_list_front(&node->lock.owners)->owner == owner);  /* only the front handle is inspected */
30195 +
30196 +       /* Recursive read locking should be done the usual way */
30197 +       assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
30198 +       /* mixing of read/write locks is not allowed */
30199 +       assert("zam-341", !ret || znode_is_wlocked(node));
30200 +
30201 +       return ret;
30202 +}
30203 +
30204 +#if REISER4_DEBUG       /* debugging helper: compiled only when REISER4_DEBUG is non-zero */
30205 +/* Returns 1 if @node is locked and the calling thread's lock stack holds a handle for it (in any mode), 0 otherwise. */
30206 +int
30207 +znode_is_any_locked(const znode * node)
30208 +{
30209 +       lock_handle *handle;
30210 +       lock_stack *stack;
30211 +       int ret;
30212 +
30213 +       if (!znode_is_locked(node)) {   /* nobody holds it, so neither do we */
30214 +               return 0;
30215 +       }
30216 +
30217 +       stack = get_current_lock_stack();
30218 +
30219 +       spin_lock_stack(stack);
30220 +
30221 +       ret = 0;
30222 +
30223 +       for_all_type_safe_list(locks, &stack->locks, handle) {  /* scan the handles owned by this thread */
30224 +               if (handle->node == node) {
30225 +                       ret = 1;
30226 +                       break;
30227 +               }
30228 +       }
30229 +
30230 +       spin_unlock_stack(stack);
30231 +
30232 +       return ret;
30233 +}
30234 +
30235 +#endif  /* REISER4_DEBUG */
30236 +
30237 +/* Returns true if a write lock on @node is held by the calling thread; 0 if @node is not write locked at all. */
30238 +reiser4_internal int
30239 +znode_is_write_locked(const znode * node)
30240 +{
30241 +       lock_stack *stack;
30242 +       lock_handle *handle;
30243 +
30244 +       assert("jmacd-8765", node != NULL);
30245 +
30246 +       if (!znode_is_wlocked(node)) {
30247 +               return 0;
30248 +       }
30249 +
30250 +       stack = get_current_lock_stack();
30251 +
30252 +       /* A write-locked node is owned by one lock stack only (possibly through several handles), so checking the first handle is enough. */
30253 +       handle = owners_list_front(&node->lock.owners);
30254 +
30255 +       return (handle->owner == stack);
30256 +}
30257 +
30258 +/* This "deadlock" condition is the essential part of reiser4 locking
30259 +   implementation. This condition is checked explicitly by calling
30260 +   check_deadlock_condition() or implicitly in all places where znode lock
30261 +   state (set of owners and request queue) is changed. Locking code is
30262 +   designed to use this condition to trigger procedure of passing object from
30263 +   low priority owner(s) to high priority one(s).
30264 +
30265 +   The procedure results in passing an event (setting lock_handle->signaled
30266 +   flag) and counting this event in nr_signaled field of owner's lock stack
30267 +   object and waking up the owner's process.  Returns non-zero when @node has high priority requests but no high priority owner.
30268 +*/
30269 +static inline int
30270 +check_deadlock_condition(znode * node)
30271 +{
30272 +       assert("nikita-1833", rw_zlock_is_locked(&node->lock));
30273 +       return node->lock.nr_hipri_requests > 0 && node->lock.nr_hipri_owners == 0;
30274 +}
30275 +
30276 +/* checks lock/request compatibility for owner->request: returns 0 if the lock can be taken now, -EINVAL for a dying node, -E_REPEAT otherwise */
30277 +static int
30278 +check_lock_object(lock_stack * owner)
30279 +{
30280 +       znode *node = owner->request.node;
30281 +
30282 +       assert("nikita-1842", owner == get_current_lock_stack());
30283 +       assert("nikita-1843", rw_zlock_is_locked(&node->lock));
30284 +
30285 +       /* See if the node is dying (JNODE_IS_DYING is set). */
30286 +       if (unlikely(ZF_ISSET(node, JNODE_IS_DYING))) {
30287 +               ON_TRACE(TRACE_LOCKS, "attempt to lock dying znode: %p", node);
30288 +               return RETERR(-EINVAL);
30289 +       }
30290 +
30291 +       /* Do not ever try to take a lock if we are going in low priority
30292 +          direction and a node has a high priority request without high
30293 +          priority owners. */
30294 +       if (unlikely(!owner->curpri && check_deadlock_condition(node))) {
30295 +               return RETERR(-E_REPEAT);
30296 +       }
30297 +
30298 +       if (unlikely(!is_lock_compatible(node, owner->request.mode))) { /* requested mode conflicts with the current lock state */
30299 +               return RETERR(-E_REPEAT);
30300 +       }
30301 +
30302 +       return 0;
30303 +}
30304 +
30305 +/* check for lock/request compatibility and update tree statistics; returns the result of check_lock_object() unchanged */
30306 +static int
30307 +can_lock_object(lock_stack * owner)
30308 +{
30309 +       int result;
30310 +       znode *node = owner->request.node;
30311 +
30312 +       result = check_lock_object(owner);
30313 +       if (REISER4_STATS && znode_get_level(node) > 0) {       /* statistics only when enabled and the node has a positive level */
30314 +               if (result != 0)
30315 +                       ADDSTAT(node, lock_contented);          /* request could not be granted immediately */
30316 +               else
30317 +                       ADDSTAT(node, lock_uncontented);
30318 +       }
30319 +       return result;
30320 +}
30321 +
30322 +/* Switches the current thread's lock stack to high priority. It clears "signaled" flags
30323 +   because a znode locked by a high-priority process can't satisfy our "deadlock
30324 +   condition". Every node already locked by @owner gets its nr_hipri_owners incremented. */
30325 +static void
30326 +set_high_priority(lock_stack * owner)
30327 +{
30328 +       assert("nikita-1846", owner == get_current_lock_stack());
30329 +       /* Do nothing if current priority is already high */
30330 +       if (!owner->curpri) {
30331 +               /* We don't need locking for owner->locks list, because, this
30332 +                * function is only called with the lock stack of the current
30333 +                * thread, and no other thread can play with owner->locks list
30334 +                * and/or change ->node pointers of lock handles in this list.
30335 +                *
30336 +                * (Interrupts also are not involved.)
30337 +                */
30338 +               lock_handle *item = locks_list_front(&owner->locks);
30339 +               while (!locks_list_end(&owner->locks, item)) {
30340 +                       znode *node = item->node;
30341 +
30342 +                       WLOCK_ZLOCK(&node->lock);       /* per-node counters and the signaled flag are changed under the node's zlock */
30343 +
30344 +                       node->lock.nr_hipri_owners++;
30345 +
30346 +                       ON_TRACE(TRACE_LOCKS,
30347 +                                "set_hipri lock: %p node: %p: hipri_owners after: %u nr_readers: %d\n",
30348 +                                item, node, node->lock.nr_hipri_owners, node->lock.nr_readers);
30349 +
30350 +                       /* we can safely set signaled to zero, because
30351 +                          previous statement (nr_hipri_owners ++) guarantees
30352 +                          that signaled will never be set again. */
30353 +                       item->signaled = 0;
30354 +                       WUNLOCK_ZLOCK(&node->lock);
30355 +
30356 +                       item = locks_list_next(item);
30357 +               }
30358 +               owner->curpri = 1;
30359 +               atomic_set(&owner->nr_signaled, 0);     /* all per-handle signaled flags were cleared above, so the total is reset too */
30360 +       }
30361 +}
30362 +
30363 +/* Sets a low priority to the process: drops @owner from nr_hipri_owners of every node it holds and self-signals handles whose node now meets the deadlock condition. */
30364 +static void
30365 +set_low_priority(lock_stack * owner)
30366 +{
30367 +       assert("nikita-3075", owner == get_current_lock_stack());
30368 +       /* Do nothing if current priority is already low */
30369 +       if (owner->curpri) {
30370 +               /* scan all locks (lock handles) held by @owner, which is
30371 +                  actually current thread, and check whether we are reaching
30372 +                  deadlock possibility anywhere.
30373 +               */
30374 +               lock_handle *handle = locks_list_front(&owner->locks);
30375 +               while (!locks_list_end(&owner->locks, handle)) {
30376 +                       znode *node = handle->node;
30377 +                       WLOCK_ZLOCK(&node->lock);
30378 +                       /* this thread just was hipri owner of @node, so
30379 +                          nr_hipri_owners has to be greater than zero. */
30380 +                       ON_TRACE(TRACE_LOCKS,
30381 +                                "set_lopri lock: %p node: %p: hipri_owners before: %u nr_readers: %d\n",
30382 +                                handle, node, node->lock.nr_hipri_owners, node->lock.nr_readers);
30383 +                       assert("nikita-1835", node->lock.nr_hipri_owners > 0);
30384 +                       node->lock.nr_hipri_owners--;
30385 +                       /* If we have deadlock condition, adjust the nr_signaled
30386 +                          field. It is enough to set "signaled" flag only for
30387 +                          current process, other low-pri owners will be
30388 +                          signaled and woken up after current process unlocks
30389 +                          this object and any high-priority requestor takes
30390 +                          control. */
30391 +                       if (check_deadlock_condition(node)
30392 +                           && !handle->signaled) {
30393 +                               handle->signaled = 1;
30394 +                               atomic_inc(&owner->nr_signaled);        /* counted once per handle, as in wake_up_all_lopri_owners() */
30395 +                       }
30396 +                       WUNLOCK_ZLOCK(&node->lock);
30397 +                       handle = locks_list_next(handle);
30398 +               }
30399 +               owner->curpri = 0;
30400 +       }
30401 +}
30402 +
30403 +#define MAX_CONVOY_SIZE ((NR_CPUS - 1))
30404 +
30405 +/* helper function used by longterm_unlock_znode() to wake up requestor(s). */
30406 +/*
30407 + * In certain multi-threaded workloads the jnode spin lock is the most
30408 + * contended one. Wake up of threads waiting for znode is, thus,
30409 + * important to do right. There are three well known strategies:
30410 + *
30411 + *  (1) direct hand-off. Hasn't been tried.
30412 + *
30413 + *  (2) wake all (thundering herd). This degrades performance in our
30414 + *      case.
30415 + *
30416 + *  (3) wake one. Simplest solution where requestor in the front of
30417 + *      requestors list is awakened under znode spin lock is not very
30418 + *      good on the SMP, because first thing requestor will try to do
30419 + *      after waking up on another CPU is to acquire znode spin lock
30420 + *      that is still held by this thread. As an optimization we grab
30421 + *      lock stack spin lock, release znode spin lock and wake
30422 + *      requestor. done_context() synchronizes against stack spin lock
30423 + *      to avoid (impossible) case where requestor has been woken by
30424 + *      some other thread (wake_up_all_lopri_owners(), or something
30425 + *      similar) and managed to exit before we woke it up.
30426 + *
30427 + *      Effect of this optimization wasn't big, after all.
30428 + *
30429 + */
30430 +static void
30431 +wake_up_requestor(znode *node)
30432 +{
30433 +#if NR_CPUS > 2
30434 +       requestors_list_head *creditors;
30435 +       lock_stack           *convoy[MAX_CONVOY_SIZE];
30436 +       int                   convoyused;
30437 +       int                   convoylimit;
30438 +
30439 +       assert("nikita-3180", node != NULL);
30440 +       assert("nikita-3181", rw_zlock_is_locked(&node->lock));
30441 +
30442 +       ADDSTAT(node, wakeup);
30443 +
30444 +       convoyused = 0;
30445 +       convoylimit = min(num_online_cpus() - 1, MAX_CONVOY_SIZE);
30446 +       creditors = &node->lock.requestors;
30447 +       if (!requestors_list_empty(creditors)) {
30448 +               convoy[0] = requestors_list_front(creditors);
30449 +               convoyused = 1;
30450 +               ADDSTAT(node, wakeup_found);
30451 +               /*
30452 +                * it has been verified experimentally, that there are no
30453 +                * convoys on the leaf level.
30454 +                */
30455 +               if (znode_get_level(node) != LEAF_LEVEL &&
30456 +                   convoy[0]->request.mode == ZNODE_READ_LOCK &&
30457 +                   convoylimit > 1) {
30458 +                       lock_stack *item;
30459 +
30460 +                       ADDSTAT(node, wakeup_found_read);
30461 +                       for (item = requestors_list_next(convoy[0]);
30462 +                                 ! requestors_list_end(creditors, item);
30463 +                            item = requestors_list_next(item)) {
30464 +                               ADDSTAT(node, wakeup_scan);
30465 +                               if (item->request.mode == ZNODE_READ_LOCK) {
30466 +                                       ADDSTAT(node, wakeup_convoy);
30467 +                                       convoy[convoyused] = item;
30468 +                                       ++ convoyused;
30469 +                                       /*
30470 +                                        * it is safe to spin lock multiple
30471 +                                        * lock stacks here, because lock
30472 +                                        * stack cannot sleep on more than one
30473 +                                        * requestors queue.
30474 +                                        */
30475 +                                       /*
30476 +                                        * use raw spin_lock in stead of macro
30477 +                                        * wrappers, because spin lock
30478 +                                        * profiling code cannot cope with so
30479 +                                        * many locks held at the same time.
30480 +                                        */
30481 +                                       spin_lock(&item->sguard.lock);
30482 +                                       if (convoyused == convoylimit)
30483 +                                               break;
30484 +                               }
30485 +                       }
30486 +               }
30487 +               spin_lock(&convoy[0]->sguard.lock);
30488 +       }
30489 +
30490 +       WUNLOCK_ZLOCK(&node->lock);
30491 +
30492 +       while (convoyused > 0) {
30493 +               -- convoyused;
30494 +               __reiser4_wake_up(convoy[convoyused]);
30495 +               spin_unlock(&convoy[convoyused]->sguard.lock);
30496 +       }
30497 +#else
30498 +       /* uniprocessor case: keep it simple */
30499 +       if (!requestors_list_empty(&node->lock.requestors)) {
30500 +               lock_stack *requestor;
30501 +
30502 +               requestor = requestors_list_front(&node->lock.requestors);
30503 +               reiser4_wake_up(requestor);
30504 +       }
30505 +
30506 +       WUNLOCK_ZLOCK(&node->lock);
30507 +#endif
30508 +}
30509 +
30510 +#undef MAX_CONVOY_SIZE
30511 +
30512 +/* release long-term lock, acquired by longterm_lock_znode(); also drops the znode reference held through handle->node (zput()) */
30513 +reiser4_internal void
30514 +longterm_unlock_znode(lock_handle * handle)
30515 +{
30516 +       znode *node = handle->node;
30517 +       lock_stack *oldowner = handle->owner;
30518 +       int hipri;
30519 +       int readers;
30520 +       int rdelta;
30521 +       int youdie;
30522 +
30523 +       /*
30524 +        * this is time-critical and highly optimized code. Modify carefully.
30525 +        */
30526 +
30527 +       assert("jmacd-1021", handle != NULL); /* NOTE(review): handle was already dereferenced by the initializers above */
30528 +       assert("jmacd-1022", handle->owner != NULL);
30529 +       assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
30530 +
30531 +       assert("zam-130", oldowner == get_current_lock_stack());
30532 +
30533 +       LOCK_CNT_DEC(long_term_locked_znode);
30534 +
30535 +       ADDSTAT(node, unlock);
30536 +
30537 +       /*
30538 +        * to minimize amount of operations performed under lock, pre-compute
30539 +        * all variables used within critical section. This makes code
30540 +        * obscure.
30541 +        */
30542 +
30543 +       /* was this lock of hi or lo priority: -1 (to be added to nr_hipri_owners) if hi, 0 if lo */
30544 +       hipri   = oldowner->curpri ? -1 : 0;
30545 +       /* number of readers (read before the zlock is taken) */
30546 +       readers = node->lock.nr_readers;
30547 +       /* +1 if write lock, -1 if read lock */
30548 +       rdelta  = (readers > 0) ? -1 : +1;
30549 +       /* true if node is to die and write lock is released */
30550 +       youdie  = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
30551 +
30552 +       WLOCK_ZLOCK(&node->lock);
30553 +
30554 +       assert("zam-101", znode_is_locked(node));
30555 +
30556 +       /* Adjust a number of high priority owners of this lock */
30557 +       node->lock.nr_hipri_owners += hipri;
30558 +       assert("nikita-1836", node->lock.nr_hipri_owners >= 0);
30559 +
30560 +       ON_TRACE(TRACE_LOCKS,
30561 +                "%spri unlock: %p node: %p: hipri_owners: %u nr_readers %d\n",
30562 +                oldowner->curpri ? "hi" : "lo",
30563 +                handle,
30564 +                node,
30565 +                node->lock.nr_hipri_owners,
30566 +                node->lock.nr_readers);
30567 +
30568 +       /* Handle znode deallocation on last write-lock release. */
30569 +       if (znode_is_wlocked_once(node)) {
30570 +               if (youdie) {
30571 +                       forget_znode(handle); /* NOTE(review): this path returns without WUNLOCK_ZLOCK; presumably forget_znode() drops the zlock -- confirm */
30572 +                       assert("nikita-2191", znode_invariant(node));
30573 +                       zput(node);
30574 +                       return;
30575 +               }
30576 +               znode_post_write(node);
30577 +       }
30578 +       if (znode_is_rlocked(node))
30579 +               ON_STATS(znode_at_read(node));
30580 +
30581 +       if (handle->signaled)
30582 +               atomic_dec(&oldowner->nr_signaled);
30583 +
30584 +       /* Unlocking means owner<->object link deletion */
30585 +       unlink_object(handle);
30586 +
30587 +       /* This is enough to be sure whether an object is completely
30588 +          unlocked. */
30589 +       node->lock.nr_readers += rdelta;
30590 +
30591 +       /* If the node is locked it must have an owners list.  Likewise, if
30592 +          the node is unlocked it must have an empty owners list. */
30593 +       assert("zam-319", equi(znode_is_locked(node),
30594 +                              !owners_list_empty(&node->lock.owners)));
30595 +
30596 +#if REISER4_DEBUG
30597 +       if (!znode_is_locked(node))
30598 +               ++ node->times_locked;
30599 +#endif
30600 +
30601 +       /* If there are pending lock requests we wake up a requestor (wake_up_requestor() also releases the zlock) */
30602 +       if (!znode_is_wlocked(node))
30603 +               wake_up_requestor(node);
30604 +       else
30605 +               WUNLOCK_ZLOCK(&node->lock);
30606 +
30607 +       assert("nikita-3182", rw_zlock_is_not_locked(&node->lock));
30608 +       /* minus one reference from handle->node */
30609 +       handle->node = NULL;
30610 +       assert("nikita-2190", znode_invariant(node));
30611 +       ON_DEBUG(check_lock_data());
30612 +       ON_DEBUG(check_lock_node_data(node));
30613 +       zput(node);
30614 +}
30615 +
30616 +/* final portion of longterm-lock (shared by longterm_lock_tryfast() and longterm_lock_znode()): entered with zlock held, locks the object when @ok == 0, then either wakes next requestor or just drops zlock; returns @ok */
30617 +static int
30618 +lock_tail(lock_stack *owner, int wake_up_next, int ok, znode_lock_mode mode)
30619 +{
30620 +       znode *node = owner->request.node;
30621 +
30622 +       assert("jmacd-807", rw_zlock_is_locked(&node->lock));
30623 +
30624 +       /* If we broke with (ok == 0) it means we can_lock, now do it. */
30625 +       if (ok == 0) {
30626 +               lock_object(owner);
30627 +               owner->request.mode = 0;
30628 +               if (mode == ZNODE_READ_LOCK)
30629 +                       wake_up_next = 1; /* a granted read lock always wakes the next requestor */
30630 +               if (REISER4_DEBUG_MODIFY) {
30631 +                       if (znode_is_wlocked_once(node))
30632 +                               znode_post_write(node);
30633 +                       else if (znode_is_rlocked(node))
30634 +                               ON_STATS(znode_at_read(node));
30635 +               }
30636 +       }
30637 +
30638 +       if (wake_up_next)
30639 +               wake_up_requestor(node); /* releases the zlock itself */
30640 +       else
30641 +               WUNLOCK_ZLOCK(&node->lock);
30642 +
30643 +       if (ok == 0) {
30644 +               /* count a reference from lockhandle->node
30645 +
30646 +                  znode was already referenced at the entry to this function,
30647 +                  hence taking spin-lock here is not necessary (see comment
30648 +                  in the zref()).
30649 +               */
30650 +               zref(node);
30651 +
30652 +               LOCK_CNT_INC(long_term_locked_znode);
30653 +               if (REISER4_DEBUG_NODE && mode == ZNODE_WRITE_LOCK) {
30654 +                       node_check(node, 0);
30655 +                       ON_DEBUG_MODIFY(znode_pre_write(node));
30656 +               }
30657 +       }
30658 +
30659 +       ON_DEBUG(check_lock_data());
30660 +       ON_DEBUG(check_lock_node_data(node));
30661 +       return ok;
30662 +}
30663 +
30664 +/*
30665 + * version of longterm_lock_znode() optimized for the most common case: read
30666 + * lock without any special flags. This is the kind of lock that any tree
30667 + * traversal takes on the root node of the tree, which is very frequent. Returns 1 when the caller has to fall back to the general path, otherwise the value returned by lock_tail().
30668 + */
30669 +static int
30670 +longterm_lock_tryfast(lock_stack * owner)
30671 +{
30672 +       int          result;
30673 +       int          wake_up_next      = 0;
30674 +       znode       *node;
30675 +       zlock       *lock;
30676 +
30677 +       node = owner->request.node;
30678 +       lock = &node->lock;
30679 +
30680 +       assert("nikita-3340", schedulable());
30681 +       assert("nikita-3341", request_is_deadlock_safe(node,
30682 +                                                      ZNODE_READ_LOCK,
30683 +                                                      ZNODE_LOCK_LOPRI));
30684 +
30685 +       result = UNDER_RW(zlock, lock, read, can_lock_object(owner)); /* first check is done under the zlock taken for read */
30686 +
30687 +       if (likely(result != -EINVAL)) {
30688 +               spin_lock_znode(node);
30689 +               result = try_capture(
30690 +                       ZJNODE(node), ZNODE_READ_LOCK, 0, 1/* can copy on capture */);
30691 +               spin_unlock_znode(node);
30692 +               WLOCK_ZLOCK(lock);
30693 +               if (unlikely(result != 0)) {
30694 +                       owner->request.mode = 0; /* capture failed: cancel the request and let lock_tail() wake the next requestor */
30695 +                       wake_up_next = 1;
30696 +               } else {
30697 +                       result = can_lock_object(owner); /* re-check, now under the zlock taken for write */
30698 +                       if (unlikely(result == -E_REPEAT)) {
30699 +                               /* fall back to longterm_lock_znode() */
30700 +                               WUNLOCK_ZLOCK(lock);
30701 +                               return 1;
30702 +                       }
30703 +               }
30704 +               return lock_tail(owner, wake_up_next, result, ZNODE_READ_LOCK);
30705 +       } else
30706 +               return 1;
30707 +}
30708 +
30709 +/* locks given lock object: records @handle, @node and @mode in the current lock stack's request and finishes in lock_tail(). May sleep (schedulable() is asserted). */
30710 +reiser4_internal int
30711 +longterm_lock_znode(
30712 +       /* local link object (allocated by lock owner thread, usually on its own
30713 +        * stack) */
30714 +       lock_handle * handle,
30715 +       /* znode we want to lock. */
30716 +       znode * node,
30717 +       /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
30718 +       znode_lock_mode mode,
30719 +       /* request flags; ZNODE_LOCK_HIPRI, ZNODE_LOCK_NONBLOCK and ZNODE_LOCK_DONT_FUSE are tested below. Original note on results: {0, -EINVAL, -E_DEADLOCK}, see return codes description. */
30720 +       znode_lock_request request)
30721 +{
30722 +       int          ret;
30723 +       int          hipri             = (request & ZNODE_LOCK_HIPRI) != 0;
30724 +       int          wake_up_next      = 0;
30725 +       int          non_blocking      = 0;
30726 +       int          has_atom;
30727 +       txn_capture  cap_flags;
30728 +       zlock       *lock;
30729 +       txn_handle  *txnh;
30730 +       tree_level   level;
30731 +
30732 +       /* Get current process context */
30733 +       lock_stack *owner = get_current_lock_stack();
30734 +
30735 +       /* Check that the lock handle is initialized and isn't already being
30736 +        * used. */
30737 +       assert("jmacd-808", handle->owner == NULL);
30738 +       assert("nikita-3026", schedulable());
30739 +       assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
30740 +       /* long term locks are not allowed in the VM contexts (->writepage(),
30741 +        * prune_{d,i}cache()).
30742 +        *
30743 +        * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
30744 +        * bug caused by d_splice_alias() only working for directories.
30745 +        */
30746 +       assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0)); /* the leading "1 ||" disables this check (see FIXME above) */
30747 +
30748 +       cap_flags = 0;
30749 +       if (request & ZNODE_LOCK_NONBLOCK) {
30750 +               cap_flags |= TXN_CAPTURE_NONBLOCKING;
30751 +               non_blocking = 1;
30752 +       }
30753 +
30754 +       if (request & ZNODE_LOCK_DONT_FUSE)
30755 +               cap_flags |= TXN_CAPTURE_DONT_FUSE;
30756 +
30757 +       /* If we are changing our process priority we must adjust a number
30758 +          of high priority owners for each znode that we already lock */
30759 +       if (hipri) {
30760 +               set_high_priority(owner);
30761 +       } else {
30762 +               set_low_priority(owner);
30763 +       }
30764 +
30765 +       level = znode_get_level(node);
30766 +       ADDSTAT(node, lock);
30767 +
30768 +       /* Fill request structure with our values. */
30769 +       owner->request.mode = mode;
30770 +       owner->request.handle = handle;
30771 +       owner->request.node = node;
30772 +
30773 +       txnh = get_current_context()->trans;
30774 +       lock = &node->lock;
30775 +
30776 +       if (mode == ZNODE_READ_LOCK && request == 0) {
30777 +               ret = longterm_lock_tryfast(owner);
30778 +               if (ret <= 0) /* fast path finished (success or error); a positive value means "use the general path below" */
30779 +                       return ret;
30780 +       }
30781 +
30782 +       has_atom = (txnh->atom != NULL);
30783 +
30784 +       /* update statistics */
30785 +       if (REISER4_STATS) {
30786 +               if (mode == ZNODE_READ_LOCK)
30787 +                       ADDSTAT(node, lock_read);
30788 +               else
30789 +                       ADDSTAT(node, lock_write);
30790 +
30791 +               if (hipri)
30792 +                       ADDSTAT(node, lock_hipri);
30793 +               else
30794 +                       ADDSTAT(node, lock_lopri);
30795 +       }
30796 +
30797 +       /* Synchronize on node's zlock guard lock. */
30798 +       WLOCK_ZLOCK(lock);
30799 +
30800 +       if (znode_is_locked(node) &&
30801 +           mode == ZNODE_WRITE_LOCK && recursive(owner))
30802 +               return lock_tail(owner, 0, 0, mode);
30803 +
30804 +       for (;;) {
30805 +               ADDSTAT(node, lock_iteration);
30806 +
30807 +               /* Check the lock's availability: if it is unavailable we get
30808 +                  E_REPEAT, 0 indicates "can_lock", otherwise the node is
30809 +                  invalid.  */
30810 +               ret = can_lock_object(owner);
30811 +
30812 +               if (unlikely(ret == -EINVAL)) {
30813 +                       /* @node is dying. Leave it alone. */
30814 +                       /* wakeup next requestor to support lock invalidating */
30815 +                       wake_up_next = 1;
30816 +                       ADDSTAT(node, lock_dying);
30817 +                       break;
30818 +               }
30819 +
30820 +               if (unlikely(ret == -E_REPEAT && non_blocking)) {
30821 +                       /* either locking of @node by the current thread will
30822 +                        * lead to the deadlock, or lock modes are
30823 +                        * incompatible. */
30824 +                       ADDSTAT(node, lock_cannot_lock);
30825 +                       break;
30826 +               }
30827 +
30828 +               assert("nikita-1844", (ret == 0) || ((ret == -E_REPEAT) && !non_blocking));
30829 +               /* If we can get the lock... Try to capture first before
30830 +                  taking the lock.*/
30831 +
30832 +               /* first handle commonest case where node and txnh are already
30833 +                * in the same atom. */
30834 +               /* safe to do without taking locks, because:
30835 +                *
30836 +                * 1. read of aligned word is atomic with respect to writes to
30837 +                * this word
30838 +                *
30839 +                * 2. false negatives are handled in try_capture().
30840 +                *
30841 +                * 3. false positives are impossible.
30842 +                *
30843 +                * PROOF: left as an exercise to the curious reader.
30844 +                *
30845 +                * Just kidding. Here is one:
30846 +                *
30847 +                * At the time T0 txnh->atom is stored in txnh_atom.
30848 +                *
30849 +                * At the time T1 node->atom is stored in node_atom.
30850 +                *
30851 +                * At the time T2 we observe that
30852 +                *
30853 +                *     txnh_atom != NULL && node_atom == txnh_atom.
30854 +                *
30855 +                * Imagine that at this moment we acquire node and txnh spin
30856 +                * lock in this order. Suppose that under spin lock we have
30857 +                *
30858 +                *     node->atom != txnh->atom,                       (S1)
30859 +                *
30860 +                * at the time T3.
30861 +                *
30862 +                * txnh->atom != NULL still, because txnh is open by the
30863 +                * current thread.
30864 +                *
30865 +                * Suppose node->atom == NULL, that is, node was un-captured
30866 +                * between T1, and T3. But un-capturing of formatted node is
30867 +                * always preceded by the call to invalidate_lock(), which
30868 +                * marks znode as JNODE_IS_DYING under zlock spin
30869 +                * lock. Contradiction, because can_lock_object() above checks
30870 +                * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
30871 +                *
30872 +                * Suppose that node->atom != node_atom, that is, atom, node
30873 +                * belongs to was fused into another atom: node_atom was fused
30874 +                * into node->atom. Atom of txnh was equal to node_atom at T2,
30875 +                * which means that under spin lock, txnh->atom == node->atom,
30876 +                * because txnh->atom can only follow fusion
30877 +                * chain. Contradicts S1.
30878 +                *
30879 +                * The same for hypothesis txnh->atom != txnh_atom. Hence,
30880 +                * node->atom == node_atom == txnh_atom == txnh->atom. Again
30881 +                * contradicts S1. Hence S1 is false. QED.
30882 +                *
30883 +                */
30884 +
30885 +               if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
30886 +                       ADDSTAT(node, lock_no_capture);
30887 +               } else {
30888 +                       /*
30889 +                        * unlock zlock spin lock here. It is possible for
30890 +                        * longterm_unlock_znode() to sneak in here, but there
30891 +                        * is no harm: invalidate_lock() will mark znode as
30892 +                        * JNODE_IS_DYING and this will be noted by
30893 +                        * can_lock_object() below.
30894 +                        */
30895 +                       WUNLOCK_ZLOCK(lock);
30896 +                       spin_lock_znode(node);
30897 +                       ret = try_capture(
30898 +                               ZJNODE(node), mode, cap_flags, 1/* can copy on capture*/);
30899 +                       spin_unlock_znode(node);
30900 +                       WLOCK_ZLOCK(lock);
30901 +                       if (unlikely(ret != 0)) {
30902 +                               /* In the failure case, the txnmgr releases
30903 +                                  the znode's lock (or in some cases, it was
30904 +                                  released a while ago).  There's no need to
30905 +                                  reacquire it so we should return here,
30906 +                                  avoid releasing the lock. NOTE(review): as written, the zlock is re-taken just above and the code breaks out to lock_tail() rather than returning -- this comment may be stale; confirm. */
30907 +                               owner->request.mode = 0;
30908 +                               /* next requestor may not fail */
30909 +                               wake_up_next = 1;
30910 +                               break;
30911 +                       }
30912 +
30913 +                       /* Check the lock's availability again -- this is
30914 +                          because under some circumstances the capture code
30915 +                          has to release and reacquire the znode spinlock. */
30916 +                       ret = can_lock_object(owner);
30917 +               }
30918 +
30919 +               /* This time, a return of (ret == 0) means we can lock, so we
30920 +                  should break out of the loop. */
30921 +               if (likely(ret != -E_REPEAT || non_blocking)) {
30922 +                       ADDSTAT(node, lock_can_lock);
30923 +                       break;
30924 +               }
30925 +
30926 +               /* Lock is unavailable, we have to wait. */
30927 +
30928 +               /* By having semaphore initialization here we cannot lose
30929 +                  wakeup signal even if it comes after `nr_signaled' field
30930 +                  check. */
30931 +               ret = prepare_to_sleep(owner);
30932 +               if (unlikely(ret != 0)) {
30933 +                       break;
30934 +               }
30935 +
30936 +               assert("nikita-1837", rw_zlock_is_locked(&node->lock));
30937 +               if (hipri) {
30938 +                       /* If we are going in high priority direction then
30939 +                          increase high priority requests counter for the
30940 +                          node */
30941 +                       lock->nr_hipri_requests++;
30942 +                       /* If there are no high priority owners for a node,
30943 +                          then immediately wake up low priority owners, so
30944 +                          they can detect possible deadlock */
30945 +                       if (lock->nr_hipri_owners == 0)
30946 +                               wake_up_all_lopri_owners(node);
30947 +                       /* And prepare a lock request */
30948 +                       requestors_list_push_front(&lock->requestors, owner);
30949 +               } else {
30950 +                       /* If we are going in low priority direction then we
30951 +                          set low priority to our process. This is the only
30952 +                          case  when a process may become low priority */
30953 +                       /* And finally prepare a lock request */
30954 +                       requestors_list_push_back(&lock->requestors, owner);
30955 +               }
30956 +
30957 +               /* Ok, here we have prepared a lock request, so unlock
30958 +                  a znode ...*/
30959 +               WUNLOCK_ZLOCK(lock);
30960 +               /* ... and sleep */
30961 +               go_to_sleep(owner, level);
30962 +
30963 +               WLOCK_ZLOCK(lock);
30964 +
30965 +               if (hipri) {
30966 +                       assert("nikita-1838", lock->nr_hipri_requests > 0);
30967 +                       lock->nr_hipri_requests--;
30968 +               }
30969 +
30970 +               requestors_list_remove(owner);
30971 +       }
30972 +
30973 +       assert("jmacd-807/a", rw_zlock_is_locked(&node->lock));
30974 +       return lock_tail(owner, wake_up_next, ret, mode);
30975 +}
30976 +
30977 +/* lock object invalidation means changing of lock object state to `INVALID'
30978 +   and waiting for all other processes to cancel their lock requests. Entered with the zlock held (asserted); returns with it released. */
30979 +reiser4_internal void
30980 +invalidate_lock(lock_handle * handle   /* path to lock
30981 +                                          * owner and lock
30982 +                                          * object is being
30983 +                                          * invalidated. */ )
30984 +{
30985 +       znode *node = handle->node;
30986 +       lock_stack *owner = handle->owner;
30987 +       lock_stack *rq;
30988 +
30989 +       assert("zam-325", owner == get_current_lock_stack());
30990 +       assert("zam-103", znode_is_write_locked(node));
30991 +       assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
30992 +       assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
30993 +       assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
30994 +       assert("nikita-3097", znode_is_wlocked_once(node));
30995 +       assert("nikita-3338", rw_zlock_is_locked(&node->lock));
30996 +
30997 +       if (handle->signaled)
30998 +               atomic_dec(&owner->nr_signaled);
30999 +
31000 +       ZF_SET(node, JNODE_IS_DYING);
31001 +       unlink_object(handle);
31002 +       node->lock.nr_readers = 0;
31003 +
31004 +       /* all requestors will be informed that lock is invalidated. */
31005 +       for_all_type_safe_list(requestors, &node->lock.requestors, rq) {
31006 +               reiser4_wake_up(rq);
31007 +       }
31008 +
31009 +       /* We use that each unlock() will wakeup first item from requestors
31010 +          list; our lock stack is the last one. */
31011 +       while (!requestors_list_empty(&node->lock.requestors)) {
31012 +               requestors_list_push_back(&node->lock.requestors, owner);
31013 +
31014 +               prepare_to_sleep(owner); /* NOTE(review): return value (0 or -E_DEADLOCK) is ignored here */
31015 +
31016 +               WUNLOCK_ZLOCK(&node->lock);
31017 +               go_to_sleep(owner, znode_get_level(node));
31018 +               WLOCK_ZLOCK(&node->lock);
31019 +
31020 +               requestors_list_remove(owner);
31021 +       }
31022 +
31023 +       WUNLOCK_ZLOCK(&node->lock);
31024 +}
31025 +
31026 +/* Initializes lock_stack: its locks list, requestors link, stack spin lock, priority and wake-up semaphore. */
31027 +reiser4_internal void
31028 +init_lock_stack(lock_stack * owner     /* pointer to
31029 +                                          * allocated
31030 +                                          * structure. */ )
31031 +{
31032 +       /* xmemset(,0,) is done already as a part of reiser4 context
31033 +        * initialization */
31034 +       /* xmemset(owner, 0, sizeof (lock_stack)); */
31035 +       locks_list_init(&owner->locks);
31036 +       requestors_list_clean(owner);
31037 +       spin_stack_init(owner);
31038 +       owner->curpri = 1; /* non-zero curpri is treated as high priority elsewhere in this file */
31039 +       sema_init(&owner->sema, 0); /* count 0: the semaphore is raised by __reiser4_wake_up() */
31040 +}
31041 +
31042 +/* Initializes lock object: zeroes it, then sets up its guard lock and the requestors and owners lists. */
31043 +reiser4_internal void
31044 +reiser4_init_lock(zlock * lock /* pointer to allocated
31045 +                                  * uninitialized lock object
31046 +                                  * structure. */ )
31047 +{
31048 +       xmemset(lock, 0, sizeof (zlock));
31049 +       rw_zlock_init(lock);
31050 +       requestors_list_init(&lock->requestors);
31051 +       owners_list_init(&lock->owners);
31052 +}
31053 +
31054 +/* lock handle initialization: zeroes @handle, then calls locks_list_clean() and owners_list_clean() on it */
31055 +reiser4_internal void
31056 +init_lh(lock_handle * handle)
31057 +{
31058 +       xmemset(handle, 0, sizeof *handle);
31059 +       locks_list_clean(handle);
31060 +       owners_list_clean(handle);
31061 +}
31062 +
31063 +/* freeing of lock handle resources: releases the long-term lock if @handle still has an owner, otherwise does nothing */
31064 +reiser4_internal void
31065 +done_lh(lock_handle * handle)
31066 +{
31067 +       assert("zam-342", handle != NULL);
31068 +       if (handle->owner != NULL)
31069 +               longterm_unlock_znode(handle);
31070 +}
31071 +
31072 +/* What kind of lock does @handle hold? ZNODE_NO_LOCK when it has no owner; otherwise read or write according to znode_is_rlocked() on its node. */
31073 +reiser4_internal znode_lock_mode lock_mode(lock_handle * handle)
31074 +{
31075 +       if (handle->owner == NULL) {
31076 +               return ZNODE_NO_LOCK;
31077 +       } else if (znode_is_rlocked(handle->node)) {
31078 +               return ZNODE_READ_LOCK;
31079 +       } else {
31080 +               return ZNODE_WRITE_LOCK;
31081 +       }
31082 +}
31083 +
31084 +/* Transfer a lock handle (presumably so that variables can be moved between stack and
31085 +   heap locations). If @unlink_old is non-zero @old is unlinked (move); otherwise @old stays linked and the lock counters and znode reference are bumped for @new (copy). */
31086 +static void
31087 +move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
31088 +{
31089 +       znode *node = old->node;
31090 +       lock_stack *owner = old->owner;
31091 +       int signaled;
31092 +
31093 +       /* locks_list, modified by link_object() is not protected by
31094 +          anything. This is valid because only current thread ever modifies
31095 +          locks_list of its lock_stack.
31096 +       */
31097 +       assert("nikita-1827", owner == get_current_lock_stack());
31098 +       assert("nikita-1831", new->owner == NULL);
31099 +
31100 +       WLOCK_ZLOCK(&node->lock);
31101 +
31102 +       signaled = old->signaled;
31103 +       if (unlink_old) {
31104 +               unlink_object(old);
31105 +       } else {
31106 +               if (node->lock.nr_readers > 0) { /* positive count: read-locked, add one more reader */
31107 +                       node->lock.nr_readers += 1;
31108 +               } else { /* otherwise the count is moved one step further below zero */
31109 +                       node->lock.nr_readers -= 1;
31110 +               }
31111 +               if (signaled) {
31112 +                       atomic_inc(&owner->nr_signaled);
31113 +               }
31114 +               if (owner->curpri) {
31115 +                       node->lock.nr_hipri_owners += 1;
31116 +               }
31117 +               LOCK_CNT_INC(long_term_locked_znode);
31118 +
31119 +               zref(node);
31120 +       }
31121 +       link_object(new, owner, node);
31122 +       new->signaled = signaled; /* @new inherits the "signaled" state of @old */
31123 +
31124 +       WUNLOCK_ZLOCK(&node->lock);
31125 +}
31126 +
31127 +reiser4_internal void
31128 +move_lh(lock_handle * new, lock_handle * old)
31129 +{ /* transfer the lock from @old to @new; @old is unlinked */
31130 +       move_lh_internal(new, old, /*unlink_old */ 1);
31131 +}
31132 +
31133 +reiser4_internal void
31134 +copy_lh(lock_handle * new, lock_handle * old)
31135 +{ /* make @new a second handle on the node locked through @old; @old stays linked */
31136 +       move_lh_internal(new, old, /*unlink_old */ 0);
31137 +}
31138 +
31139 +/* after getting -E_DEADLOCK we unlock znodes until this function returns false; it returns true while nr_signaled of the current lock stack is non-zero */
31140 +reiser4_internal int
31141 +check_deadlock(void)
31142 +{
31143 +       lock_stack *owner = get_current_lock_stack();
31144 +       return atomic_read(&owner->nr_signaled) != 0;
31145 +}
31146 +
31147 +/* Before going to sleep we re-check "release lock" requests which might come from threads with hi-pri lock
31148 +   priorities. Returns 0 if nr_signaled is zero, -E_DEADLOCK (via RETERR) otherwise. */
31149 +reiser4_internal int
31150 +prepare_to_sleep(lock_stack * owner)
31151 +{
31152 +       assert("nikita-1847", owner == get_current_lock_stack());
31153 +       /* NOTE(Zam): We cannot reset the lock semaphore here because it may
31154 +          clear wake-up signal. The initial design was to re-check all
31155 +          conditions under which we continue locking, release locks or sleep
31156 +          until conditions are changed. However, even lock.c does not follow
31157 +          that design.  So, wake-up signal which is stored in semaphore state
31158 +          could be lost by semaphore reset.  The less complex scheme without
31159 +          resetting the semaphore is enough to not lose wake-ups.
31160 +
31161 +       if (0) {
31162 +
31163 +                  NOTE-NIKITA: I commented call to sema_init() out hoping
31164 +                  that it is the reason of thread sleeping in
31165 +                  down(&owner->sema) without any other thread running.
31166 +
31167 +                  Anyway, it is just an optimization: if semaphore is not
31168 +                  reinitialised at this point, in the worst case
31169 +                  longterm_lock_znode() would have to iterate its loop once
31170 +                  more.
31171 +               spin_lock_stack(owner);
31172 +               sema_init(&owner->sema, 0);
31173 +               spin_unlock_stack(owner);
31174 +       }
31175 +       */
31176 +
31177 +       /* We return -E_DEADLOCK if one or more "give me the lock" messages are
31178 +        * counted in nr_signaled */
31179 +       if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
31180 +               assert("zam-959", !owner->curpri);
31181 +               return RETERR(-E_DEADLOCK);
31182 +       }
31183 +       return 0;
31184 +}
31185 +
31186 +/* Wakes up a single thread */
31187 +reiser4_internal void
31188 +__reiser4_wake_up(lock_stack * owner)
31189 +{
31190 +       up(&owner->sema);
31191 +}
31192 +
31193 +/* Puts a thread to sleep */
31194 +reiser4_internal void
31195 +__go_to_sleep(lock_stack * owner
31196 +#if REISER4_STATS
31197 +           , int node_level
31198 +#endif
31199 +)
31200 +{
31201 +#if REISER4_STATS
31202 +       unsigned long sleep_start = jiffies;
31203 +#endif
31204 +       /* Well, we might sleep here, so holding of any spinlocks is no-no */
31205 +       assert("nikita-3027", schedulable());
31206 +       /* return down_interruptible(&owner->sema); */
31207 +       down(&owner->sema);
31208 +#if REISER4_STATS
31209 +       switch (node_level) {
31210 +           case ADD_TO_SLEPT_IN_WAIT_EVENT:
31211 +                   reiser4_stat_add(txnmgr.slept_in_wait_event, jiffies - sleep_start);
31212 +                   break;
31213 +           case ADD_TO_SLEPT_IN_WAIT_ATOM:
31214 +                   reiser4_stat_add(txnmgr.slept_in_wait_atom, jiffies - sleep_start);
31215 +                   break;
31216 +           default:
31217 +                   reiser4_stat_add_at_level(node_level, time_slept,
31218 +                                             jiffies - sleep_start);
31219 +       }
31220 +#endif
31221 +}
31222 +
31223 +reiser4_internal int
31224 +lock_stack_isclean(lock_stack * owner)
31225 +{
31226 +       if (locks_list_empty(&owner->locks)) {
31227 +               assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
31228 +               return 1;
31229 +       }
31230 +
31231 +       return 0;
31232 +}
31233 +
31234 +#if REISER4_DEBUG_OUTPUT
31235 +/* Debugging help */
31236 +reiser4_internal void
31237 +print_lock_stack(const char *prefix, lock_stack * owner)
31238 +{
31239 +       lock_handle *handle;
31240 +
31241 +       spin_lock_stack(owner);
31242 +
31243 +       printk("%s:\n", prefix);
31244 +       printk(".... nr_signaled %d\n", atomic_read(&owner->nr_signaled));
31245 +       printk(".... curpri %s\n", owner->curpri ? "high" : "low");
31246 +
31247 +       if (owner->request.mode != 0) {
31248 +               printk(".... current request: %s", owner->request.mode == ZNODE_WRITE_LOCK ? "write" : "read");
31249 +               print_address("", znode_get_block(owner->request.node));
31250 +       }
31251 +
31252 +       printk(".... current locks:\n");
31253 +
31254 +       for_all_type_safe_list(locks, &owner->locks, handle) {
31255 +               if (handle->node != NULL)
31256 +                       print_address(znode_is_rlocked(handle->node) ?
31257 +                                     "......  read" : "...... write", znode_get_block(handle->node));
31258 +       }
31259 +
31260 +       spin_unlock_stack(owner);
31261 +}
31262 +#endif
31263 +
31264 +#if REISER4_DEBUG
31265 +
31266 +/*
31267 + * debugging functions
31268 + */
31269 +
31270 +/* check consistency of locking data-structures hanging of the @stack */
31271 +void
31272 +check_lock_stack(lock_stack * stack)
31273 +{
31274 +       spin_lock_stack(stack);
31275 +       /* check that stack->locks is not corrupted */
31276 +       locks_list_check(&stack->locks);
31277 +       spin_unlock_stack(stack);
31278 +}
31279 +
31280 +/* check consistency of locking data structures */
31281 +void
31282 +check_lock_data(void)
31283 +{
31284 +       check_lock_stack(&get_current_context()->stack);
31285 +}
31286 +
31287 +/* check consistency of locking data structures for @node */
31288 +void
31289 +check_lock_node_data(znode * node)
31290 +{
31291 +       RLOCK_ZLOCK(&node->lock);
31292 +       owners_list_check(&node->lock.owners);
31293 +       requestors_list_check(&node->lock.requestors);
31294 +       RUNLOCK_ZLOCK(&node->lock);
31295 +}
31296 +
31297 +/* check that given lock request is dead lock safe. This check is, of course,
31298 + * not exhaustive. */
31299 +static int
31300 +request_is_deadlock_safe(znode * node, znode_lock_mode mode,
31301 +                        znode_lock_request request)
31302 +{
31303 +       lock_stack *owner;
31304 +
31305 +       owner = get_current_lock_stack();
31306 +       /*
31307 +        * check that hipri lock request is not issued when there are locked
31308 +        * nodes at the higher levels.
31309 +        */
31310 +       if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) &&
31311 +           znode_get_level(node) != 0) {
31312 +               lock_handle *item;
31313 +
31314 +               for_all_type_safe_list(locks, &owner->locks, item) {
31315 +                       znode *other = item->node;
31316 +
31317 +                       if (znode_get_level(other) == 0)
31318 +                               continue;
31319 +                       if (znode_get_level(other) > znode_get_level(node))
31320 +                               return 0;
31321 +               }
31322 +       }
31323 +       return 1;
31324 +}
31325 +
31326 +#endif
31327 +
31328 +/* return pointer to static storage with name of lock_mode. For
31329 +    debugging */
31330 +reiser4_internal const char *
31331 +lock_mode_name(znode_lock_mode lock /* lock mode to get name of */ )
31332 +{
31333 +       if (lock == ZNODE_READ_LOCK)
31334 +               return "read";
31335 +       else if (lock == ZNODE_WRITE_LOCK)
31336 +               return "write";
31337 +       else {
31338 +               static char buf[30];
31339 +
31340 +               sprintf(buf, "unknown: %i", lock);
31341 +               return buf;
31342 +       }
31343 +}
31344 +
31345 +/* Make Linus happy.
31346 +   Local variables:
31347 +   c-indentation-style: "K&R"
31348 +   mode-name: "LC"
31349 +   c-basic-offset: 8
31350 +   tab-width: 8
31351 +   fill-column: 120
31352 +   End:
31353 +*/
31354 diff -rupN linux-2.6.8-rc3/fs/reiser4/lock.h linux-2.6.8-rc3-a/fs/reiser4/lock.h
31355 --- linux-2.6.8-rc3/fs/reiser4/lock.h   1970-01-01 03:00:00.000000000 +0300
31356 +++ linux-2.6.8-rc3-a/fs/reiser4/lock.h 2004-08-05 21:20:53.291616565 +0400
31357 @@ -0,0 +1,270 @@
31358 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
31359 +
31360 +/* Long term locking data structures. See lock.c for details. */
31361 +
31362 +#ifndef __LOCK_H__
31363 +#define __LOCK_H__
31364 +
31365 +#include "forward.h"
31366 +#include "debug.h"
31367 +#include "dformat.h"
31368 +#include "spin_macros.h"
31369 +#include "key.h"
31370 +#include "coord.h"
31371 +#include "type_safe_list.h"
31372 +#include "plugin/node/node.h"
31373 +#include "jnode.h"
31374 +#include "readahead.h"
31375 +
31376 +#include <linux/types.h>
31377 +#include <linux/spinlock.h>
31378 +#include <linux/pagemap.h>     /* for PAGE_CACHE_SIZE */
31379 +#include <asm/atomic.h>
31380 +#include <asm/semaphore.h>
31381 +
31382 +/* per-znode lock requests queue; list items are lock owner objects
31383 +   which want to lock given znode.
31384 +
31385 +   Locking: protected by znode spin lock. */
31386 +TYPE_SAFE_LIST_DECLARE(requestors);
31387 +/* per-znode list of lock handles for this znode
31388 +
31389 +   Locking: protected by znode spin lock. */
31390 +TYPE_SAFE_LIST_DECLARE(owners);
31391 +/* per-owner list of lock handles that point to locked znodes which
31392 +   belong to one lock owner
31393 +
31394 +   Locking: this list is only accessed by the thread owning the lock stack this
31395 +   list is attached to. Hence, no locking is necessary.
31396 +*/
31397 +TYPE_SAFE_LIST_DECLARE(locks);
31398 +
31399 +/* Per-znode lock object */
31400 +struct zlock {
31401 +       reiser4_rw_data guard;
31402 +       /* The number of readers if positive; the number of recursively taken
31403 +          write locks if negative. Protected by zlock spin lock. */
31404 +       int nr_readers;
31405 +       /* A number of processes (lock_stacks) that have this object
31406 +          locked with high priority */
31407 +       unsigned nr_hipri_owners;
31408 +       /* A number of attempts to lock znode in high priority direction */
31409 +       unsigned nr_hipri_requests;
31410 +       /* A linked list of lock_handle objects that contains pointers
31411 +          for all lock_stacks which have this lock object locked */
31412 +       owners_list_head owners;
31413 +       /* A linked list of lock_stacks that wait for this lock */
31414 +       requestors_list_head requestors;
31415 +};
31416 +
31417 +#define rw_ordering_pred_zlock(lock)                   \
31418 +         (lock_counters()->spin_locked_stack == 0)
31419 +
31420 +/* Define spin_lock_zlock, spin_unlock_zlock, etc. */
31421 +RW_LOCK_FUNCTIONS(zlock, zlock, guard);
31422 +
31423 +#define lock_is_locked(lock)          ((lock)->nr_readers != 0)
31424 +#define lock_is_rlocked(lock)         ((lock)->nr_readers > 0)
31425 +#define lock_is_wlocked(lock)         ((lock)->nr_readers < 0)
31426 +#define lock_is_wlocked_once(lock)    ((lock)->nr_readers == -1)
31427 +#define lock_can_be_rlocked(lock)     ((lock)->nr_readers >=0)
31428 +#define lock_mode_compatible(lock, mode) \
31429 +             (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) \
31430 +           || ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
31431 +
31432 +
31433 +/* Since we have R/W znode locks we need additional bidirectional `link'
31434 +   objects to implement n<->m relationship between lock owners and lock
31435 +   objects. We call them `lock handles'.
31436 +
31437 +   Locking: see lock.c/"SHORT-TERM LOCKING"
31438 +*/
31439 +struct lock_handle {
31440 +       /* This flag indicates that a signal to yield a lock was passed to
31441 +          lock owner and counted in owner->nr_signaled
31442 +
31443 +          Locking: this is accessed under spin lock on ->node.
31444 +       */
31445 +       int signaled;
31446 +       /* A link to owner of a lock */
31447 +       lock_stack *owner;
31448 +       /* A link to znode locked */
31449 +       znode *node;
31450 +       /* A list of all locks for a process */
31451 +       locks_list_link locks_link;
31452 +       /* A list of all owners for a znode */
31453 +       owners_list_link owners_link;
31454 +};
31455 +
31456 +typedef struct lock_request {
31457 +       /* A pointer to uninitialized link object */
31458 +       lock_handle *handle;
31459 +       /* A pointer to the object we want to lock */
31460 +       znode *node;
31461 +       /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
31462 +       znode_lock_mode mode;
31463 +} lock_request;
31464 +
31465 +/* A lock stack structure for accumulating locks owned by a process */
31466 +struct lock_stack {
31467 +       /* A guard lock protecting a lock stack */
31468 +       reiser4_spin_data sguard;
31469 +       /* number of znodes which were requested by high priority processes */
31470 +       atomic_t nr_signaled;
31471 +       /* Current priority of a process
31472 +
31473 +          This is only accessed by the current thread and thus requires no
31474 +          locking.
31475 +       */
31476 +       int curpri;
31477 +       /* A list of all locks owned by this process. Elements can be added to
31478 +        * this list only by the current thread. ->node pointers in this list
31479 +        * can be only changed by the current thread. */
31480 +       locks_list_head locks;
31481 +       int nr_locks; /* number of lock handles in the above list */
31482 +       /* When lock_stack waits for the lock, it puts itself on double-linked
31483 +          requestors list of that lock */
31484 +       requestors_list_link requestors_link;
31485 +       /* Current lock request info.
31486 +
31487 +          This is only accessed by the current thread and thus requires no
31488 +          locking.
31489 +       */
31490 +       lock_request request;
31491 +       /* The lock_stack's synchronization object: the owning process sleeps
31492 +          on it when a lock it wishes to add to this lock_stack is not
31493 +          immediately available. It is used instead of a wait_queue_t object
31494 +          due to locking problems (lost wake up). A "lost wakeup" occurs when
31495 +          a process is woken up before it actually becomes 'sleepy' (through
31496 +          sleep_on()). Using a semaphore object is the simplest way to avoid
31497 +          that problem.
31498 +
31499 +          A semaphore is used in the following way: only the process that is
31500 +          the owner of the lock_stack initializes it (to zero) and calls
31501 +          down(sema) on it. Usually this causes the process to sleep on the
31502 +          semaphore. Other processes may wake him up by calling up(sema). The
31503 +          advantage to a semaphore is that up() and down() calls are not
31504 +          required to preserve order. Unlike wait_queue it works when process
31505 +          is woken up before getting to sleep.
31506 +
31507 +          NOTE-NIKITA: Transaction manager is going to have condition variables
31508 +          (&kcondvar_t) anyway, so this probably will be replaced with
31509 +          one in the future.
31510 +
31511 +          After further discussion, Nikita has shown me that Zam's implementation is
31512 +          exactly a condition variable.  The znode's {zguard,requestors_list} represents
31513 +          condition variable and the lock_stack's {sguard,semaphore} guards entry and
31514 +          exit from the condition variable's wait queue.  But the existing code can't
31515 +          just be replaced with a more general abstraction, and I think its fine the way
31516 +          it is. */
31517 +       struct semaphore sema;
31518 +};
31519 +
31520 +/* defining of list manipulation functions for lists above */
31521 +TYPE_SAFE_LIST_DEFINE(requestors, lock_stack, requestors_link);
31522 +TYPE_SAFE_LIST_DEFINE(owners, lock_handle, owners_link);
31523 +TYPE_SAFE_LIST_DEFINE(locks, lock_handle, locks_link);
31524 +
31525 +/*
31526 +  User-visible znode locking functions
31527 +*/
31528 +
31529 +extern int longterm_lock_znode   (lock_handle * handle,
31530 +                                 znode * node,
31531 +                                 znode_lock_mode mode,
31532 +                                 znode_lock_request request);
31533 +
31534 +extern void longterm_unlock_znode(lock_handle * handle);
31535 +
31536 +extern int check_deadlock(void);
31537 +
31538 +extern lock_stack *get_current_lock_stack(void);
31539 +
31540 +extern void init_lock_stack(lock_stack * owner);
31541 +extern void reiser4_init_lock(zlock * lock);
31542 +
31543 +extern void init_lh(lock_handle *);
31544 +extern void move_lh(lock_handle * new, lock_handle * old);
31545 +extern void copy_lh(lock_handle * new, lock_handle * old);
31546 +extern void done_lh(lock_handle *);
31547 +extern znode_lock_mode lock_mode(lock_handle *);
31548 +
31549 +extern int prepare_to_sleep(lock_stack * owner);
31550 +
31551 +#if REISER4_STATS
31552 +/* pseudo-levels for __go_to_sleep(): account the sleep in txnmgr counters */
31553 +#define ADD_TO_SLEPT_IN_WAIT_EVENT (-1)
31554 +#define ADD_TO_SLEPT_IN_WAIT_ATOM  (-2)
31555 +
31556 +/* with REISER4_STATS __go_to_sleep() takes an additional @level argument for
31557 + * per-level sleep statistics. go_to_sleep() hides the prototype difference;
31558 + * it expands to a plain expression, so callers supply the semicolon. */
31559 +void __go_to_sleep(lock_stack*, int);
31560 +#define go_to_sleep(owner, level) __go_to_sleep(owner, level)
31561 +
31562 +#else
31563 +
31564 +void __go_to_sleep(lock_stack*);
31565 +#define go_to_sleep(owner, level) __go_to_sleep(owner)
31566 +
31567 +#endif
31568 +
31569 +extern void __reiser4_wake_up(lock_stack * owner);
31570 +
31571 +extern int lock_stack_isclean(lock_stack * owner);
31572 +
31573 +/* zlock object state check macros: only used in assertions.  Both forms imply that the
31574 +   lock is held by the current thread. */
31575 +extern int znode_is_write_locked(const znode * node);
31576 +
31577 +#if REISER4_DEBUG
31578 +#define spin_ordering_pred_stack_addendum (1)
31579 +#else
31580 +#define spin_ordering_pred_stack_addendum              \
31581 +        ((lock_counters()->rw_locked_dk == 0) &&       \
31582 +         (lock_counters()->rw_locked_tree == 0))
31583 +#endif
31584 +/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
31585 +#define spin_ordering_pred_stack(stack)                                \
31586 +       ((lock_counters()->spin_locked_stack == 0) &&           \
31587 +        (lock_counters()->spin_locked_txnmgr == 0) &&          \
31588 +        (lock_counters()->spin_locked_super == 0) &&           \
31589 +        (lock_counters()->spin_locked_inode_object == 0) &&    \
31590 +        (lock_counters()->rw_locked_cbk_cache == 0) && \
31591 +        (lock_counters()->spin_locked_epoch == 0) &&           \
31592 +        (lock_counters()->spin_locked_super_eflush == 0) &&    \
31593 +        spin_ordering_pred_stack_addendum)
31594 +
31595 +/* Same for lock_stack */
31596 +SPIN_LOCK_FUNCTIONS(stack, lock_stack, sguard);
31597 +
31598 +static inline void
31599 +reiser4_wake_up(lock_stack * owner)
31600 +{
31601 +       spin_lock_stack(owner);
31602 +       __reiser4_wake_up(owner);
31603 +       spin_unlock_stack(owner);
31604 +}
31605 +
31606 +const char *lock_mode_name(znode_lock_mode lock);
31607 +
31608 +#if REISER4_DEBUG
31609 +extern void check_lock_data(void);
31610 +extern void check_lock_node_data(znode * node);
31611 +#else /* stubs must accept the same arguments as the real functions */
31612 +#define check_lock_data() noop
31613 +#define check_lock_node_data(node) noop
31614 +#endif
31615 +
31616 +/* __LOCK_H__ */
31617 +#endif
31618 +
31619 +/* Make Linus happy.
31620 +   Local variables:
31621 +   c-indentation-style: "K&R"
31622 +   mode-name: "LC"
31623 +   c-basic-offset: 8
31624 +   tab-width: 8
31625 +   fill-column: 120
31626 +   End:
31627 +*/
31628 diff -rupN linux-2.6.8-rc3/fs/reiser4/log.c linux-2.6.8-rc3-a/fs/reiser4/log.c
31629 --- linux-2.6.8-rc3/fs/reiser4/log.c    1970-01-01 03:00:00.000000000 +0300
31630 +++ linux-2.6.8-rc3-a/fs/reiser4/log.c  2004-08-05 21:20:53.080661061 +0400
31631 @@ -0,0 +1,518 @@
31632 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
31633 + * reiser4/README */
31634 +
31635 +/* Tree-tracing facility. Copied from reiserfs v3.x patch, never released */
31636 +
31637 +/*
31638 + * Tree-tracing is enabled by REISER4_EVENT_LOG compile option, and
31639 + * log_file=<path> mount option.
31640 + *
31641 + * File at <path> is opened (created if needed) and filled with log records
31642 + * while file system is mounted.
31643 + *
31644 + * Special path /dev/null disables logging.
31645 + *
31646 + *
31647 + * Special path /dev/console is interpreted as outputting log records through
31648 + * printk().
31649 + *
31650 + * Low-level functions to output log record are write_log() and
31651 + * write_log_raw(). Various macros defined in log.h are used as wrappers.
31652 + *
31653 + * Output to log file is buffered to reduce overhead, but as target file
31654 + * system (one where log file lives) also buffers data in memory, tracing
31655 + * can distort file system behavior significantly. It has been experimentally
31656 + * found that the optimal way to log is by using log_file=<pipe> and piping
31657 + * log records to another host (through netcat(1) or similar, for example).
31658 + *
31659 + */
31660 +
31661 +#include "forward.h"
31662 +#include "debug.h"
31663 +#include "key.h"
31664 +#include "log.h"
31665 +#include "super.h"
31666 +#include "inode.h"
31667 +#include "page_cache.h" /* for jprivate() */
31668 +
31669 +#include <asm/uaccess.h>
31670 +#include <linux/types.h>
31671 +#include <linux/fs.h>          /* for struct super_block  */
31672 +#include <linux/slab.h>
31673 +#include <linux/bio.h>
31674 +#include <linux/vmalloc.h>
31675 +
31676 +#if REISER4_LOG
31677 +
31678 +static int log_flush(reiser4_log_file * log);
31679 +static int free_space(reiser4_log_file * log, size_t * len);
31680 +static int lock_log(reiser4_log_file * log);
31681 +static void unlock_log(reiser4_log_file * log);
31682 +
31683 +/* helper macro: lock log file, return with error if locking failed. */
31684 +#define LOCK_OR_FAIL( log )                    \
31685 +({                                             \
31686 +       int __result;                           \
31687 +                                               \
31688 +       __result = lock_log( log );             \
31689 +       if( __result != 0 )                     \
31690 +               return __result;                \
31691 +})
31692 +
31693 +/* open log file. This is called by mount, when log_file=<path> option is
31694 + * used. Returns 0 on success, -ENOMEM or -EINVAL, or the error reported by
31695 + * filp_open(). */
31696 +int
31697 +open_log_file(struct super_block *super, const char *file_name,
31698 +               size_t size, reiser4_log_file * log)
31699 +{
31700 +       int gfp_mask;
31701 +       long err;
31702 +
31703 +       assert("nikita-2498", file_name != NULL);
31704 +       assert("nikita-2499", log != NULL);
31705 +       assert("nikita-2500", size > 0);
31706 +
31707 +       xmemset(log, 0, sizeof *log);
31708 +       spin_lock_init(&log->lock);
31709 +       INIT_LIST_HEAD(&log->wait);
31710 +
31711 +       /* special case: disable logging */
31712 +       if (!strcmp(file_name, "/dev/null")) {
31713 +               log->type = log_to_bucket;
31714 +               return 0;
31715 +       }
31716 +       log->buf = vmalloc(size);
31717 +       if (log->buf == NULL)
31718 +               return RETERR(-ENOMEM);
31719 +       log->size = size;
31720 +
31721 +       /* special case: log through printk() */
31722 +       if (!strcmp(file_name, "/dev/console")) {
31723 +               log->type = log_to_console;
31724 +               return 0;
31725 +       }
31726 +       log->fd = filp_open(file_name, O_CREAT | O_WRONLY, S_IFREG | S_IWUSR);
31727 +       if (IS_ERR(log->fd)) {
31728 +               /* fetch the error code before the pointer is cleared */
31729 +               err = PTR_ERR(log->fd);
31730 +               warning("nikita-2501", "cannot open log file '%s': %li", file_name, err);
31731 +               log->fd = NULL;
31732 +               return err;
31733 +       }
31734 +       if (log->fd->f_dentry->d_inode->i_sb == super) {
31735 +               warning("nikita-2506", "Refusing to log onto logd fs");
31736 +               return RETERR(-EINVAL);
31737 +       }
31738 +       log->fd->f_dentry->d_inode->i_flags |= S_NOATIME;
31739 +       log->fd->f_flags |= O_APPEND;
31740 +
31741 +       /* avoid complications with calling memory allocator by ->write()
31742 +        * method of target file system, by setting GFP_NOFS in mapping->gfp_mask */
31743 +       gfp_mask = mapping_gfp_mask(log->fd->f_dentry->d_inode->i_mapping);
31744 +       gfp_mask &= ~__GFP_FS;
31745 +       gfp_mask |= GFP_NOFS;
31746 +       mapping_set_gfp_mask(log->fd->f_dentry->d_inode->i_mapping, gfp_mask);
31747 +       log->type = log_to_file;
31748 +       return 0;
31749 +}
31750 +
31751 +/* write message (formatted according to @format) into log file @file. Returns 0, or error from lock_log()/free_space() */
31752 +int
31753 +write_log(reiser4_log_file * file, const char *format, ...)
31754 +{
31755 +       size_t len;
31756 +       int result;
31757 +       va_list args;
31758 +
31759 +       if (file == NULL || file->type == log_to_bucket ||
31760 +           file->buf == NULL || file->disabled > 0)
31761 +               return 0;
31762 +       /* dry run with zero size: only compute the length, plus the NUL */
31763 +       va_start(args, format);
31764 +       len = vsnprintf((char *) format, 0, format, args) + 1;
31765 +       va_end(args);
31766 +       LOCK_OR_FAIL(file);
31767 +       result = free_space(file, &len);
31768 +       if (result == 0) {
31769 +               /* free_space() may have clamped len: bound the output by it */
31770 +               va_start(args, format);
31771 +               vsnprintf(file->buf + file->used, len, format, args);
31772 +               file->used += len - 1;
31773 +               va_end(args);
31774 +       }
31775 +       unlock_log(file);
31776 +       return result;
31777 +}
31778 +
31779 +/* write buffer @data into @file */
31780 +int
31781 +write_log_raw(reiser4_log_file * file, const void *data, size_t len)
31782 +{
31783 +       int result;
31784 +
31785 +       if (file == NULL || file->type == log_to_bucket ||
31786 +           file->buf == NULL || file->disabled > 0)
31787 +               return 0;
31788 +
31789 +       LOCK_OR_FAIL(file);
31790 +       result = free_space(file, &len);
31791 +       if (result == 0) {
31792 +               xmemcpy(file->buf + file->used, data, (size_t) len);
31793 +               file->used += len;
31794 +       }
31795 +       unlock_log(file);
31796 +       return result;
31797 +}
31798 +
31799 +/* close log file. This is called by umount. */
31800 +void
31801 +close_log_file(reiser4_log_file * log)
31802 +{
31803 +       if (log->type == log_to_file && lock_log(log) == 0) {
31804 +               log_flush(log);
31805 +               unlock_log(log);
31806 +       }
31807 +       if (log->fd != NULL)
31808 +               filp_close(log->fd, NULL);
31809 +       if (log->buf != NULL) {
31810 +               vfree(log->buf);
31811 +               log->buf = NULL;
31812 +       }
31813 +}
31814 +
31815 +/* temporary suspend (or resume) tracing */
31816 +int
31817 +hold_log(reiser4_log_file * file, int flag)
31818 +{
31819 +       if (flag)
31820 +               return lock_log(file);
31821 +       else {
31822 +               unlock_log(file);
31823 +               return 0;
31824 +       }
31825 +}
31826 +
31827 +/* disable or enable tracing */
31828 +int
31829 +disable_log(reiser4_log_file * file, int flag)
31830 +{
31831 +       LOCK_OR_FAIL(file);
31832 +       file->disabled += flag ? +1 : -1;
31833 +       unlock_log(file);
31834 +       return 0;
31835 +}
31836 +
31837 +#define START_KERNEL_IO                                \
31838 +        {                                      \
31839 +               mm_segment_t __ski_old_fs;      \
31840 +                                               \
31841 +               __ski_old_fs = get_fs();        \
31842 +               set_fs( KERNEL_DS )
31843 +
31844 +#define END_KERNEL_IO                          \
31845 +               set_fs( __ski_old_fs );         \
31846 +       }
31847 +
31848 +struct __wlink {
31849 +       struct list_head link;
31850 +       struct semaphore sema;
31851 +};
31852 +
31853 +/* lock log file for exclusive use. Returns 0 with log->lock held; if the
31854 + * sleep is interrupted, returns the error with log->lock released. */
31855 +static int
31856 +lock_log(reiser4_log_file * log)
31857 +{
31858 +       spin_lock(&log->lock);
31859 +       while (log->long_term) {
31860 +               /* long-term holder is active: sleep on a private semaphore */
31861 +               struct __wlink link;
31862 +               int ret;
31863 +               sema_init(&link.sema, 0);
31864 +               list_add(&link.link, &log->wait);
31865 +               spin_unlock(&log->lock);
31866 +               ret = down_interruptible(&link.sema);
31867 +               spin_lock(&log->lock);
31868 +               list_del(&link.link);
31869 +               if (ret != 0) {
31870 +                       spin_unlock(&log->lock);
31871 +                       return ret;
31872 +               }
31873 +       }
31874 +       return 0;
31875 +}
31876 +
31877 +/* unlock log file */
31878 +static void
31879 +unlock_log(reiser4_log_file * log)
31880 +{
31881 +       spin_unlock(&log->lock);
31882 +}
31883 +
31884 +static void convert_to_longterm (reiser4_log_file * log)
31885 +{
31886 +       assert ("zam-833", log->long_term == 0);
31887 +       log->long_term = 1;
31888 +       spin_unlock(&log->lock);
31889 +}
31890 +
31891 +static void convert_to_shortterm (reiser4_log_file * log)
31892 +{
31893 +       struct list_head * pos;
31894 +
31895 +       spin_lock(&log->lock);
31896 +       assert ("zam-834", log->long_term);
31897 +       log->long_term = 0;
31898 +       list_for_each(pos, &log->wait) {
31899 +               struct __wlink * link;
31900 +               link = list_entry(pos, struct __wlink, link);
31901 +               up(&link->sema);
31902 +       }
31903 +}
31904 +
31905 +/*
31906 + * flush content of the file->buf to the logging target. Free space in buffer.
31907 + * Called with file->lock held; returns a negative error code on failure.
31908 + */
31909 +static int
31910 +log_flush(reiser4_log_file * file)
31911 +{
31912 +       int result = 0;
31913 +
31914 +       switch (file->type) {
31915 +       case log_to_file:{
31916 +               struct file *fd;
31917 +
31918 +               /* release the spin lock for the duration of the write */
31919 +               convert_to_longterm(file);
31920 +               /*
31921 +                * if logging to the file, call vfs_write() until all data are
31922 +                * written
31923 +                */
31924 +               fd = file->fd;
31925 +               if (fd && fd->f_op != NULL && fd->f_op->write != NULL) {
31926 +                       int written;
31927 +
31928 +                       written = 0;
31929 +                       START_KERNEL_IO;
31930 +                       while (file->used > 0) {
31931 +                               result = vfs_write(fd, file->buf + written,
31932 +                                                  file->used, &fd->f_pos);
31933 +                               if (result > 0) {
31934 +                                       file->used -= result;
31935 +                                       written += result;
31936 +                               } else {
31937 +                                       static int log_io_failed = 0;
31938 +
31939 +                                       if (IS_POW(log_io_failed))
31940 +                                               warning("nikita-2502",
31941 +                                                       "Error writing log: %i",
31942 +                                                       result);
31943 +                                       ++ log_io_failed;
31944 +                                       break;
31945 +                               }
31946 +                       }
31947 +                       END_KERNEL_IO;
31948 +               } else {
31949 +                       warning("nikita-2504", "no ->write() in log-file");
31950 +                       result = RETERR(-EINVAL);
31951 +               }
31952 +               convert_to_shortterm(file);
31953 +               break;
31954 +       }
31955 +       default:
31956 +               warning("nikita-2505",
31957 +                       "unknown log-file type: %i. Dumping to console",
31958 +                       file->type);
31959 +               /* fall through */
31960 +       case log_to_console:
31961 +               /* never use log data as the format: it may contain '%' */
31962 +               if (file->buf != NULL)
31963 +                       printk("%s", file->buf);
31964 +               /* fall through */
31965 +       case log_to_bucket:
31966 +               file->used = 0;
31967 +               break;
31968 +       }
31969 +
31970 +       return result;
31971 +}
31972 +
31973 +/*
31974 + * free *@len bytes in file->buf, flushing via log_flush(); oversized *@len is
31975 + * clamped to file->size. Returns 0, or log_flush()'s negative result. */
31976 +static int
31977 +free_space(reiser4_log_file * file, size_t * len)
31978 +{
31979 +       if (*len > file->size) {
31980 +               warning("nikita-2503",
31981 +                       "log record too large: %lu > %lu. Truncating",
31982 +                       (unsigned long) *len, (unsigned long) file->size);
31983 +               *len = file->size;
31984 +       }
31985 +       while (*len > file->size - file->used) {
31986 +               int result;
31987 +
31988 +               /* flushing can sleep, so loop */
31989 +               result = log_flush(file);
31990 +               if (result < 0)
31991 +                       return result;
31992 +       }
31993 +       return 0;
31994 +}
31995 +
31996 +/*
31997 + * log tree operation @op. @tree is not referenced in the body.
31998 + */
31999 +void
32000 +write_tree_log(reiser4_tree * tree, reiser4_log_op op, ...)
32001 +{
32002 +       va_list args;
32003 +       char buf[200];  /* NOTE(review): filled by unbounded sprintf()s below */
32004 +       char *rest;     /* next free position in buf */
32005 +       reiser4_key *key;
32006 +
32007 +       if (unlikely(in_interrupt() || in_irq())) {
32008 +               printk("cannot write log from interrupt\n");
32009 +               return;
32010 +       }
32011 +
32012 +       /*
32013 +        * For each operation arguments are provided by the caller. Number and
32014 +        * type of arguments depends on operation type. Use va_args to extract
32015 +        * them.
32016 +        */
32017 +
32018 +       /*
32019 +        * tree_cut:    opcode, key_from, key_to
32020 +        *
32021 +        * tree_lookup: opcode, key
32022 +        *
32023 +        * tree_insert: opcode, key, item_data, coord, flags
32024 +        *
32025 +        * tree_paste:  opcode, key, item_data, coord, flags
32026 +        *
32027 +        * tree_cached: opcode
32028 +        *
32029 +        * tree_exit:   opcode
32030 +        * A key is read first for every op except tree_cached/tree_exit.
32031 +        */
32032 +       va_start(args, op);
32033 +
32034 +       rest = buf;
32035 +       rest += sprintf(rest, "....tree %c ", op);      /* op value is the marker character */
32036 +
32037 +       if (op != tree_cached && op != tree_exit) {
32038 +               key = va_arg(args, reiser4_key *);
32039 +               rest += sprintf_key(rest, key);
32040 +               *rest++ = ' ';
32041 +               *rest = '\0';
32042 +
32043 +               switch (op) {
32044 +               case tree_cut: {
32045 +                       reiser4_key *to;
32046 +
32047 +                       to = va_arg(args, reiser4_key *);
32048 +                       rest += sprintf_key(rest, to);
32049 +                       break;
32050 +               }
32051 +               case tree_lookup:
32052 +               default:        /* nothing beyond the key is formatted */
32053 +                       break;
32054 +               case tree_insert:
32055 +               case tree_paste: {
32056 +                       reiser4_item_data *data;
32057 +                       coord_t *coord;
32058 +                       __u32 flags;
32059 +
32060 +                       data = va_arg(args, reiser4_item_data *);
32061 +                       coord = va_arg(args, coord_t *);
32062 +                       flags = va_arg(args, __u32);
32063 +
32064 +                       rest += sprintf(rest, "%s (%u,%u) %x",
32065 +                                       data->iplug->h.label,
32066 +                                       coord->item_pos, coord->unit_pos, flags);
32067 +               }       /* last case: falls out of the switch */
32068 +               }
32069 +       }
32070 +       va_end(args);
32071 +       write_current_logf(WRITE_TREE_LOG, "%s", buf);
32072 +}
32073 +
32074 +/* format a short description of @j into @buf ("null" for NULL); returns @buf */
32075 +char *
32076 +jnode_short_info(const jnode *j, char *buf)
32077 +{
32078 +       char kind;      /* 'Z' for znode, 'J' for unformatted, '?' otherwise */
32079 +       char reloc;     /* 'O' if JNODE_OVRWR, 'R' if JNODE_RELOC, ' ' otherwise */
32080 +
32081 +       if (j == NULL) {
32082 +               sprintf(buf, "null");
32083 +               return buf;
32084 +       }
32085 +       kind = jnode_is_znode(j) ? 'Z' : jnode_is_unformatted(j) ? 'J' : '?';
32086 +       reloc = JF_ISSET(j, JNODE_OVRWR) ? 'O' : JF_ISSET(j, JNODE_RELOC) ? 'R' : ' ';
32087 +       sprintf(buf, "%i %c %c %i", jnode_get_level(j), kind, reloc,
32088 +               j->atom ? j->atom->atom_id : -1);
32089 +       return buf;
32090 +}
32091 +
32092 +
32093 +/* log one record for @node: its block address and short description */
32094 +void
32095 +write_node_log(const jnode *node)
32096 +{
32097 +       char descr[100];
32098 +       const char *info = jnode_short_info(node, descr);
32099 +
32100 +       write_current_logf(WRITE_NODE_LOG, ".....node %s %s",
32101 +                          sprint_address(jnode_get_block(node)), info);
32102 +}
32103 +
32104 +/* write page description in the log; oid is cast to match %llu on any arch */
32105 +void
32106 +write_page_log(const struct address_space *mapping, unsigned long index)
32107 +{
32108 +       write_current_logf(WRITE_PAGE_LOG, ".....page %llu %lu",
32109 +                          (unsigned long long) get_inode_oid(mapping->host), index);
32110 +}
32111 +
32112 +/* write block IO request description in the log */
32113 +void
32114 +write_io_log(const char *moniker, int rw, struct bio *bio)
32115 +{
32116 +       struct super_block *super;
32117 +       reiser4_super_info_data *sbinfo;
32118 +       reiser4_block_nr start;
32119 +       char jbuf[100];
32120 +
32121 +       /*
32122 +        * sbinfo->last_touched is last block where IO was issued to. It is
32123 +        * used to output seek distance into log.
32124 +        */
32125 +
32126 +       super = reiser4_get_current_sb();
32127 +       sbinfo = get_super_private(super);
32128 +
32129 +       start = bio->bi_sector >> (super->s_blocksize_bits - 9);        /* sectors -> blocks */
32130 +       jnode_short_info(jprivate(bio->bi_io_vec[0].bv_page), jbuf);
32131 +       write_current_logf(WRITE_IO_LOG, "......bio %s %c %+lli  (%llu,%u) %s",
32132 +                          moniker, (rw == READ) ? 'r' : 'w',
32133 +                          (long long) (start - sbinfo->last_touched - 1),
32134 +                          (unsigned long long) start, bio->bi_vcnt, jbuf);
32135 +       sbinfo->last_touched = start + bio->bi_vcnt - 1;
32136 +}
32137 +
32138 +#endif
32139 +
32140 +/* Make Linus happy.
32141 +   Local variables:
32142 +   c-indentation-style: "K&R"
32143 +   mode-name: "LC"
32144 +   c-basic-offset: 8
32145 +   tab-width: 8
32146 +   fill-column: 120
32147 +   scroll-step: 1
32148 +   End:
32149 +*/
32150 diff -rupN linux-2.6.8-rc3/fs/reiser4/log.h linux-2.6.8-rc3-a/fs/reiser4/log.h
32151 --- linux-2.6.8-rc3/fs/reiser4/log.h    1970-01-01 03:00:00.000000000 +0300
32152 +++ linux-2.6.8-rc3-a/fs/reiser4/log.h  2004-08-05 21:20:52.931692482 +0400
32153 @@ -0,0 +1,122 @@
32154 +/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by
32155 + * reiser4/README */
32156 +
32157 +/* Tree-tracing facility. Copied from reiserfs v3.x patch, never released. See
32158 + * log.c for comments. */
32159 +
32160 +#if !defined( __REISER4_LOG_H__ )
32161 +#define __REISER4_LOG_H__
32162 +
32163 +#include "forward.h"
32164 +#include "debug.h"
32165 +
32166 +#include <linux/types.h>
32167 +#include <linux/fs.h>          /* for struct super_block, etc  */
32168 +#include <asm/semaphore.h>
32169 +
32170 +/*
32171 + * Where accumulated log records go when the buffer is flushed.
32172 + */
32173 +typedef enum {
32174 +       log_to_file = 0,        /* written to a file */
32175 +       log_to_console = 1,     /* passed to printk */
32176 +       log_to_bucket = 2       /* nowhere: discarded */
32177 +} log_file_type;
32178 +
32179 +#if REISER4_LOG
32180 +
32181 +/*
32182 + * state of one log: its target, its record buffer and the buffer's locking.
32183 + */
32184 +typedef struct {
32185 +       log_file_type type;      /* log target, see log_file_type */
32186 +       struct file *fd;         /* actual file; written with vfs_write() */
32187 +       char *buf;               /* logging buffer where records are
32188 +                                 * accumulated until flushed */
32189 +       size_t size;             /* buffer size; longer records are truncated */
32190 +       size_t used;             /* bytes used in the buffer */
32191 +       spinlock_t lock;         /* spinlock protecting this structure */
32192 +       struct list_head wait;   /* threads waiting for the free space in the
32193 +                                 * buffer */
32194 +       int disabled;            /* if > 0, logging is temporarily disabled */
32195 +       int long_term;           /* if != 0, then ->wait is used for
32196 +                                 * synchronization, otherwise ->lock is */
32197 +} reiser4_log_file;
32198 +
32199 +/*
32200 + * markers for tree operations logged; write_tree_log() prints them with %c.
32201 + */
32202 +typedef enum {
32203 +       tree_cut = 'c',         /* logged with two keys */
32204 +       tree_lookup = 'l',      /* logged with one key */
32205 +       tree_insert = 'i',      /* logged with key, item label, coord, flags */
32206 +       tree_paste = 'p',       /* same record layout as tree_insert */
32207 +       tree_cached = 'C',      /* logged without arguments */
32208 +       tree_exit = 'e'         /* logged without arguments */
32209 +} reiser4_log_op;
32210 +
32211 +extern int open_log_file(struct super_block *super, const char *file_name, size_t size, reiser4_log_file * log);
32212 +extern int write_log(reiser4_log_file * file, const char *format, ...)
32213 +    __attribute__ ((format(printf, 2, 3)));   /* printf-style: format is arg 2, varargs from arg 3 */
32214 +
32215 +extern int write_log_raw(reiser4_log_file * file, const void *data, size_t len);
32216 +extern int hold_log(reiser4_log_file * file, int flag);
32217 +extern int disable_log(reiser4_log_file * file, int flag);
32218 +extern void close_log_file(reiser4_log_file * file);
32219 +/* log a syscall record: the caller's function name followed by @format */
32220 +#define write_syscall_log(format, ...) \
32221 +       write_current_logf(WRITE_SYSCALL_LOG, "%s "format, __FUNCTION__ , ## __VA_ARGS__)
32222 +extern void write_node_log(const jnode *node);
32223 +struct address_space;  /* forward declaration for write_page_log() */
32224 +extern void write_page_log(const struct address_space *mapping,
32225 +                            unsigned long index);
32226 +extern void write_io_log(const char *moniker, int rw, struct bio *bio);
32227 +extern void write_tree_log(reiser4_tree * tree, reiser4_log_op op, ...);
32228 +/* formats a short description of @j into @buf and returns @buf */
32229 +extern char *jnode_short_info(const jnode *j, char *buf);
32230 +
32231 +
32232 +#else /* NO LOG */
32233 +
32234 +typedef struct {
32235 +} reiser4_log_file;
32236 +/* logging compiled out: the calls below yield 0 or expand to noop */
32237 +#define open_log_file(super, file_name, size, log) (0)
32238 +#define write_log(file, format, ...) (0)
32239 +#define write_log_raw(file, data, len) (0)
32240 +#define hold_log(file, flag) (0)
32241 +#define disable_log(file, flag) (0)
32242 +#define close_log_file(file) noop
32243 +/* per-record helpers; jnode_short_info() still yields @buf, parenthesized */
32244 +#define write_syscall_log(format, ...) noop
32245 +#define write_tree_log(tree, op, ...) noop
32246 +#define write_node_log(node) noop
32247 +#define write_page_log(mapping, index) noop
32248 +#define jnode_short_info(j, buf) (buf)
32249 +
32250 +#endif
32251 +
32252 +#define write_current_logf(log_flag, format, ...)                              \
32253 +({     /* each record is prefixed with current->comm, s_id, __FUNCTION__ */    \
32254 +       struct super_block *super;                                              \
32255 +       /* NOTE(review): `super' is visible to __VA_ARGS__ -- confirm no clash */ \
32256 +       super = reiser4_get_current_sb();                                       \
32257 +       IF_LOG(log_flag, write_log(&get_super_private(super)->log_file,         \
32258 +                                   "%s %s %s " format "\n",                    \
32259 +                                  current->comm,                               \
32260 +                                  super->s_id, __FUNCTION__ , ## __VA_ARGS__));        \
32261 +})
32262 +
32263 +/* __REISER4_LOG_H__ */
32264 +#endif
32265 +
32266 +/* Make Linus happy.
32267 +   Local variables:
32268 +   c-indentation-style: "K&R"
32269 +   mode-name: "LC"
32270 +   c-basic-offset: 8
32271 +   tab-width: 8
32272 +   fill-column: 120
32273 +   scroll-step: 1
32274 +   End:
32275 +*/
32276 diff -rupN linux-2.6.8-rc3/fs/reiser4/oid.c linux-2.6.8-rc3-a/fs/reiser4/oid.c
32277 --- linux-2.6.8-rc3/fs/reiser4/oid.c    1970-01-01 03:00:00.000000000 +0300
32278 +++ linux-2.6.8-rc3-a/fs/reiser4/oid.c  2004-08-05 21:20:53.079661272 +0400
32279 @@ -0,0 +1,166 @@
32280 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
32281 +
32282 +#include "debug.h"
32283 +#include "super.h"
32284 +#include "txnmgr.h"
32285 +
32286 +/* we used to have oid allocation plugin. It was removed because it
32287 +   was recognized as providing unneeded level of abstraction. If one
32288 +   ever will find it useful - look at yet_unneeded_abstractions/oid
32289 +*/
32290 +
32291 +/*
32292 + * Seed the in-memory oid allocator of @super: @next becomes the next oid to
32293 + * hand out and @nr_files the number of oids in use. Always returns 0. Both
32294 + * values are supplied by the disk format plugin, read from disk during mount.
32295 + */
32296 +reiser4_internal int
32297 +oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
32298 +{
32299 +       reiser4_super_info_data *info = get_super_private(super);
32300 +
32301 +       /* stores are done without taking the super block spin lock */
32302 +       info->oids_in_use = nr_files;
32303 +       info->next_to_use = next;
32304 +
32305 +       return 0;
32306 +}
32307 +
32308 +/*
32309 + * hand out the next oid of @super under the super block spin lock. Returns
32310 + * ABSOLUTE_MAX_OID, leaving the counters untouched, when oids are exhausted.
32311 + */
32312 +reiser4_internal oid_t
32313 +oid_allocate(struct super_block *super)
32314 +{
32315 +       reiser4_super_info_data *info = get_super_private(super);
32316 +       oid_t result;
32317 +
32318 +       reiser4_spin_lock_sb(info);
32319 +       result = info->next_to_use;
32320 +       if (result != ABSOLUTE_MAX_OID) {
32321 +               /* consume the oid and account for it */
32322 +               info->next_to_use = result + 1;
32323 +               info->oids_in_use += 1;
32324 +       }
32325 +       reiser4_spin_unlock_sb(info);
32326 +
32327 +       return result;
32328 +}
32329 +
32330 +/*
32331 + * Account for one released oid: only the in-use counter of @super drops,
32332 + * @oid itself is not used. Always returns 0.
32333 + */
32334 +reiser4_internal int
32335 +oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
32336 +{
32337 +       reiser4_super_info_data *info = get_super_private(super);
32338 +
32339 +       reiser4_spin_lock_sb(info);
32340 +       info->oids_in_use -= 1;
32341 +       reiser4_spin_unlock_sb(info);
32342 +
32343 +       return 0;
32344 +}
32345 +
32346 +/*
32347 + * peek at the oid that the next oid_allocate() call would return, without
32348 + * allocating it. This is used by disk format plugin to save oid allocator
32349 + * state on the disk.
32350 + */
32351 +reiser4_internal oid_t oid_next(const struct super_block *super)
32352 +{
32353 +       reiser4_super_info_data *info = get_super_private(super);
32354 +       oid_t next;
32355 +
32356 +       /* snapshot taken under the super block spin lock */
32357 +       reiser4_spin_lock_sb(info);
32358 +       next = info->next_to_use;
32359 +       reiser4_spin_unlock_sb(info);
32360 +
32361 +       return next;
32362 +}
32363 +
32364 +/*
32365 + * returns number of currently used oids, or -1 if the count does not fit in
32366 + * long. This is used by statfs(2) to report number of "inodes" and by disk
32367 + * format plugin to save oid allocator state on the disk.
32368 + */
32369 +reiser4_internal long oids_used(const struct super_block *super)
32370 +{
32371 +       reiser4_super_info_data *sbinfo;
32372 +       oid_t used;
32373 +
32374 +       sbinfo = get_super_private(super);
32375 +
32376 +       reiser4_spin_lock_sb(sbinfo);
32377 +       used = sbinfo->oids_in_use;
32378 +       reiser4_spin_unlock_sb(sbinfo);
32379 +       /* bound is the largest long of this arch, so 32-bit never truncates */
32380 +       if (used < (__u64) (~0ul >> 1))
32381 +               return (long) used;
32382 +       return (long) -1;
32383 +}
32384 +
32385 +/*
32386 + * return number of "free" oids, or -1 if it does not fit in long. This is
32387 + * used by statfs(2) to report "free" inodes.
32388 + */
32389 +reiser4_internal long oids_free(const struct super_block *super)
32390 +{
32391 +       reiser4_super_info_data *sbinfo;
32392 +       oid_t oids;
32393 +
32394 +       sbinfo = get_super_private(super);
32395 +
32396 +       reiser4_spin_lock_sb(sbinfo);
32397 +       oids = ABSOLUTE_MAX_OID - OIDS_RESERVED - sbinfo->next_to_use;
32398 +       reiser4_spin_unlock_sb(sbinfo);
32399 +       /* bound is the largest long of this arch, so 32-bit never truncates */
32400 +       if (oids < (__u64) (~0ul >> 1))
32401 +               return (long) oids;
32402 +       return (long) -1;
32403 +}
32404 +
32405 +/*
32406 + * Record one created object in the current atom. This is done after call to
32407 + * oid_allocate() at the point when we are irrevocably committed to creation
32408 + * of the new file (i.e., when oid allocation cannot be any longer rolled back
32409 + * due to some error).
32410 + */
32411 +reiser4_internal void
32412 +oid_count_allocated(void)
32413 +{
32414 +       txn_atom *cur = get_current_atom_locked();
32415 +
32416 +       /* atom comes back locked; bump the counter and release it */
32417 +       cur->nr_objects_created += 1;
32418 +       UNLOCK_ATOM(cur);
32419 +}
32420 +
32421 +/*
32422 + * Record one deleted object in the current atom. This is done after call to
32423 + * oid_release() at the point when we are irrevocably committed to the deletion
32424 + * of the file (i.e., when oid release cannot be rolled back due to some error).
32425 + */
32426 +reiser4_internal void
32427 +oid_count_released(void)
32428 +{
32429 +       txn_atom *cur = get_current_atom_locked();
32430 +
32431 +       /* atom comes back locked; bump the counter and release it */
32432 +       cur->nr_objects_deleted += 1;
32433 +       UNLOCK_ATOM(cur);
32434 +}
32435 +
32436 +/*
32437 +   Local variables:
32438 +   c-indentation-style: "K&R"
32439 +   mode-name: "LC"
32440 +   c-basic-offset: 8
32441 +   tab-width: 8
32442 +   fill-column: 120
32443 +   scroll-step: 1
32444 +   End:
32445 +*/
32446 diff -rupN linux-2.6.8-rc3/fs/reiser4/page_cache.c linux-2.6.8-rc3-a/fs/reiser4/page_cache.c
32447 --- linux-2.6.8-rc3/fs/reiser4/page_cache.c     1970-01-01 03:00:00.000000000 +0300
32448 +++ linux-2.6.8-rc3-a/fs/reiser4/page_cache.c   2004-08-05 21:20:53.320610450 +0400
32449 @@ -0,0 +1,980 @@
32450 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
32451 + * reiser4/README */
32452 +
32453 +/* Memory pressure hooks. Fake inodes handling. */
32454 +/* We store all file system meta data (and data, of course) in the page cache.
32455 +
32456 +   What does this mean? Instead of using bread/brelse we create a special
32457 +   "fake" inode (one per super block) and store content of formatted nodes
32458 +   into pages bound to this inode in the page cache. In newer kernels bread()
32459 +   already uses inode attached to block device (bd_inode). Advantage of having
32460 +   our own fake inode is that we can install appropriate methods in its
32461 +   address_space operations. Such methods are called by VM on memory pressure
32462 +   (or during background page flushing) and we can use them to react
32463 +   appropriately.
32464 +
32465 +   In initial version we only support one block per page. Support for multiple
32466 +   blocks per page is complicated by relocation.
32467 +
32468 +   To each page, used by reiser4, jnode is attached. jnode is analogous to
32469 +   buffer head. Difference is that jnode is bound to the page permanently:
32470 +   jnode cannot be removed from memory until its backing page is.
32471 +
32472 +   jnode contain pointer to page (->pg field) and page contain pointer to
32473 +   jnode in ->private field. Pointer from jnode to page is protected by
32474 +   jnode's spinlock and pointer from page to jnode is protected by page lock
32475 +   (PG_locked bit). Lock ordering is: first take page lock, then jnode spin
32476 +   lock. To go into reverse direction use jnode_lock_page() function that uses
32477 +   standard try-lock-and-release device.
32478 +
32479 +   Properties:
32480 +
32481 +   1. when jnode-to-page mapping is established (by jnode_attach_page()), page
32482 +   reference counter is increased.
32483 +
32484 +   2. when jnode-to-page mapping is destroyed (by jnode_detach_page() and
32485 +   page_detach_jnode()), page reference counter is decreased.
32486 +
32487 +   3. on jload() reference counter on jnode page is increased, page is
32488 +   kmapped and `referenced'.
32489 +
32490 +   4. on jrelse() inverse operations are performed.
32491 +
32492 +   5. kmapping/kunmapping of unformatted pages is done by read/write methods.
32493 +
32494 +
32495 +   DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
32496 +   historically.]
32497 +
32498 +   [In the following discussion, `lock' invariably means long term lock on
32499 +   znode.] (What about page locks?)
32500 +
32501 +   There is some special class of deadlock possibilities related to memory
32502 +   pressure. Locks acquired by other reiser4 threads are accounted for in
32503 +   deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
32504 +   invoked additional hidden arc is added to the locking graph: thread that
32505 +   tries to allocate memory waits for ->vm_writeback() to finish. If this
32506 +   thread keeps lock and ->vm_writeback() tries to acquire this lock, deadlock
32507 +   prevention is useless.
32508 +
32509 +   Another related problem is possibility for ->vm_writeback() to run out of
32510 +   memory itself. This is not a problem for ext2 and friends, because their
32511 +   ->vm_writeback() don't allocate much memory, but reiser4 flush is
32512 +   definitely able to allocate huge amounts of memory.
32513 +
32514 +   It seems that there is no reliable way to cope with the problems above.
32515 +   Instead it was decided that ->vm_writeback() (as invoked in the kswapd
32516 +   context) wouldn't perform any flushing itself, but rather should just wake
32517 +   up some auxiliary thread dedicated for this purpose (or, the same thread
32518 +   that does periodic commit of old atoms (ktxnmgrd.c)).
32519 +
32520 +   Details:
32521 +
32522 +   1. Page is called `reclaimable' against particular reiser4 mount F if this
32523 +   page can be ultimately released by try_to_free_pages() under presumptions
32524 +   that:
32525 +
32526 +    a. ->vm_writeback() for F is no-op, and
32527 +
32528 +    b. none of the threads accessing F are making any progress, and
32529 +
32530 +    c. other reiser4 mounts obey the same memory reservation protocol as F
32531 +    (described below).
32532 +
32533 +   For example, clean un-pinned page, or page occupied by ext2 data are
32534 +   reclaimable against any reiser4 mount.
32535 +
32536 +   When there is more than one reiser4 mount in a system, condition (c) makes
32537 +   reclaim-ability not easily verifiable beyond trivial cases mentioned above.
32538 +
32539 +
32540 +
32541 +
32542 +
32543 +
32544 +
32545 +
32546 +   THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
32547 +
32548 +   Fake inode is used to bind formatted nodes and each node is indexed within
32549 +   fake inode by its block number. If block size is smaller than page size, it
32550 +   may so happen that block mapped to the page with formatted node is occupied
32551 +   by unformatted node or is unallocated. This leads to some complications,
32552 +   because flushing whole page can lead to an incorrect overwrite of
32553 +   unformatted node that, moreover, can be cached in some other place as
32554 +   part of the file body. To avoid this, buffers for unformatted nodes are
32555 +   never marked dirty. Also pages in the fake inode are never marked dirty. This
32556 +   rules out usage of ->writepage() as memory pressure hook. Instead
32557 +   ->releasepage() is used.
32558 +
32559 +   Josh is concerned that page->buffer is going to die. This should not pose
32560 +   significant problem though, because we need to add some data structures to
32561 +   the page anyway (jnode) and all necessary book keeping can be put there.
32562 +
32563 +*/
32564 +
32565 +/* Life cycle of pages/nodes.
32566 +
32567 +   jnode contains reference to page and page contains reference back to
32568 +   jnode. This reference is counted in page ->count. Thus, page bound to jnode
32569 +   cannot be released back into free pool.
32570 +
32571 +    1. Formatted nodes.
32572 +
32573 +      1. formatted node is represented by znode. When new znode is created its
32574 +      ->pg pointer is NULL initially.
32575 +
32576 +      2. when node content is loaded into znode (by call to zload()) for the
32577 +      first time following happens (in call to ->read_node() or
32578 +      ->allocate_node()):
32579 +
32580 +        1. new page is added to the page cache.
32581 +
32582 +        2. this page is attached to znode and its ->count is increased.
32583 +
32584 +        3. page is kmapped.
32585 +
32586 +      3. if more calls to zload() follow (without corresponding zrelses), page
32587 +      counter is left intact and in its stead ->d_count is increased in znode.
32588 +
32589 +      4. each call to zrelse decreases ->d_count. When ->d_count drops to zero
32590 +      ->release_node() is called and page is kunmapped as result.
32591 +
32592 +      5. at some moment node can be captured by a transaction. Its ->x_count
32593 +      is then increased by transaction manager.
32594 +
32595 +      6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE
32596 +      bit set) following will happen (also see comment at the top of znode.c):
32597 +
32598 +        1. when last lock is released, node will be uncaptured from
32599 +        transaction. This releases the reference that transaction manager
32600 +        acquired at step 5.
32601 +
32602 +        2. when last reference is released, zput() detects that node is
32603 +        actually deleted and calls ->delete_node()
32604 +        operation. page_cache_delete_node() implementation detaches jnode from
32605 +        page and releases page.
32606 +
32607 +      7. otherwise (node wasn't removed from the tree), last reference to
32608 +      znode will be released after transaction manager committed transaction
32609 +      node was in. This implies squallocing of this node (see
32610 +      flush.c). Nothing special happens at this point. Znode is still in the
32611 +      hash table and page is still attached to it.
32612 +
32613 +      8. znode is actually removed from the memory because of the memory
32614 +      pressure, or during umount (znodes_tree_done()). Anyway, znode is
32615 +      removed by the call to zdrop(). At this moment, page is detached from
32616 +      znode and removed from the inode address space.
32617 +
32618 +*/
32619 +
32620 +#include "debug.h"
32621 +#include "dformat.h"
32622 +#include "key.h"
32623 +#include "txnmgr.h"
32624 +#include "jnode.h"
32625 +#include "znode.h"
32626 +#include "block_alloc.h"
32627 +#include "tree.h"
32628 +#include "vfs_ops.h"
32629 +#include "inode.h"
32630 +#include "super.h"
32631 +#include "entd.h"
32632 +#include "page_cache.h"
32633 +#include "ktxnmgrd.h"
32634 +
32635 +#include <linux/types.h>
32636 +#include <linux/fs.h>
32637 +#include <linux/mm.h>          /* for struct page */
32638 +#include <linux/swap.h>                /* for struct page */
32639 +#include <linux/pagemap.h>
32640 +#include <linux/bio.h>
32641 +#include <linux/writeback.h>
32642 +#include <linux/blkdev.h>
32643 +
32644 +static struct bio *page_bio(struct page *page, jnode * node, int rw, int gfp);
32645 +/* address space operations installed on each fake inode by init_fake_inode() */
32646 +static struct address_space_operations formatted_fake_as_ops;
32647 +/* oids of the three fake inodes set up by init_formatted_fake() */
32648 +static const oid_t fake_ino = 0x1;      /* stored in sinfo->fake */
32649 +static const oid_t bitmap_ino = 0x2;    /* stored in sinfo->bitmap */
32650 +static const oid_t cc_ino = 0x3;        /* stored in sinfo->cc */
32651 +
32652 +/* one-time initialization of fake inodes handling; nothing to do, returns 0 */
32653 +reiser4_internal int
32654 +init_fakes(void)
32655 +{
32656 +       return 0;
32657 +}
32658 +
32659 +static void
32660 +init_fake_inode(struct super_block *super, struct inode *fake, struct inode **pfake)
32661 +{
32662 +       assert("nikita-2168", fake->i_state & I_NEW);
32663 +       /* bind the new inode to the block device and block size of @super */
32664 +       fake->i_bdev = super->s_bdev;
32665 +       fake->i_rdev = fake->i_bdev->bd_dev;
32666 +       fake->i_blkbits = super->s_blocksize_bits;
32667 +       fake->i_size = ~0ull;
32668 +       fake->i_mapping->a_ops = &formatted_fake_as_ops;
32669 +       *pfake = fake;  /* NOTE-NIKITA something else? */
32670 +       unlock_new_inode(fake);
32671 +}
32672 +
32673 +/* set up the fake, bitmap and cc inodes of @super; 0 on success, else -ENOMEM */
32674 +reiser4_internal int
32675 +init_formatted_fake(struct super_block *super)
32676 +{
32677 +       struct inode *inode;
32678 +       reiser4_super_info_data *sinfo;
32679 +
32680 +       assert("nikita-1703", super != NULL);
32681 +
32682 +       sinfo = get_super_private_nocheck(super);
32683 +
32684 +       inode = iget_locked(super, oid_to_ino(fake_ino));
32685 +       if (inode == NULL)
32686 +               goto nomem;
32687 +       init_fake_inode(super, inode, &sinfo->fake);
32688 +
32689 +       inode = iget_locked(super, oid_to_ino(bitmap_ino));
32690 +       if (inode == NULL) {
32691 +               iput(sinfo->fake);
32692 +               sinfo->fake = NULL;
32693 +               goto nomem;
32694 +       }
32695 +       init_fake_inode(super, inode, &sinfo->bitmap);
32696 +
32697 +       inode = iget_locked(super, oid_to_ino(cc_ino));
32698 +       if (inode == NULL) {
32699 +               iput(sinfo->fake);
32700 +               iput(sinfo->bitmap);
32701 +               sinfo->fake = NULL;
32702 +               sinfo->bitmap = NULL;
32703 +               goto nomem;
32704 +       }
32705 +       init_fake_inode(super, inode, &sinfo->cc);
32706 +       return 0;
32707 +
32708 + nomem:
32709 +       return RETERR(-ENOMEM);
32710 +}
32711 +
32712 +/* release the fake, bitmap and cc inodes of @super, if set; always returns 0 */
32713 +reiser4_internal int
32714 +done_formatted_fake(struct super_block *super)
32715 +{
32716 +       reiser4_super_info_data *sinfo = get_super_private_nocheck(super);
32717 +
32718 +       /* the fake inode is asserted to hold no pages when it is dropped */
32719 +       if (sinfo->fake) {
32720 +               assert("vs-1426", sinfo->fake->i_data.nrpages == 0);
32721 +               iput(sinfo->fake);
32722 +               sinfo->fake = NULL;
32723 +       }
32724 +
32725 +       if (sinfo->bitmap) {
32726 +               iput(sinfo->bitmap);
32727 +               sinfo->bitmap = NULL;
32728 +       }
32729 +
32730 +       if (sinfo->cc) {
32731 +               iput(sinfo->cc);
32732 +               sinfo->cc = NULL;
32733 +       }
32734 +
32735 +       return 0;
32736 +}
32737 +
32738 +#if REISER4_LOG
32739 +int reiser4_submit_bio_helper(const char *moniker, int rw, struct bio *bio)
32740 +{      /* logging variant of submit_bio(): @moniker tags the log record */
32741 +       write_io_log(moniker, rw, bio); /* log first, while @bio is still ours */
32742 +       submit_bio(rw, bio);
32743 +       return 0;       /* always 0; submit_bio() result is not consulted */
32744 +}
32745 +#endif
32746 +
32747 +reiser4_internal void reiser4_wait_page_writeback (struct page * page)
32748 +{
32749 +       assert ("zam-783", PageLocked(page));
32750 + again:
32751 +       unlock_page(page);      /* page lock is not held across the wait */
32752 +       wait_on_page_writeback(page);
32753 +       lock_page(page);
32754 +       if (PageWriteback(page))        /* re-check with the page locked again */
32755 +               goto again;
32756 +}
32757 +
32758 +/* return the tree of the super block owning the host inode of @page's mapping */
32759 +reiser4_internal reiser4_tree *
32760 +tree_by_page(const struct page *page /* page to query */ )
32761 +{
32762 +       assert("nikita-2461", page != NULL);
32763 +       return &get_super_private(page->mapping->host->i_sb)->tree;     /* links are not NULL-checked */
32764 +}
32765 +
32766 +#if REISER4_DEBUG_MEMCPY
32767 +
32768 +/* Our own versions of memcpy, memmove, and memset used to profile shifts of
32769 +   tree node content. Coded to avoid inlining. */
32770 +/* table of the three operations; xmemcpy/xmemmove/xmemset call through it */
32771 +struct mem_ops_table {
32772 +       void *(*cpy) (void *dest, const void *src, size_t n);   /* memcpy() signature */
32773 +       void *(*move) (void *dest, const void *src, size_t n);  /* memmove() signature */
32774 +       void *(*set) (void *s, int c, size_t n);                /* memset() signature */
32775 +};
32776 +
32777 +void * /* plain forwarder to memcpy(); installed as std_mem_ops.cpy */
32778 +xxmemcpy(void *dest, const void *src, size_t n)
32779 +{
32780 +       return memcpy(dest, src, n);
32781 +}
32782 +
32783 +void *
32784 +xxmemmove(void *dest, const void *src, size_t n)
32785 +{
32786 +       return memmove(dest, src, n);
32787 +}
32788 +
32789 +void *
32790 +xxmemset(void *s, int c, size_t n)
32791 +{
32792 +       return memset(s, c, n);
32793 +}
32794 +
32795 +struct mem_ops_table std_mem_ops = {
32796 +       .cpy = xxmemcpy,
32797 +       .move = xxmemmove,
32798 +       .set = xxmemset
32799 +};
32800 +
32801 +struct mem_ops_table *mem_ops = &std_mem_ops;
32802 +
32803 +void *
32804 +xmemcpy(void *dest, const void *src, size_t n)
32805 +{
32806 +       return mem_ops->cpy(dest, src, n);
32807 +}
32808 +
32809 +void *
32810 +xmemmove(void *dest, const void *src, size_t n)
32811 +{
32812 +       return mem_ops->move(dest, src, n);
32813 +}
32814 +
32815 +void *
32816 +xmemset(void *s, int c, size_t n)
32817 +{
32818 +       return mem_ops->set(s, c, n);
32819 +}
32820 +
32821 +#endif
32822 +
32823 +/* completion handler for single page bio-based read.
32824 +   Returns 1 (after a warning, touching nothing else) while bio->bi_size is not
32825 +   zero; otherwise updates the page state, unlocks the page, drops the bio, returns 0.
32826 +   mpage_end_io_read() would also do. But it's static.
32827 +*/
32828 +static int
32829 +end_bio_single_page_read(struct bio *bio, unsigned int bytes_done UNUSED_ARG, int err UNUSED_ARG)
32830 +{
32831 +       struct page *page;
32832 +
32833 +       if (bio->bi_size != 0) {
32834 +               warning("nikita-3332", "Truncated single page read: %u",
32835 +                       bio->bi_size);  /* unsigned byte count, hence %u */
32836 +               return 1;
32837 +       }
32838 +
32839 +       page = bio->bi_io_vec[0].bv_page;       /* the bio carries exactly one page */
32840 +
32841 +       if (test_bit(BIO_UPTODATE, &bio->bi_flags))
32842 +               SetPageUptodate(page);
32843 +       else {
32844 +               ClearPageUptodate(page);
32845 +               SetPageError(page);
32846 +       }
32847 +       unlock_page(page);      /* page_io() leaves the page locked for reads */
32848 +       bio_put(bio);
32849 +       return 0;
32850 +}
32851 +
32852 +/* completion handler for single page bio-based write.
32853 +   Returns 1 (after a warning, touching nothing else) while bio->bi_size is not
32854 +   zero; otherwise ends writeback on the page, drops the bio and returns 0.
32855 +   mpage_end_io_write() would also do. But it's static.
32856 +*/
32857 +static int
32858 +end_bio_single_page_write(struct bio *bio, unsigned int bytes_done UNUSED_ARG, int err UNUSED_ARG)
32859 +{
32860 +       struct page *page;
32861 +
32862 +       if (bio->bi_size != 0) {
32863 +               warning("nikita-3333", "Truncated single page write: %u",
32864 +                       bio->bi_size);  /* unsigned byte count, hence %u */
32865 +               return 1;
32866 +       }
32867 +
32868 +       page = bio->bi_io_vec[0].bv_page;       /* the bio carries exactly one page */
32869 +
32870 +       if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
32871 +               SetPageError(page);
32872 +       end_page_writeback(page);       /* writeback was started by page_io() */
32873 +       bio_put(bio);
32874 +       return 0;
32875 +}
32876 +
32877 +/* ->readpage() method for formatted nodes: reads @page via page_io() using the jnode attached to the page */
32878 +static int
32879 +formatted_readpage(struct file *f UNUSED_ARG, struct page *page /* page to read */ )
32880 +{
32881 +       assert("nikita-2412", PagePrivate(page) && jprivate(page));
32882 +       return page_io(page, jprivate(page), READ, GFP_KERNEL); /* result of page_io() is passed up */
32883 +}
32884 +
32885 +/* submit single-page bio request for @page, which the caller has locked.
32886 +   Returns 0, or the error page_bio() failed with. */
32887 +reiser4_internal int
32888 +page_io(struct page *page /* page to perform io for */ ,
32889 +       jnode * node /* jnode of page */ ,
32890 +       int rw /* read or write */ , int gfp /* GFP mask */ )
32891 +{
32892 +       struct bio *request;
32893 +
32894 +       assert("nikita-2094", page != NULL);
32895 +       assert("nikita-2226", PageLocked(page));
32896 +       assert("nikita-2634", node != NULL);
32897 +       assert("nikita-2893", rw == READ || rw == WRITE);
32898 +
32899 +       /* nothing is written to a read-only super block: release the page and report success */
32900 +       if (rw && unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) {
32901 +               unlock_page(page);
32902 +               return 0;
32903 +       }
32904 +
32905 +       request = page_bio(page, node, rw, gfp);
32906 +       if (IS_ERR(request)) {
32907 +               /* no bio, no io: the page is unlocked right here */
32908 +               unlock_page(page);
32909 +               return PTR_ERR(request);
32910 +       }
32911 +
32912 +       if (rw == WRITE) {
32913 +               /* mark the page as being under writeback, then release its lock */
32914 +               SetPageWriteback(page);
32915 +               unlock_page(page);
32916 +       }
32917 +       /* for reads the page stays locked until the bio completion handler unlocks it */
32918 +       reiser4_submit_bio(rw, request);
32919 +       return 0;
32920 +}
32921 +
32922 +/* helper function to construct bio for page: returns a one-page bio set up for
32923 +   @rw io on the block of @node, or ERR_PTR() of -ENOMEM / -EINVAL on failure */
32924 +static struct bio *page_bio(struct page *page, jnode * node, int rw, int gfp)
32925 +{
32926 +       struct bio *bio;
32927 +       assert("nikita-2092", page != NULL);
32928 +       assert("nikita-2633", node != NULL);
32929 +
32930 +       /* Simple implementation in the assumption that blocksize == pagesize.
32931 +
32932 +          We only have to submit one block, but submit_bh() will allocate bio
32933 +          anyway, so lets use all the bells-and-whistles of bio code.
32934 +       */
32935 +
32936 +       bio = bio_alloc(gfp, 1);
32937 +       if (bio != NULL) {
32938 +               int blksz;
32939 +               struct super_block *super;
32940 +               reiser4_block_nr blocknr;
32941 +
32942 +               super = page->mapping->host->i_sb;
32943 +               assert("nikita-2029", super != NULL);
32944 +               blksz = super->s_blocksize;
32945 +               assert("nikita-2028", blksz == (int) PAGE_CACHE_SIZE);
32946 +
32947 +               blocknr = *UNDER_SPIN(jnode, node, jnode_get_io_block(node));
32948 +
32949 +               assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
32950 +               assert("nikita-2276", !blocknr_is_fake(&blocknr));
32951 +
32952 +               bio->bi_bdev = super->s_bdev;
32953 +               /* fill bio->bi_sector before calling bio_add_page(), because
32954 +                * q->merge_bvec_fn may want to inspect it (see
32955 +                * drivers/md/linear.c:linear_mergeable_bvec() for example). */
32956 +               bio->bi_sector = blocknr * (blksz >> 9);
32957 +
32958 +               if (!bio_add_page(bio, page, blksz, 0)) {
32959 +                       warning("nikita-3452", "Single page bio cannot be constructed");
32960 +                       bio_put(bio);   /* never submitted: drop our reference or the bio leaks */
32961 +                       return ERR_PTR(RETERR(-EINVAL));
32962 +               }
32963 +
32964 +               /* bio -> bi_idx is filled by bio_init() */
32965 +               bio->bi_end_io = (rw == READ) ?
32966 +                       end_bio_single_page_read : end_bio_single_page_write;
32967 +
32968 +               return bio;
32969 +       } else
32970 +               return ERR_PTR(RETERR(-ENOMEM));
32971 +}
32972 +
32973 +
32974 +/* this function is internally called by jnode_make_dirty(). If @page was not
32975 +   dirty yet, tags it DIRTY in the page tree of its mapping and sets or clears
32976 +   the REISER4_MOVED tag according to @tag_as_moved. Always returns 0. */
32977 +int set_page_dirty_internal (struct page * page, int tag_as_moved)
32978 +{
32979 +       if (REISER4_STATS && !PageDirty(page))
32980 +               reiser4_stat_inc(pages_dirty);
32981 +
32982 +       /* the below resembles __set_page_dirty_nobuffers except that it also updates REISER4_MOVED page tag */
32983 +       if (!TestSetPageDirty(page)) {
32984 +               struct address_space *mapping = page->mapping;
32985 +
32986 +               if (mapping) {
32987 +                       /* page tree tags are modified below, so tree_lock is taken for writing */
32988 +                       write_lock_irq(&mapping->tree_lock);
32989 +                       if (page->mapping) {    /* Race with truncate? */
32990 +                               BUG_ON(page->mapping != mapping);
32991 +                               if (!mapping->backing_dev_info->memory_backed)
32992 +                                       inc_page_state(nr_dirty);
32993 +                               radix_tree_tag_set(&mapping->page_tree, page->index, PAGECACHE_TAG_DIRTY);
32994 +                               if (tag_as_moved)
32995 +                                       radix_tree_tag_set(&mapping->page_tree, page->index,
32996 +                                               PAGECACHE_TAG_REISER4_MOVED);
32997 +                               else
32998 +                                       radix_tree_tag_clear(&mapping->page_tree, page->index,
32999 +                                               PAGECACHE_TAG_REISER4_MOVED);
33000 +                       }
33001 +                       write_unlock_irq(&mapping->tree_lock);
33002 +                       __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
33003 +               }
33004 +       }
33005 +       return 0;
33006 +}
33007 +
33008 +reiser4_internal void capture_reiser4_inodes (
33009 +       struct super_block * sb, struct writeback_control * wbc)
33010 +{       /* NOTE(review): inode_lock is dropped and re-taken below, so callers presumably hold it - confirm */
33011 +       const unsigned long start = jiffies;
33012 +       long captured = 0;
33013 +       /* walk sb->s_io (refilled from sb->s_dirty when empty) and call ->capture() of each inode's file plugin */
33014 +       if (list_empty(&sb->s_io))
33015 +               list_splice_init(&sb->s_dirty, &sb->s_io);
33016 +
33017 +       while (!list_empty(&sb->s_io)) {
33018 +               struct inode *inode = list_entry(
33019 +                       sb->s_io.prev, struct inode, i_list);
33020 +               /* the inode goes back to s_dirty before anything else is done with it */
33021 +               list_move(&inode->i_list, &sb->s_dirty);
33022 +               /* inodes dirtied after this call started are left alone */
33023 +               if (time_after(inode->dirtied_when, start))
33024 +                       continue;
33025 +               /* take a reference before letting go of inode_lock */
33026 +               __iget(inode);
33027 +               spin_unlock(&inode_lock);
33028 +
33029 +               {
33030 +                       file_plugin *fplug;
33031 +
33032 +                       fplug = inode_file_plugin(inode);
33033 +                       if (fplug != NULL && fplug->capture != NULL) {
33034 +                               /* call file plugin method to capture anonymous pages and
33035 +                                * anonymous jnodes */
33036 +                               fplug->capture(inode, wbc, &captured);
33037 +                       }
33038 +               }
33039 +
33040 +               spin_lock(&inode_lock);
33041 +               /* set inode state according to what pages it has. */
33042 +               if (!(inode->i_state & I_FREEING)) {
33043 +                       struct address_space * mapping = inode->i_mapping;
33044 +                       unsigned long flags;
33045 +
33046 +                       read_lock_irqsave(&mapping->tree_lock, flags);
33047 +                       if (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_DIRTY) &&
33048 +                           !radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED))
33049 +                       {
33050 +                               inode->i_state &= ~(I_DIRTY);   /* no page tagged DIRTY or REISER4_MOVED is left */
33051 +                       }
33052 +                       read_unlock_irqrestore(&mapping->tree_lock, flags);
33053 +               }
33054 +               spin_unlock(&inode_lock);
33055 +               /* the reference taken above is dropped without inode_lock held */
33056 +               iput(inode);
33057 +
33058 +               spin_lock(&inode_lock);
33059 +               if (wbc->nr_to_write <= 0) {
33060 +                       warning("vs-1689", "does this ever happen? nr_to_write = %ld", wbc->nr_to_write);
33061 +                       break;
33062 +               }
33063 +       }
33064 +}
33065 +
33066 +
33067 +/* Common memory pressure notification; installed as ->writepage() in formatted_fake_as_ops. @page is locked on entry. */
33068 +reiser4_internal int
33069 +reiser4_writepage(struct page *page /* page to start writeback from */,
33070 +                 struct writeback_control *wbc)
33071 +{       /* returns 1 after write_page_by_ent(), else the emergency_flush() result or the jnode_of_page() error */
33072 +       struct super_block *s;
33073 +       reiser4_context ctx;
33074 +       reiser4_tree *tree;
33075 +       txn_atom * atom;
33076 +       jnode *node;
33077 +       int result;
33078 +
33079 +       s = page->mapping->host->i_sb;
33080 +       init_context(&ctx, s);
33081 +
33082 +       reiser4_stat_inc(pcwb.calls);
33083 +
33084 +       assert("vs-828", PageLocked(page));
33085 +
33086 +#if REISER4_USE_ENTD
33087 +
33088 +       /* Throttle memory allocations if we were not in reiser4 */
33089 +       if (ctx.parent == &ctx) {
33090 +               write_page_by_ent(page, wbc);
33091 +               result = 1;
33092 +               goto out;
33093 +       }
33094 +#endif /* REISER4_USE_ENTD */
33095 +       /* NOTE(review): @tree is assigned here but not used anywhere else in this function */
33096 +       tree = &get_super_private(s)->tree;
33097 +       node = jnode_of_page(page);
33098 +       if (!IS_ERR(node)) {
33099 +               int phantom;
33100 +
33101 +               assert("nikita-2419", node != NULL);
33102 +
33103 +               LOCK_JNODE(node);
33104 +               /*
33105 +                * page was dirty, but jnode is not. This is (only?)
33106 +                * possible if page was modified through mmap(). We
33107 +                * want to handle such jnodes specially.
33108 +                */
33109 +               phantom = !jnode_is_dirty(node);
33110 +               atom = jnode_get_atom(node);    /* NOTE(review): a non-NULL atom seems to come back locked (unlocked below) - confirm */
33111 +               if (atom != NULL) {
33112 +                       if (!(atom->flags & ATOM_FORCE_COMMIT)) {
33113 +                               atom->flags |= ATOM_FORCE_COMMIT;
33114 +                               ktxnmgrd_kick(&get_super_private(s)->tmgr);
33115 +                               reiser4_stat_inc(txnmgr.commit_from_writepage);
33116 +                       }
33117 +                       UNLOCK_ATOM(atom);
33118 +               }
33119 +               UNLOCK_JNODE(node);
33120 +
33121 +               result = emergency_flush(page);
33122 +               if (result != 0) {
33123 +                       /*
33124 +                        * cannot flush page right now, or some error
33125 +                        */
33126 +                       reiser4_stat_inc(pcwb.not_written);
33127 +               } else {
33128 +                       /*
33129 +                        * page was successfully flushed
33130 +                        */
33131 +                       reiser4_stat_inc(pcwb.written);
33132 +                       if (phantom && jnode_is_unformatted(node))
33133 +                               JF_SET(node, JNODE_KEEPME);
33134 +               }
33135 +               jput(node);     /* drops the reference obtained through jnode_of_page() */
33136 +       } else {
33137 +               reiser4_stat_inc(pcwb.no_jnode);
33138 +               result = PTR_ERR(node);
33139 +       }
33140 +       if (result != 0) {
33141 +               /*
33142 +                * shrink list doesn't move page to another mapping
33143 +                * list when clearing dirty flag. So it is enough to
33144 +                * just set dirty bit.
33145 +                */
33146 +               set_page_dirty_internal(page, 0);
33147 +               unlock_page(page);
33148 +       }
33149 + out:
33150 +       reiser4_exit_context(&ctx);
33151 +       return result;
33152 +}
33153 +
33154 +/* ->set_page_dirty() method of formatted address_space: delegates to __set_page_dirty_nobuffers() */
33155 +static int
33156 +formatted_set_page_dirty(struct page *page     /* page to mark
33157 +                                                * dirty */ )
33158 +{
33159 +       assert("nikita-2173", page != NULL);
33160 +       return __set_page_dirty_nobuffers(page);        /* its return value is passed up unchanged */
33161 +}
33162 +
33163 +/* address space operations for the fake inode; methods left NULL are not provided */
33164 +static struct address_space_operations formatted_fake_as_ops = {
33165 +       /* Perform a writeback of a single page as a memory-freeing
33166 +        * operation. */
33167 +       .writepage = reiser4_writepage,
33168 +       /* this is called to read formatted node */
33169 +       .readpage = formatted_readpage,
33170 +       /* ->sync_page() method of fake inode address space operations. Called
33171 +          from wait_on_page() and lock_page().
33172 +
33173 +          This is a most annoyingly misnamed method. Actually it is called
33174 +          from wait_on_page_bit() and lock_page() and its purpose is to
33175 +          actually start io by jabbing device drivers.
33176 +       */
33177 +       .sync_page = reiser4_start_up_io,
33178 +       /* Write back some dirty pages from this mapping. Called
33179 +          during sync (pdflush) */
33180 +       .writepages = reiser4_writepages,
33181 +       /* Set a page dirty */
33182 +       .set_page_dirty = formatted_set_page_dirty,
33183 +       /* used for read-ahead. Not applicable */
33184 +       .readpages = NULL,
33185 +       .prepare_write = NULL,
33186 +       .commit_write = NULL,
33187 +       .bmap = NULL,
33188 +       /* called just before page is being detached from inode mapping and
33189 +          removed from memory. Called on truncate, cut/squeeze, and
33190 +          umount. */
33191 +       .invalidatepage = reiser4_invalidatepage,
33192 +       /* this is called by shrink_cache() so that file system can try to
33193 +          release objects (jnodes, buffers, journal heads) attached to page
33194 +          and, maybe, make the page itself free-able.
33195 +       */
33196 +       .releasepage = reiser4_releasepage,
33197 +       .direct_IO = NULL
33198 +};
33199 +
33200 +/* last step before reiser4 lets go of @page (callers: jdelete() and extent2tail()): clears its dirty and
33201 +   up-to-date state, takes it out of the page cache if it is in a mapping, and unlocks it */
33202 +reiser4_internal void
33203 +drop_page(struct page *page)
33204 +{
33205 +       int in_cache;   /* is @page attached to a mapping? */
33206 +       assert("nikita-2181", PageLocked(page));
33207 +       clear_page_dirty(page);
33208 +       ClearPageUptodate(page);
33209 +#if defined(PG_skipped)
33210 +       ClearPageSkipped(page);
33211 +#endif
33212 +       in_cache = (page->mapping != NULL);
33213 +       if (in_cache)
33214 +               remove_from_page_cache(page);
33215 +       unlock_page(page);
33216 +       if (in_cache)   /* page removed from the mapping---decrement page counter */
33217 +               page_cache_release(page);
33218 +}
33219 +
33220 +
33221 +/* this is called by truncate_jnodes_range(), which in its turn is always called
33222 +   after the pages of the range were truncated. Therefore the jnode normally has
33223 +   no page here (a page, if found, is truncated below). New pages can not be created
33224 +   because truncate_jnodes_range() runs under exclusive access to the file, whereas
33225 +   new page creation requires non-exclusive access. */
33226 +static void
33227 +invalidate_unformatted(jnode *node)
33228 +{
33229 +       struct page *page;
33230 +
33231 +       LOCK_JNODE(node);
33232 +       page = node->pg;
33233 +       if (page) {
33234 +               page_cache_get(page);   /* hold the page across the unlocked truncate call */
33235 +               UNLOCK_JNODE(node);
33236 +               truncate_inode_pages_range(page->mapping, page->index, 1);
33237 +               page_cache_release(page);
33238 +       } else {
33239 +               JF_SET(node, JNODE_HEARD_BANSHEE);
33240 +               uncapture_jnode(node);  /* NOTE(review): no UNLOCK_JNODE() on this branch - presumably done by uncapture_jnode(); confirm */
33241 +               unhash_unformatted_jnode(node);
33242 +       }
33243 +}
33244 +
33245 +#define JNODE_GANG_SIZE (16)
33246 +/* JNODE_GANG_SIZE: how many jnodes are looked up per pass of the loop in truncate_jnodes_range() */
33247 +/* find all eflushed jnodes from range specified and invalidate them; returns the number of jnodes invalidated */
33248 +reiser4_internal int
33249 +truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
33250 +{
33251 +       reiser4_inode *info;
33252 +       int truncated_jnodes;
33253 +       reiser4_tree *tree;
33254 +       unsigned long index;
33255 +       unsigned long end;
33256 +
33257 +       truncated_jnodes = 0;
33258 +
33259 +       info = reiser4_inode_data(inode);
33260 +       tree = tree_by_inode(inode);
33261 +
33262 +       index = from;
33263 +       end   = from + count;   /* first index past the range */
33264 +
33265 +       while (1) {
33266 +               jnode *gang[JNODE_GANG_SIZE];
33267 +               int    taken;
33268 +               int    i;
33269 +               jnode *node;
33270 +
33271 +               assert("nikita-3466", index <= end);
33272 +               /* under the tree read lock: look up a batch and reference the jnodes that lie inside the range */
33273 +               RLOCK_TREE(tree);
33274 +               taken = radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info), (void **)gang,
33275 +                                              index, JNODE_GANG_SIZE);
33276 +               for (i = 0; i < taken; ++i) {
33277 +                       node = gang[i];
33278 +                       if (index_jnode(node) < end)
33279 +                               jref(node);
33280 +                       else
33281 +                               gang[i] = NULL; /* outside the range: marks where the second loop stops */
33282 +               }
33283 +               RUNLOCK_TREE(tree);
33284 +               /* tree lock released: invalidate the referenced jnodes and drop the references */
33285 +               for (i = 0; i < taken; ++i) {
33286 +                       node = gang[i];
33287 +                       if (node != NULL) {
33288 +                               index = max(index, index_jnode(node));  /* next lookup starts from here */
33289 +                               invalidate_unformatted(node);
33290 +                               truncated_jnodes ++;
33291 +                               jput(node);
33292 +                       } else
33293 +                               break;
33294 +               }
33295 +               if (i != taken || taken == 0)   /* hit a jnode past the range, or nothing was found */
33296 +                       break;
33297 +       }
33298 +       return truncated_jnodes;
33299 +}
33300 +
33301 +/* get rid of @count pages of @mapping starting at page index @from: the range is unmapped
33302 +   (private COW copies included), then its pages and finally its jnodes are truncated */
33303 +reiser4_internal void
33304 +reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from, unsigned long count)
33305 +{
33306 +       loff_t start_off;       /* byte offset of the first page */
33307 +       loff_t length;          /* length of the range in bytes */
33308 +
33309 +       if (count == 0)
33310 +               return;
33311 +       start_off = ((loff_t)from) << PAGE_CACHE_SHIFT;
33312 +       length = ((loff_t)count) << PAGE_CACHE_SHIFT;
33313 +       unmap_mapping_range(mapping, start_off, length, 1/*even cows*/);
33314 +       truncate_inode_pages_range(mapping, from, count);
33315 +       truncate_jnodes_range(mapping->host, from, count);
33316 +}
33317 +
33318 +
33319 +#if REISER4_DEBUG_OUTPUT
33320 +/* page_flag_name(): "<flag>|" with the first three characters ("PG_") skipped if the bit is set in page->flags, else "" */
33321 +#define page_flag_name( page, flag )                   \
33322 +       ( test_bit( ( flag ), &( page ) -> flags ) ? ((#flag "|")+3) : "" )
33323 +
33324 +reiser4_internal void
33325 +print_page(const char *prefix, struct page *page)
33326 +{       /* debugging output: index, mapping, count, private word and flag names of @page, then its jnode if any */
33327 +       if (page == NULL) {
33328 +               printk("null page\n");  /* note: @prefix is not printed in this case */
33329 +               return;
33330 +       }
33331 +       printk("%s: page index: %lu mapping: %p count: %i private: %lx\n",
33332 +              prefix, page->index, page->mapping, page_count(page), page->private);
33333 +       printk("\tflags: %s%s%s%s %s%s%s %s%s%s%s %s%s%s\n",
33334 +              page_flag_name(page, PG_locked),
33335 +              page_flag_name(page, PG_error),
33336 +              page_flag_name(page, PG_referenced),
33337 +              page_flag_name(page, PG_uptodate),
33338 +              page_flag_name(page, PG_dirty),
33339 +              page_flag_name(page, PG_lru),
33340 +              page_flag_name(page, PG_slab),
33341 +              page_flag_name(page, PG_highmem),
33342 +              page_flag_name(page, PG_checked),
33343 +              page_flag_name(page, PG_arch_1),
33344 +              page_flag_name(page, PG_reserved),
33345 +              page_flag_name(page, PG_private), page_flag_name(page, PG_writeback), page_flag_name(page, PG_nosave));
33346 +       if (jprivate(page) != NULL) {
33347 +               print_jnode("\tpage jnode", jprivate(page));
33348 +               printk("\n");
33349 +       }
33350 +}
33351 +
33352 +reiser4_internal void
33353 +print_page_state(const char *prefix, struct page_state *ps)
33354 +{       /* debugging output: pid of current, @prefix, nr_free_pages() and selected counters of @ps on one line */
33355 +       printk("%i: %s: "
33356 +              "free: %u, "
33357 +              "dirty: %lu, "
33358 +              "writeback: %lu, "
33359 +//            "pagecache: %lu, "
33360 +//          "page_table_pages: %lu, "
33361 +//          "reverse_maps: %lu, "
33362 +              "mapped: %lu, "
33363 +              "slab: %lu, "
33364 +//          "pgpgin: %lu, "
33365 +//          "pgpgout: %lu, "
33366 +//          "pswpin: %lu, "
33367 +//          "pswpout: %lu, "
33368 +//          "pgalloc: %lu, "
33369 +//          "pgfree: %lu, "
33370 +//          "pgactivate: %lu, "
33371 +//          "pgdeactivate: %lu, "
33372 +//          "pgfault: %lu, "
33373 +//          "pgmajfault: %lu, "
33374 +//          "pgscan: %lu, "
33375 +//          "pgrefill: %lu, "
33376 +//          "pgsteal: %lu, "
33377 +              "kswapd_steal: %lu, "
33378 +//          "pageoutrun: %lu, "
33379 +//          "allocstall: %lu
33380 +              "\n", current->pid, prefix,
33381 +
33382 +              nr_free_pages(),
33383 +              ps->nr_dirty,
33384 +              ps->nr_writeback,
33385 +//            ps->nr_pagecache,
33386 +//          ps->nr_page_table_pages,
33387 +//          ps->nr_reverse_maps,
33388 +              ps->nr_mapped,
33389 +              ps->nr_slab,
33390 +//          ps->pgpgin,
33391 +//          ps->pgpgout,
33392 +//          ps->pswpin,
33393 +//          ps->pswpout,
33394 +//          ps->pgalloc,
33395 +//          ps->pgfree,
33396 +//          ps->pgactivate,
33397 +//          ps->pgdeactivate,
33398 +//          ps->pgfault,
33399 +//          ps->pgmajfault,
33400 +//          ps->pgscan,
33401 +//          ps->pgrefill,
33402 +//          ps->pgsteal,
33403 +              ps->kswapd_steal //,
33404 +//          ps->pageoutrun,
33405 +//          ps->allocstall
33406 +               );      /* the arguments above are listed in the order of the enabled format fragments */
33407 +}
33408 +
33409 +reiser4_internal void
33410 +print_page_stats(const char *prefix)
33411 +{       /* debugging output: fetch the full page state and print it with @prefix */
33412 +       struct page_state ps;
33413 +       get_full_page_state(&ps);       /* fills @ps */
33414 +       print_page_state(prefix, &ps);
33415 +}
33416 +
33417 +
33418 +#endif
33419 +
33420 +/* Make Linus happy.
33421 +   Local variables:
33422 +   c-indentation-style: "K&R"
33423 +   mode-name: "LC"
33424 +   c-basic-offset: 8
33425 +   tab-width: 8
33426 +   fill-column: 120
33427 +   scroll-step: 1
33428 +   End:
33429 +*/
33430 diff -rupN linux-2.6.8-rc3/fs/reiser4/page_cache.h linux-2.6.8-rc3-a/fs/reiser4/page_cache.h
33431 --- linux-2.6.8-rc3/fs/reiser4/page_cache.h     1970-01-01 03:00:00.000000000 +0300
33432 +++ linux-2.6.8-rc3-a/fs/reiser4/page_cache.h   2004-08-05 21:20:53.473578186 +0400
33433 @@ -0,0 +1,71 @@
33434 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
33435 + * reiser4/README */
33436 +/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
33437 +
33438 +#if !defined( __REISER4_PAGE_CACHE_H__ )
33439 +#define __REISER4_PAGE_CACHE_H__
33440 +
33441 +#include "forward.h"
33442 +#include "debug.h"
33443 +
33444 +#include <linux/fs.h>          /* for struct super_block, address_space  */
33445 +#include <linux/mm.h>          /* for struct page  */
33446 +#include <linux/pagemap.h>     /* for lock_page()  */
33447 +
33448 +extern int init_fakes(void);
33449 +extern int init_formatted_fake(struct super_block *super);
33450 +extern int done_formatted_fake(struct super_block *super);
33451 +
33452 +extern reiser4_tree *tree_by_page(const struct page *page);
33453 +
33454 +extern int set_page_dirty_internal (struct page * page, int tag_as_moved);
33455 +
33456 +#if REISER4_LOG
33457 +extern char *jnode_short_info(const jnode *j, char *buf);
33458 +extern int reiser4_submit_bio_helper(const char *moniker,
33459 +                                    int rw, struct bio *bio);
33460 +#define reiser4_submit_bio(rw, bio)                            \
33461 +       reiser4_submit_bio_helper(__FUNCTION__, (rw), (bio))
33462 +#else
33463 +#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
33464 +#endif
33465 +
33466 +extern void reiser4_wait_page_writeback (struct page * page);
33467 +static inline void lock_and_wait_page_writeback (struct page * page)
33468 +{       /* lock @page and, if it is under writeback, wait for that to finish; returns with @page locked */
33469 +       lock_page(page);
33470 +       if (unlikely(PageWriteback(page)))      /* writeback in progress is the rare case */
33471 +           reiser4_wait_page_writeback(page);
33472 +}
33473 +
33474 +#define jprivate(page) ((jnode *) (page)->private)
33475 +
33476 +extern int page_io(struct page *page, jnode * node, int rw, int gfp);
33477 +extern int reiser4_writepage(struct page *page, struct writeback_control *wbc);
33478 +extern void drop_page(struct page *page);
33479 +extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from, unsigned long count);
33480 +extern void capture_reiser4_inodes (struct super_block *, struct writeback_control *);
33481 +
33482 +#if REISER4_DEBUG_OUTPUT
33483 +extern void print_page(const char *prefix, struct page *page);
33484 +extern void print_page_state(const char *prefix, struct page_state *ps);
33485 +extern void print_page_stats(const char *prefix);
33486 +#else
33487 +#define print_page(prf, p) noop
33488 +#define print_page_state(prefix, ps) noop
33489 +#define print_page_stats(prefix) noop
33490 +#endif
33491 +
33492 +/* __REISER4_PAGE_CACHE_H__ */
33493 +#endif
33494 +
33495 +/* Make Linus happy.
33496 +   Local variables:
33497 +   c-indentation-style: "K&R"
33498 +   mode-name: "LC"
33499 +   c-basic-offset: 8
33500 +   tab-width: 8
33501 +   fill-column: 120
33502 +   scroll-step: 1
33503 +   End:
33504 +*/
33505 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/compress/compress.c linux-2.6.8-rc3-a/fs/reiser4/plugin/compress/compress.c
33506 --- linux-2.6.8-rc3/fs/reiser4/plugin/compress/compress.c       1970-01-01 03:00:00.000000000 +0300
33507 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/compress/compress.c     2004-08-05 21:20:53.145647354 +0400
33508 @@ -0,0 +1,533 @@
33509 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
33510 +/* reiser4 compression transform plugins */
33511 +
33512 +#include "../../debug.h"
33513 +#include "../plugin.h"
33514 +#include "../cryptcompress.h"
33515 +#include "minilzo.h"
33516 +
33517 +#include <linux/config.h>
33518 +#include <linux/zlib.h>
33519 +#include <linux/vmalloc.h>
33520 +#include <linux/types.h>
33521 +
33522 +static void
33523 +null_compress(void * ctx, __u8 *src_first, unsigned src_len,
33524 +             __u8 *dst_first, unsigned *dst_len)
33525 +{       /* "null" compression: the source is copied to the destination as is */
33526 +       int pass;
33527 +
33528 +       assert("edward-793", ctx == NULL);
33529 +       assert("edward-794", src_first != NULL);
33530 +       assert("edward-795", dst_first != NULL);
33531 +       assert("edward-796", src_len != 0);
33532 +       assert("edward-797", dst_len != NULL);
33533 +       /* the same copy is made NONE_NRCOPY times; the reported output length equals the input length */
33534 +       for (pass = 0; pass < NONE_NRCOPY; ++pass)
33535 +               fast_copy(src_first, dst_first, src_len);
33536 +       *dst_len = src_len;
33537 +}
33538 +
33539 +static void
33540 +null_decompress(void * ctx, __u8 *src_first, unsigned src_len,
33541 +               __u8 *dst_first, unsigned *dst_len)
33542 +{       /* counterpart of null_compress(); not expected to be called - none of the arguments is used */
33543 +       impossible("edward-798", "trying to decompress uncompressed data");
33544 +}
33545 +
33546 +LOCAL void __lzrw1_compress(UBYTE *, ULONG, UBYTE *, ULONG *);
33547 +LOCAL void __lzrw1_decompress(UBYTE *, ULONG, UBYTE *, ULONG *);
33548 +
33549 +/******************************************************************************/
33550 +/*                         Start of LZRW1.C                                   */
33551 +/******************************************************************************/
33552 +/*
33553 +  THE LZRW1 ALGORITHM
33554 +  ===================
33555 +  Author : Ross N. Williams.
33556 +  Date   : 31-Mar-1991.
33557 +
33558 +  1. I typed the following code in from my paper "An Extremely Fast Data
33559 +  Compression Algorithm", Data Compression Conference, Utah, 7-11 April,
33560 +  1991. The  fact that this  code works indicates  that the code  in the
33561 +  paper is OK.
33562 +
33563 +  2. This file has been copied into a test harness and works.
33564 +
33565 +  3. Some users running old C compilers may wish to insert blanks around
33566 +  the "="  symbols of  assignments so  as to  avoid expressions  such as
33567 +  "a=*b;" being interpreted as "a=a*b;"
33568 +
33569 +  4. This code is public domain.
33570 +
33571 +  5. Warning:  This code  is non-deterministic insofar  as it  may yield
33572 +  different  compressed representations  of the  same file  on different
33573 +  runs. (However, it will always decompress correctly to the original).
33574 +
33575 +  6. If you use this code in anger (e.g. in a product) drop me a note at
33576 +  ross@spam.ua.oz.au and I will put you  on a mailing list which will be
33577 +  invoked if anyone finds a bug in this code.
33578 +
33579 +  7.   The  internet   newsgroup  comp.compression   might  also   carry
33580 +  information on this algorithm from time to time.
33581 +*/
33582 +/******************************************************************************/
33583 +#define FLAG_BYTES    4     /* Number of bytes used by copy flag. */
33584 +#define FLAG_COMPRESS 0     /* Signals that compression occurred. */
33585 +#define FLAG_COPY     1     /* Signals that a copyover occurred.  */
33586 +/******************************************************************************/
33587 +
33588 +LOCAL void __lzrw1_compress(p_src_first,src_len,p_dst_first,p_dst_len)
33589 +     /* Input  : Specify input block using p_src_first and src_len.          */
33590 +     /* Input  : Point p_dst_first to the start of the output zone (OZ).     */
33591 +     /* Input  : Point p_dst_len to a ULONG to receive the output length.    */
33592 +     /* Input  : Input block and output zone must not overlap.               */
33593 +     /* Output : Length of output block written to *p_dst_len.               */
33594 +     /* Output : Output block in Mem[p_dst_first..p_dst_first+*p_dst_len-1]. */
33595 +     /* Output : May write in OZ=Mem[p_dst_first..p_dst_first+src_len+256-1].*/
33596 +     /* Output : Upon completion guaranteed *p_dst_len<=src_len+FLAG_BYTES.  */
33597 +     UBYTE *p_src_first,*p_dst_first; ULONG src_len,*p_dst_len;
33598 +#define PS *p++!=*s++  /* Body of inner unrolled matching loop.         */
33599 +#define ITEMMAX 16     /* Maximum number of bytes in an expanded item.  */
33600 +{UBYTE *p_src=p_src_first,*p_dst=p_dst_first;
33601 + UBYTE *p_src_post=p_src_first+src_len,*p_dst_post=p_dst_first+src_len;
33602 + UBYTE *p_src_max1=p_src_post-ITEMMAX,*p_src_max16=p_src_post-16*ITEMMAX;
33603 + UBYTE *hash[4096],*p_control; UWORD control=0,control_bits=0;
33604 + *p_dst=FLAG_COMPRESS; p_dst+=FLAG_BYTES; p_control=p_dst; p_dst+=2;
33605 + while (TRUE)
33606 +   {UBYTE *p,*s; UWORD unroll=16,len,index; ULONG offset;
33607 +   if (p_dst>p_dst_post) goto overrun;
33608 +   if (p_src>p_src_max16)
33609 +     {unroll=1;
33610 +     if (p_src>p_src_max1)
33611 +       {if (p_src==p_src_post) break; goto literal;}}
33612 +   begin_unrolled_loop:
33613 +   index=((40543*((((p_src[0]<<4)^p_src[1])<<4)^p_src[2]))>>4) & 0xFFF;
33614 +   p=hash[index]; hash[index]=s=p_src; offset=s-p;
33615 +   if (offset>4095 || p<p_src_first || offset==0 || PS || PS || PS)
33616 +     {literal: *p_dst++=*p_src++; control>>=1; control_bits++;}
33617 +   else
33618 +     {PS || PS || PS || PS || PS || PS || PS ||
33619 +       PS || PS || PS || PS || PS || PS || s++; len=s-p_src-1;
33620 +     *p_dst++=((offset&0xF00)>>4)+(len-1); *p_dst++=offset&0xFF;
33621 +     p_src+=len; control=(control>>1)|0x8000; control_bits++;}
33622 +#ifndef linux
33623 +   end_unrolled_loop: if (--unroll) goto begin_unrolled_loop;
33624 +#else
33625 +   /* end_unrolled_loop: */ if (--unroll) goto begin_unrolled_loop;
33626 +#endif
33627 +   if (control_bits==16)
33628 +     {*p_control=control&0xFF; *(p_control+1)=control>>8;
33629 +     p_control=p_dst; p_dst+=2; control=control_bits=0;}
33630 +   }
33631 + control>>=16-control_bits;
33632 + *p_control++=control&0xFF; *p_control++=control>>8;
33633 + if (p_control==p_dst) p_dst-=2;
33634 + *p_dst_len=p_dst-p_dst_first;
33635 + return;
33636 + overrun: fast_copy(p_src_first,p_dst_first+FLAG_BYTES,src_len);
33637 + *p_dst_first=FLAG_COPY; *p_dst_len=src_len+FLAG_BYTES;
33638 +}
33639 +
33640 +/******************************************************************************/
33641 +
33642 +LOCAL void __lzrw1_decompress(p_src_first,src_len,p_dst_first,p_dst_len)
33643 +     /* Input  : Specify input block using p_src_first and src_len.          */
33644 +     /* Input  : Point p_dst_first to the start of the output zone.          */
33645 +     /* Input  : Point p_dst_len to a ULONG to receive the output length.    */
33646 +     /* Input  : Input block and output zone must not overlap. User knows    */
33647 +     /* Input  : upperbound on output block length from earlier compression. */
33648 +     /* Input  : In any case, maximum expansion possible is eight times.     */
33649 +     /* Output : Length of output block written to *p_dst_len.               */
33650 +     /* Output : Output block in Mem[p_dst_first..p_dst_first+*p_dst_len-1]. */
33651 +     /* Output : Writes only  in Mem[p_dst_first..p_dst_first+*p_dst_len-1]. */
33652 +     UBYTE *p_src_first, *p_dst_first; ULONG src_len, *p_dst_len;
33653 +{UWORD controlbits=0, control;
33654 + UBYTE *p_src=p_src_first+FLAG_BYTES, *p_dst=p_dst_first,
33655 +   *p_src_post=p_src_first+src_len;
33656 + if (*p_src_first==FLAG_COPY) /* block was stored verbatim after the flag word */
33657 +   {fast_copy(p_src_first+FLAG_BYTES,p_dst_first,src_len-FLAG_BYTES);
33658 +   *p_dst_len=src_len-FLAG_BYTES; return;}
33659 + while (p_src!=p_src_post) /* NOTE(review): no bounds checks on p_src, p_dst or offset are visible here */
33660 +   {if (controlbits==0)
33661 +     {control=*p_src++; control|=(*p_src++)<<8; controlbits=16;} /* next 16 control bits, low byte first */
33662 +   if (control&1) /* bit set: copy item; bit clear: literal byte */
33663 +     {UWORD offset,len; UBYTE *p;
33664 +     offset=(*p_src&0xF0)<<4; len=1+(*p_src++&0xF); /* high nibble: offset bits 8-11; low nibble: len-1 */
33665 +     offset+=*p_src++&0xFF; p=p_dst-offset; /* source of the copy lies 'offset' bytes back in the output */
33666 +     while (len--) *p_dst++=*p++;}
33667 +   else
33668 +     *p_dst++=*p_src++;
33669 +   control>>=1; controlbits--;
33670 +   }
33671 + *p_dst_len=p_dst-p_dst_first;
33672 +}
33673 +
33674 +/******************************************************************************/
33675 +/*                          End of LZRW1.C                                    */
33676 +/******************************************************************************/
33677 +/* Overrun hook for lzrw1: always 256; presumably the slack named in __lzrw1_compress's output-zone contract - confirm. */
33678 +static int
33679 +lzrw1_overrun(unsigned src_len UNUSED_ARG)
33680 +{
33681 +       return 256;
33682 +}
33683 +/* Plugin compress hook for lzrw1: asserts that no context is supplied and forwards to __lzrw1_compress(). */
33684 +static void
33685 +lzrw1_compress(tfm_info_t ctx, __u8 *src_first, unsigned src_len, __u8 *dst_first, unsigned *dst_len)
33686 +{
33687 +       assert("edward-764", ctx == NULL);
33688 +       __lzrw1_compress(src_first, src_len, dst_first, dst_len); /* writes the output length to *dst_len */
33689 +       return;
33690 +}
33691 +/* Plugin decompress hook for lzrw1: asserts that no context is supplied and forwards to __lzrw1_decompress(). */
33692 +static void
33693 +lzrw1_decompress(tfm_info_t ctx, __u8 *src_first, unsigned src_len, __u8 *dst_first, unsigned *dst_len)
33694 +{
33695 +       assert("edward-765", ctx == NULL);
33696 +       __lzrw1_decompress(src_first, src_len, dst_first, dst_len); /* writes the output length to *dst_len */
33697 +       return;
33698 +}
33699 +
33700 +/******************************************************************************/
33701 +/*                                GZIP1.C                                     */
33702 +/******************************************************************************/
33703 +/*                                                                            */
33704 +/* See linux/zlib.h for details                                               */
33705 +/*                                                                            */
33706 +/******************************************************************************/
33707 +
33708 +#define GZIP1_DEF_LEVEL                        Z_BEST_SPEED
33709 +#define GZIP1_DEF_WINBITS              15
33710 +#define GZIP1_DEF_MEMLEVEL             MAX_MEM_LEVEL
33711 +/* Overrun hook used by the gzip1 plugin: always 0. NOTE(review): named gzip6_* although everything else here is gzip1. */
33712 +static int
33713 +gzip6_overrun(unsigned src_len UNUSED_ARG)
33714 +{
33715 +       return 0;
33716 +}
33717 +/* Allocate a zeroed zlib workspace into *ctx for the given action; returns 0, -ENOMEM, or -ENXIO without REISER4_GZIP_TFM. */
33718 +static int
33719 +gzip1_alloc (tfm_info_t * ctx, tfm_action act)
33720 +{
33721 +       int ret = -ENXIO; /* result when gzip support is compiled out */
33722 +       assert("edward-766", *ctx == NULL);
33723 +#if REISER4_GZIP_TFM
33724 +       ret = 0;
33725 +       switch (act) {
33726 +       case TFM_WRITE: /* compress */
33727 +               *ctx = __vmalloc(zlib_deflate_workspacesize(),
33728 +                                (in_softirq() ? GFP_ATOMIC : GFP_KERNEL)|__GFP_HIGHMEM,
33729 +                                PAGE_KERNEL);
33730 +               if (*ctx == NULL) {
33731 +                       ret = -ENOMEM;
33732 +                       break;
33733 +               }
33734 +               xmemset(*ctx, 0, zlib_deflate_workspacesize());
33735 +               break;
33736 +       case TFM_READ: /* decompress */
33737 +               *ctx = reiser4_kmalloc(zlib_inflate_workspacesize(),
33738 +                                      (in_softirq() ? GFP_ATOMIC : GFP_KERNEL));
33739 +               if (*ctx == NULL) {
33740 +                       ret = -ENOMEM;
33741 +                       break;
33742 +               }
33743 +               xmemset(*ctx, 0, zlib_inflate_workspacesize());
33744 +               break;
33745 +       default: /* NOTE(review): ret stays 0 and *ctx stays NULL if impossible() returns */
33746 +               impossible("edward-767", "alloc workspace for unknown tfm action");
33747 +       }
33748 +#endif
33749 +       if (ret)
33750 +               warning("edward-768", "alloc workspace for gzip1 (tfm action = %d) failed\n", act);
33751 +       return ret;
33752 +}
33753 +/* Release a workspace obtained from gzip1_alloc() with the same action; *ctx itself is not reset to NULL. */
33754 +static void
33755 +gzip1_free (tfm_info_t * ctx, tfm_action act)
33756 +{
33757 +#if REISER4_GZIP_TFM
33758 +       assert("edward-769", *ctx != NULL);
33759 +
33760 +       switch (act) {
33761 +       case TFM_WRITE: /* compress */
33762 +               vfree(*ctx); /* pairs with __vmalloc() in gzip1_alloc() */
33763 +               break;
33764 +       case TFM_READ: /* decompress */
33765 +               reiser4_kfree(*ctx); /* pairs with reiser4_kmalloc() in gzip1_alloc() */
33766 +               break;
33767 +       default:
33768 +               impossible("edward-770", "free workspace for unknown tfm action");
33769 +       }
33770 +#endif
33771 +       return;
33772 +}
33773 +/* Deflate src into dst; *dst_len is the dst capacity on entry and the output size on return (src_len on any failure). */
33774 +static void
33775 +gzip1_compress(tfm_info_t ctx, __u8 *src_first, unsigned src_len, __u8 *dst_first, unsigned *dst_len)
33776 +{
33777 +#if REISER4_GZIP_TFM
33778 +       int ret = 0;
33779 +       struct z_stream_s stream;
33780 +       compression_plugin * cplug = compression_plugin_by_id(GZIP1_COMPRESSION_ID);
33781 +
33782 +       xmemset(&stream, 0, sizeof(stream));
33783 +
33784 +       assert("edward-842", ctx != NULL); /* NOTE(review): contradicts the !ctx allocation path just below */
33785 +
33786 +       if (!ctx) {
33787 +               ret = cplug->alloc(&stream.workspace, TFM_WRITE);
33788 +               if (ret)
33789 +                       goto rollback;
33790 +       }
33791 +       else
33792 +               stream.workspace = ctx; /* caller-supplied workspace, not freed here */
33793 +
33794 +       ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
33795 +                               -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
33796 +                               Z_DEFAULT_STRATEGY);
33797 +       if (ret != Z_OK) {
33798 +               warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
33799 +               goto rollback;
33800 +       }
33801 +       ret = zlib_deflateReset(&stream);
33802 +       if (ret != Z_OK) {
33803 +               warning("edward-772", "zlib_deflateReset returned %d\n", ret);
33804 +               goto rollback;
33805 +       }
33806 +       stream.next_in = src_first;
33807 +       stream.avail_in = src_len;
33808 +       stream.next_out = dst_first;
33809 +       stream.avail_out = *dst_len;
33810 +
33811 +       ret = zlib_deflate(&stream, Z_FINISH); /* single shot: anything but Z_STREAM_END is treated as failure */
33812 +       if (ret != Z_STREAM_END) {
33813 +               warning("edward-773", "zlib_deflate returned %d\n", ret);
33814 +               goto rollback;
33815 +       }
33816 +       *dst_len = stream.total_out;
33817 +       if (!ctx)
33818 +               cplug->free(&stream.workspace, TFM_WRITE);
33819 +       return;
33820 + rollback:
33821 +       if (!ctx && stream.workspace)
33822 +               cplug->free(&stream.workspace, TFM_WRITE);
33823 +       *dst_len = src_len; /* failure is reported only as "no size reduction" */
33824 +#endif
33825 +       return;
33826 +}
33827 +/* Inflate src into dst; *dst_len is the dst capacity on entry and the output size on success (left untouched on failure). */
33828 +static void
33829 +gzip1_decompress(tfm_info_t ctx, __u8 *src_first, unsigned src_len, __u8 *dst_first, unsigned *dst_len)
33830 +{
33831 +#if REISER4_GZIP_TFM
33832 +       int ret = 0;
33833 +       struct z_stream_s stream;
33834 +       compression_plugin * cplug = compression_plugin_by_id(GZIP1_COMPRESSION_ID);
33835 +
33836 +       xmemset(&stream, 0, sizeof(stream));
33837 +
33838 +       assert("edward-843", ctx == NULL); /* NOTE(review): contradicts the else branch that uses a supplied ctx */
33839 +
33840 +       if (!ctx) {
33841 +               ret = cplug->alloc(&stream.workspace, TFM_READ);
33842 +               if (ret)
33843 +                       goto out;
33844 +       }
33845 +       else
33846 +               stream.workspace = ctx; /* caller-supplied workspace, not freed here */
33847 +
33848 +       ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
33849 +       if (ret != Z_OK) {
33850 +               warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
33851 +               goto out;
33852 +       }
33853 +       ret = zlib_inflateReset(&stream);
33854 +       if (ret != Z_OK) {
33855 +               warning("edward-775", "zlib_inflateReset returned %d\n", ret);
33856 +               goto out;
33857 +       }
33858 +
33859 +       stream.next_in = src_first;
33860 +       stream.avail_in = src_len;
33861 +       stream.next_out = dst_first;
33862 +       stream.avail_out = *dst_len;
33863 +
33864 +       ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
33865 +       /*
33866 +        * Work around a bug in zlib, which sometimes wants to taste an extra
33867 +        * byte when being used in the (undocumented) raw deflate mode.
33868 +        * (From USAGI).
33869 +        */
33870 +       if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
33871 +               u8 zerostuff = 0;
33872 +               stream.next_in = &zerostuff;
33873 +               stream.avail_in = 1;
33874 +               ret = zlib_inflate(&stream, Z_FINISH);
33875 +       }
33876 +       if (ret != Z_STREAM_END) {
33877 +               warning("edward-776", "zlib_inflate returned %d\n", ret);
33878 +               goto out; /* NOTE(review): the caller gets no failure indication, *dst_len keeps its entry value */
33879 +       }
33880 +       *dst_len = stream.total_out;
33881 + out:
33882 +       if (!ctx && stream.workspace)
33883 +               cplug->free(&stream.workspace, TFM_READ);
33884 +#endif
33885 +       return;
33886 +}
33887 +
33888 +/******************************************************************************/
33889 +/*                            none compression                                */
33890 +/******************************************************************************/
33891 +/* Overrun hook shared by the "none" and "null" plugins: always 0, the source length is ignored. */
33892 +static int
33893 +none_overrun(unsigned src_len UNUSED_ARG)
33894 +{
33895 +       return 0;
33896 +}
33897 +
33898 +/******************************************************************************/
33899 +/*                                 lzo1                                       */
33900 +/******************************************************************************/
33901 +/* Overrun hook for lzo1: in_len/64 + 16 + 3; presumably the worst-case expansion bound from the LZO 1.08 docs - confirm. */
33902 +static int
33903 +lzo1_overrun(unsigned in_len)
33904 +{
33905 +       return in_len / 64 + 16 + 3;
33906 +}
33907 +/* Declare 'var' as an array of lzo_align_t covering at least 'size' bytes (rounded up to whole lzo_align_t units). */
33908 +#define HEAP_ALLOC(var,size) \
33909 +       lzo_align_t __LZO_MMODEL var [ ((size) + (sizeof(lzo_align_t) - 1)) / sizeof(lzo_align_t) ]
33910 +/* Plugin compress hook for lzo1: runs lzo1x_1_compress(); on any failure *dst_len is set to src_len. */
33911 +static void
33912 +lzo1_compress(tfm_info_t ctx, __u8 *src_first, unsigned src_len, __u8 *dst_first, unsigned *dst_len)
33913 +{
33914 +       int result;
33915 +       HEAP_ALLOC(wrkmem,LZO1X_1_MEM_COMPRESS); /* NOTE(review): automatic (stack) array despite the macro name; check its size against the kernel stack */
33916 +
33917 +       assert("edward-846", ctx == NULL);
33918 +       assert("edward-847", src_len != 0);
33919 +
33920 +       result = lzo_init(); /* executed on every call */
33921 +
33922 +       if (result != LZO_E_OK) {
33923 +               warning("edward-848", "lzo_init() failed\n");
33924 +               goto out;
33925 +       }
33926 +       result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, wrkmem);
33927 +       if (result != LZO_E_OK) {
33928 +               warning("edward-849", "lzo1x_1_compress failed\n");
33929 +               goto out;
33930 +       }
33931 +       if (*dst_len >= src_len) /* only warns; *dst_len keeps the size lzo1x_1_compress() reported */
33932 +               warning("edward-850", "lzo1x_1_compress: incompressible data\n");
33933 +       return;
33934 + out:
33935 +       *dst_len = src_len;
33936 +       return;
33937 +}
33938 +/* Plugin decompress hook for lzo1: runs lzo1x_decompress(); a failure is only logged, nothing is returned to the caller. */
33939 +static void
33940 +lzo1_decompress(tfm_info_t ctx, __u8 *src_first, unsigned src_len, __u8 *dst_first, unsigned *dst_len)
33941 +{
33942 +       int result;
33943 +
33944 +       assert("edward-851", ctx == NULL);
33945 +       assert("edward-852", src_len != 0);
33946 +
33947 +       result = lzo1x_decompress(src_first, src_len, dst_first, dst_len, NULL); /* last argument (work memory) is NULL */
33948 +       if (result != LZO_E_OK)
33949 +               warning("edward-853", "lzo1x_1_decompress failed\n");
33950 +       return;
33951 +}
33952 +/* Table of the built-in compression plugins, indexed by compression plugin id. */
33953 +compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
33954 +       [NONE_COMPRESSION_ID] = {
33955 +               .h = {
33956 +                       .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
33957 +                       .id = NONE_COMPRESSION_ID,
33958 +                       .pops = NULL,
33959 +                       .label = "none",
33960 +                       .desc = "absence of any compression transform",
33961 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
33962 +               },
33963 +               .overrun = none_overrun,
33964 +               .alloc = NULL,
33965 +               .free = NULL,
33966 +               .compress = NULL, /* no transform hooks at all for "none" */
33967 +               .decompress = NULL
33968 +       },
33969 +       [NULL_COMPRESSION_ID] = {
33970 +               .h = {
33971 +                       .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
33972 +                       .id = NULL_COMPRESSION_ID,
33973 +                       .pops = NULL,
33974 +                       .label = "null",
33975 +                       .desc = "fast copy",
33976 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
33977 +               },
33978 +               .overrun = none_overrun,
33979 +               .alloc = NULL,
33980 +               .free = NULL,
33981 +               .compress = null_compress,
33982 +               .decompress = null_decompress /* trips impossible() if ever called */
33983 +       },
33984 +       [LZRW1_COMPRESSION_ID] = {
33985 +               .h = {
33986 +                       .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
33987 +                       .id = LZRW1_COMPRESSION_ID,
33988 +                       .pops = NULL,
33989 +                       .label = "lzrw1",
33990 +                       .desc = "lzrw1 compression transform",
33991 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
33992 +               },
33993 +               .overrun = lzrw1_overrun,
33994 +               .alloc = NULL, /* lzrw1 hooks assert ctx == NULL */
33995 +               .free = NULL,
33996 +               .compress = lzrw1_compress,
33997 +               .decompress = lzrw1_decompress
33998 +       },
33999 +       [LZO1_COMPRESSION_ID] = {
34000 +               .h = {
34001 +                       .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
34002 +                       .id = LZO1_COMPRESSION_ID,
34003 +                       .pops = NULL,
34004 +                       .label = "lzo1",
34005 +                       .desc = "lzo1 compression transform",
34006 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
34007 +               },
34008 +               .overrun = lzo1_overrun,
34009 +               .alloc = NULL, /* lzo1 hooks assert ctx == NULL */
34010 +               .free = NULL,
34011 +               .compress = lzo1_compress,
34012 +               .decompress = lzo1_decompress
34013 +       },
34014 +       [GZIP1_COMPRESSION_ID] = {
34015 +               .h = {
34016 +                       .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
34017 +                       .id = GZIP1_COMPRESSION_ID,
34018 +                       .pops = NULL,
34019 +                       .label = "gzip1",
34020 +                       .desc = "gzip1 compression transform",
34021 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
34022 +               },
34023 +               .overrun = gzip6_overrun, /* NOTE(review): gzip6_* name on the gzip1 plugin */
34024 +               .alloc = gzip1_alloc, /* the only plugin here with workspace hooks */
34025 +               .free = gzip1_free,
34026 +               .compress = gzip1_compress,
34027 +               .decompress = gzip1_decompress
34028 +       }
34029 +};
34030 +
34031 +/*
34032 +  Local variables:
34033 +  c-indentation-style: "K&R"
34034 +  mode-name: "LC"
34035 +  c-basic-offset: 8
34036 +  tab-width: 8
34037 +  fill-column: 120
34038 +  scroll-step: 1
34039 +  End:
34040 +*/
34041 +
34042 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/compress/compress.h linux-2.6.8-rc3-a/fs/reiser4/plugin/compress/compress.h
34043 --- linux-2.6.8-rc3/fs/reiser4/plugin/compress/compress.h       1970-01-01 03:00:00.000000000 +0300
34044 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/compress/compress.h     2004-08-05 21:20:53.187638497 +0400
34045 @@ -0,0 +1,77 @@
34046 +#if !defined( __FS_REISER4_COMPRESS_H__ )
34047 +#define __FS_REISER4_COMPRESS_H__ /* must be the macro tested above, otherwise a second include redefines tfm_action */
34048 +
34049 +#include <linux/types.h>
34050 +#include <linux/string.h>
34051 +
34052 +#define NONE_NRCOPY 4
34053 +/* Direction of a transform; selects which workspace the plugin alloc/free hooks handle. */
34054 +typedef enum {
34055 +       TFM_READ,  /* decompress */
34056 +       TFM_WRITE  /* compress */
34057 +} tfm_action;
34058 +
34059 +/******************************************************************************/
34060 +/*                                                                            */
34061 +/*                                    PORT.H                                  */
34062 +/*                                                                            */
34063 +/******************************************************************************/
34064 +/*                                                                            */
34065 +/* This module contains macro definitions and types that are likely to        */
34066 +/* change between computers.                                                  */
34067 +/*                                                                            */
34068 +/******************************************************************************/
34069 +
34070 +#ifndef DONE_PORT       /* Only do this if not previously done.               */
34071 +
34072 +   #ifdef THINK_C
34073 +      #define UBYTE unsigned char      /* Unsigned byte                       */
34074 +      #define UWORD unsigned int       /* Unsigned word (2 bytes)             */
34075 +      #define ULONG unsigned long      /* Unsigned word (4 bytes)             */
34076 +      #define BOOL  unsigned char      /* Boolean                             */
34077 +      #define FOPEN_BINARY_READ  "rb"  /* Mode string for binary reading.     */
34078 +      #define FOPEN_BINARY_WRITE "wb"  /* Mode string for binary writing.     */
34079 +      #define FOPEN_TEXT_APPEND  "a"   /* Mode string for text appending.     */
34080 +      #define REAL double              /* Used for floating point stuff.      */
34081 +   #endif
34082 +   #if defined(LINUX) || defined(linux)
34083 +      #define UBYTE __u8               /* Unsigned byte                       */
34084 +      #define UWORD __u16              /* Unsigned word (2 bytes)             */
34085 +      #define ULONG __u32              /* Unsigned word (4 bytes)             */
34086 +      #define LONG  __s32              /* Signed   word (4 bytes)             */
34087 +      #define BOOL  is not used here   /* Boolean                             */
34088 +      #define FOPEN_BINARY_READ  not used  /* Mode string for binary reading. */
34089 +      #define FOPEN_BINARY_WRITE not used  /* Mode string for binary writing. */
34090 +      #define FOPEN_TEXT_APPEND  not used  /* Mode string for text appending. */
34091 +      #define REAL not used                /* Used for floating point stuff.  */
34092 +      #ifndef TRUE
34093 +      #define TRUE 1
34094 +      #endif
34095 +   #endif
34096 +
34097 +   #define DONE_PORT                   /* Don't do all this again.            */
34098 +   #define MALLOC_FAIL NULL            /* Failure status from malloc()        */
34099 +   #define LOCAL static                /* For non-exported routines.          */
34100 +   #define EXPORT                      /* Signals exported function.          */
34101 +   #define then                        /* Useful for aligning ifs.            */
34102 +
34103 +#endif
34104 +
34105 +/******************************************************************************/
34106 +/*                              End of PORT.H                                 */
34107 +/******************************************************************************/
34108 +/* Copy len bytes from src to dst; note the (src, dst) argument order, swapped relative to xmemcpy(dst, src). */
34109 +#define fast_copy(src,dst,len) xmemcpy(dst,src,len)
34110 +
34111 +#endif /* __FS_REISER4_COMPRESS_H__ */
34112 +
34113 +/* Make Linus happy.
34114 +   Local variables:
34115 +   c-indentation-style: "K&R"
34116 +   mode-name: "LC"
34117 +   c-basic-offset: 8
34118 +   tab-width: 8
34119 +   fill-column: 120
34120 +   scroll-step: 1
34121 +   End:
34122 +*/
34123 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/compress/lzoconf.h linux-2.6.8-rc3-a/fs/reiser4/plugin/compress/lzoconf.h
34124 --- linux-2.6.8-rc3/fs/reiser4/plugin/compress/lzoconf.h        1970-01-01 03:00:00.000000000 +0300
34125 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/compress/lzoconf.h      2004-08-05 21:20:52.996678775 +0400
34126 @@ -0,0 +1,456 @@
34127 +/* lzoconf.h -- configuration for the LZO real-time data compression library
34128 +   adapted for the reiser4 compression transform plugin
34129 +
34130 +   This file is part of the LZO real-time data compression library.
34131 +
34132 +   Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
34133 +   Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
34134 +   Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
34135 +   Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
34136 +   Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
34137 +   Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
34138 +   Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
34139 +   All Rights Reserved.
34140 +
34141 +   The LZO library is free software; you can redistribute it and/or
34142 +   modify it under the terms of the GNU General Public License as
34143 +   published by the Free Software Foundation; either version 2 of
34144 +   the License, or (at your option) any later version.
34145 +
34146 +   The LZO library is distributed in the hope that it will be useful,
34147 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
34148 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34149 +   GNU General Public License for more details.
34150 +
34151 +   You should have received a copy of the GNU General Public License
34152 +   along with the LZO library; see the file COPYING.
34153 +   If not, write to the Free Software Foundation, Inc.,
34154 +   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
34155 +
34156 +   Markus F.X.J. Oberhumer
34157 +   <markus@oberhumer.com>
34158 +   http://www.oberhumer.com/opensource/lzo/
34159 + */
34160 +
34161 +#include <linux/kernel.h> /* for UINT_MAX, ULONG_MAX - edward */
34162 +
34163 +#ifndef __LZOCONF_H
34164 +#define __LZOCONF_H
34165 +
34166 +#define LZO_VERSION             0x1080
34167 +#define LZO_VERSION_STRING      "1.08"
34168 +#define LZO_VERSION_DATE        "Jul 12 2002"
34169 +
34170 +/* internal Autoconf configuration file - only used when building LZO */
34171 +#if defined(LZO_HAVE_CONFIG_H)
34172 +#  include <config.h>
34173 +#endif
34174 +#ifdef __cplusplus
34175 +extern "C" {
34176 +#endif
34177 +
34178 +
34179 +/***********************************************************************
34180 +// LZO requires a conforming <limits.h>
34181 +************************************************************************/
34182 +
34183 +#define CHAR_BIT  8 /* -edward */
34184 +#define USHRT_MAX 0xffff /* -edward */
34185 +
34186 +#if 0 /* -edward */
34187 +#if !defined(CHAR_BIT) || (CHAR_BIT != 8)
34188 +#  error "invalid CHAR_BIT"
34189 +#endif
34190 +#if !defined(UCHAR_MAX) || !defined(UINT_MAX) || !defined(ULONG_MAX)
34191 +#  error "check your compiler installation"
34192 +#endif
34193 +#if (USHRT_MAX < 1) || (UINT_MAX < 1) || (ULONG_MAX < 1)
34194 +#  error "your limits.h macros are broken"
34195 +#endif
34196 +#endif /* -edward */
34197 +/* workaround a cpp bug under hpux 10.20 */
34198 +#define LZO_0xffffffffL         4294967295ul
34199 +
34200 +#if 0 /* -edward */
34201 +#if !defined(LZO_UINT32_C)
34202 +#  if (UINT_MAX < LZO_0xffffffffL)
34203 +#    define LZO_UINT32_C(c)     c ## UL
34204 +#  else
34205 +#    define LZO_UINT32_C(c)     c ## U
34206 +#  endif
34207 +#endif
34208 +#endif /* -edward */
34209 +
34210 +/***********************************************************************
34211 +// architecture defines
34212 +************************************************************************/
34213 +
34214 +#if !defined(__LZO_WIN) && !defined(__LZO_DOS) && !defined(__LZO_OS2)
34215 +#  if defined(__WINDOWS__) || defined(_WINDOWS) || defined(_Windows)
34216 +#    define __LZO_WIN
34217 +#  elif defined(__WIN32__) || defined(_WIN32) || defined(WIN32)
34218 +#    define __LZO_WIN
34219 +#  elif defined(__NT__) || defined(__NT_DLL__) || defined(__WINDOWS_386__)
34220 +#    define __LZO_WIN
34221 +#  elif defined(__DOS__) || defined(__MSDOS__) || defined(MSDOS)
34222 +#    define __LZO_DOS
34223 +#  elif defined(__OS2__) || defined(__OS2V2__) || defined(OS2)
34224 +#    define __LZO_OS2
34225 +#  elif defined(__palmos__)
34226 +#    define __LZO_PALMOS
34227 +#  elif defined(__TOS__) || defined(__atarist__)
34228 +#    define __LZO_TOS
34229 +#  endif
34230 +#endif
34231 +
34232 +#if (UINT_MAX < LZO_0xffffffffL)
34233 +#  if defined(__LZO_WIN)
34234 +#    define __LZO_WIN16
34235 +#  elif defined(__LZO_DOS)
34236 +#    define __LZO_DOS16
34237 +#  elif defined(__LZO_PALMOS)
34238 +#    define __LZO_PALMOS16
34239 +#  elif defined(__LZO_TOS)
34240 +#    define __LZO_TOS16
34241 +#  elif defined(__C166__)
34242 +#  else
34243 +     /* porting hint: for pure 16-bit architectures try compiling
34244 +      * everything with -D__LZO_STRICT_16BIT */
34245 +#    error "16-bit target not supported - contact me for porting hints"
34246 +#  endif
34247 +#endif
34248 +
34249 +#if !defined(__LZO_i386)
34250 +#  if defined(__LZO_DOS) || defined(__LZO_WIN16)
34251 +#    define __LZO_i386
34252 +#  elif defined(__i386__) || defined(__386__) || defined(_M_IX86)
34253 +#    define __LZO_i386
34254 +#  endif
34255 +#endif
34256 +
34257 +#if defined(__LZO_STRICT_16BIT)
34258 +#  if (UINT_MAX < LZO_0xffffffffL)
34259 +#    include <lzo16bit.h>
34260 +#  endif
34261 +#endif
34262 +
34263 +/* memory checkers */
34264 +#if !defined(__LZO_CHECKER)
34265 +#  if defined(__BOUNDS_CHECKING_ON)
34266 +#    define __LZO_CHECKER
34267 +#  elif defined(__CHECKER__)
34268 +#    define __LZO_CHECKER
34269 +#  elif defined(__INSURE__)
34270 +#    define __LZO_CHECKER
34271 +#  elif defined(__PURIFY__)
34272 +#    define __LZO_CHECKER
34273 +#  endif
34274 +#endif
34275 +
34276 +
34277 +/***********************************************************************
34278 +// integral and pointer types
34279 +************************************************************************/
34280 +
34281 +/* Integral types with 32 bits or more */
34282 +#if !defined(LZO_UINT32_MAX)
34283 +#  if (UINT_MAX >= LZO_0xffffffffL)
34284 +     typedef unsigned int       lzo_uint32;
34285 +     typedef int                lzo_int32;
34286 +#    define LZO_UINT32_MAX      UINT_MAX
34287 +#    define LZO_INT32_MAX       INT_MAX
34288 +#    define LZO_INT32_MIN       INT_MIN
34289 +#  elif (ULONG_MAX >= LZO_0xffffffffL)
34290 +     typedef unsigned long      lzo_uint32;
34291 +     typedef long               lzo_int32;
34292 +#    define LZO_UINT32_MAX      ULONG_MAX
34293 +#    define LZO_INT32_MAX       LONG_MAX
34294 +#    define LZO_INT32_MIN       LONG_MIN
34295 +#  else
34296 +#    error "lzo_uint32"
34297 +#  endif
34298 +#endif
34299 +
34300 +/* lzo_uint is used like size_t; it is int-sized whenever int has at least 32 bits */
34301 +#if !defined(LZO_UINT_MAX) /* skipped when LZO_UINT_MAX is already defined */
34302 +#  if (UINT_MAX >= LZO_0xffffffffL)
34303 +     typedef unsigned int       lzo_uint;
34304 +     typedef int                lzo_int;
34305 +#    define LZO_UINT_MAX        UINT_MAX
34306 +#    define LZO_INT_MAX         INT_MAX
34307 +#    define LZO_INT_MIN         INT_MIN
34308 +#  elif (ULONG_MAX >= LZO_0xffffffffL)
34309 +     typedef unsigned long      lzo_uint;
34310 +     typedef long               lzo_int;
34311 +#    define LZO_UINT_MAX        ULONG_MAX
34312 +#    define LZO_INT_MAX         LONG_MAX
34313 +#    define LZO_INT_MIN         LONG_MIN
34314 +#  else /* neither int nor long reaches 32 bits */
34315 +#    error "lzo_uint"
34316 +#  endif
34317 +#endif
34318 +
34319 +typedef int lzo_bool;
34320 +
34321 +
34322 +/***********************************************************************
34323 +// memory models
34324 +************************************************************************/
34325 +
34326 +/* Memory model for the public code segment. */
34327 +#if !defined(__LZO_CMODEL)
34328 +#  if defined(__LZO_DOS16) || defined(__LZO_WIN16)
34329 +#    define __LZO_CMODEL        __far
34330 +#  elif defined(__LZO_i386) && defined(__WATCOMC__)
34331 +#    define __LZO_CMODEL        __near
34332 +#  else
34333 +#    define __LZO_CMODEL
34334 +#  endif
34335 +#endif
34336 +
34337 +/* Memory model for the public data segment. */
34338 +#if !defined(__LZO_DMODEL)
34339 +#  if defined(__LZO_DOS16) || defined(__LZO_WIN16)
34340 +#    define __LZO_DMODEL        __far
34341 +#  elif defined(__LZO_i386) && defined(__WATCOMC__)
34342 +#    define __LZO_DMODEL        __near
34343 +#  else
34344 +#    define __LZO_DMODEL
34345 +#  endif
34346 +#endif
34347 +
34348 +/* Memory model that allows accessing memory at offsets of type lzo_uint. */
34349 +#if !defined(__LZO_MMODEL)
34350 +#  if (LZO_UINT_MAX <= UINT_MAX)
34351 +#    define __LZO_MMODEL
34352 +#  elif defined(__LZO_DOS16) || defined(__LZO_WIN16)
34353 +#    define __LZO_MMODEL        __huge
34354 +#    define LZO_999_UNSUPPORTED
34355 +#  elif defined(__LZO_PALMOS16) || defined(__LZO_TOS16)
34356 +#    define __LZO_MMODEL
34357 +#  else
34358 +#    error "__LZO_MMODEL"
34359 +#  endif
34360 +#endif
34361 +
34362 +/* macros rather than typedefs, so that "const lzo_bytep" etc. expand to a pointer to const data */
34363 +#define lzo_byte                unsigned char __LZO_MMODEL
34364 +#define lzo_bytep               unsigned char __LZO_MMODEL *
34365 +#define lzo_charp               char __LZO_MMODEL *
34366 +#define lzo_voidp               void __LZO_MMODEL *
34367 +#define lzo_shortp              short __LZO_MMODEL *
34368 +#define lzo_ushortp             unsigned short __LZO_MMODEL *
34369 +#define lzo_uint32p             lzo_uint32 __LZO_MMODEL *
34370 +#define lzo_int32p              lzo_int32 __LZO_MMODEL *
34371 +#define lzo_uintp               lzo_uint __LZO_MMODEL *
34372 +#define lzo_intp                lzo_int __LZO_MMODEL *
34373 +#define lzo_voidpp              lzo_voidp __LZO_MMODEL *
34374 +#define lzo_bytepp              lzo_bytep __LZO_MMODEL *
34375 +
34376 +#ifndef lzo_sizeof_dict_t
34377 +#  define lzo_sizeof_dict_t     sizeof(lzo_bytep)
34378 +#endif
34379 +
34380 +
34381 +/***********************************************************************
34382 +// calling conventions and function types
34383 +************************************************************************/
34384 +
34385 +/* linkage */
34386 +#if !defined(__LZO_EXTERN_C)
34387 +#  ifdef __cplusplus
34388 +#    define __LZO_EXTERN_C      extern "C"
34389 +#  else
34390 +#    define __LZO_EXTERN_C      extern
34391 +#  endif
34392 +#endif
34393 +
34394 +/* calling convention */
34395 +#if !defined(__LZO_CDECL)
34396 +#  if defined(__LZO_DOS16) || defined(__LZO_WIN16)
34397 +#    define __LZO_CDECL         __LZO_CMODEL __cdecl
34398 +#  elif defined(__LZO_i386) && defined(_MSC_VER)
34399 +#    define __LZO_CDECL         __LZO_CMODEL __cdecl
34400 +#  elif defined(__LZO_i386) && defined(__WATCOMC__)
34401 +#    define __LZO_CDECL         __LZO_CMODEL __cdecl
34402 +#  else
34403 +#    define __LZO_CDECL         __LZO_CMODEL
34404 +#  endif
34405 +#endif
34406 +#if !defined(__LZO_ENTRY)
34407 +#  define __LZO_ENTRY           __LZO_CDECL
34408 +#endif
34409 +
34410 +/* C++ exception specification for extern "C" function types */
34411 +#if !defined(__cplusplus)
34412 +#  undef LZO_NOTHROW
34413 +#  define LZO_NOTHROW
34414 +#elif !defined(LZO_NOTHROW)
34415 +#  define LZO_NOTHROW
34416 +#endif
34417 +
34418 +
34419 +typedef int
34420 +(__LZO_ENTRY *lzo_compress_t)   ( const lzo_byte *src, lzo_uint  src_len,
34421 +                                        lzo_byte *dst, lzo_uintp dst_len,
34422 +                                        lzo_voidp wrkmem );
34423 +
34424 +typedef int
34425 +(__LZO_ENTRY *lzo_decompress_t) ( const lzo_byte *src, lzo_uint  src_len,
34426 +                                        lzo_byte *dst, lzo_uintp dst_len,
34427 +                                        lzo_voidp wrkmem );
34428 +
34429 +typedef int
34430 +(__LZO_ENTRY *lzo_optimize_t)   (       lzo_byte *src, lzo_uint  src_len,
34431 +                                        lzo_byte *dst, lzo_uintp dst_len,
34432 +                                        lzo_voidp wrkmem );
34433 +
34434 +typedef int
34435 +(__LZO_ENTRY *lzo_compress_dict_t)(const lzo_byte *src, lzo_uint  src_len,
34436 +                                        lzo_byte *dst, lzo_uintp dst_len,
34437 +                                        lzo_voidp wrkmem,
34438 +                                  const lzo_byte *dict, lzo_uint dict_len );
34439 +
34440 +typedef int
34441 +(__LZO_ENTRY *lzo_decompress_dict_t)(const lzo_byte *src, lzo_uint  src_len,
34442 +                                        lzo_byte *dst, lzo_uintp dst_len,
34443 +                                        lzo_voidp wrkmem,
34444 +                                  const lzo_byte *dict, lzo_uint dict_len );
34445 +
34446 +
34447 +/* assembler versions always use __cdecl */
34448 +typedef int
34449 +(__LZO_CDECL *lzo_compress_asm_t)( const lzo_byte *src, lzo_uint  src_len,
34450 +                                        lzo_byte *dst, lzo_uintp dst_len,
34451 +                                        lzo_voidp wrkmem );
34452 +
34453 +typedef int
34454 +(__LZO_CDECL *lzo_decompress_asm_t)( const lzo_byte *src, lzo_uint  src_len,
34455 +                                        lzo_byte *dst, lzo_uintp dst_len,
34456 +                                        lzo_voidp wrkmem );
34457 +
34458 +
34459 +/* a progress indicator callback function */
34460 +typedef void (__LZO_ENTRY *lzo_progress_callback_t) (lzo_uint, lzo_uint);
34461 +
34462 +
34463 +/***********************************************************************
34464 +// export information
34465 +************************************************************************/
34466 +
34467 +/* DLL export information */
34468 +#if !defined(__LZO_EXPORT1)
34469 +#  define __LZO_EXPORT1
34470 +#endif
34471 +#if !defined(__LZO_EXPORT2)
34472 +#  define __LZO_EXPORT2
34473 +#endif
34474 +
34475 +/* exported calling convention for C functions */
34476 +#if !defined(LZO_PUBLIC)
34477 +#  define LZO_PUBLIC(_rettype) \
34478 +                __LZO_EXPORT1 _rettype __LZO_EXPORT2 __LZO_ENTRY
34479 +#endif
34480 +#if !defined(LZO_EXTERN)
34481 +#  define LZO_EXTERN(_rettype)          __LZO_EXTERN_C LZO_PUBLIC(_rettype)
34482 +#endif
34483 +#if !defined(LZO_PRIVATE)
34484 +#  define LZO_PRIVATE(_rettype)         static _rettype __LZO_ENTRY
34485 +#endif
34486 +
34487 +/* exported __cdecl calling convention for assembler functions */
34488 +#if !defined(LZO_PUBLIC_CDECL)
34489 +#  define LZO_PUBLIC_CDECL(_rettype) \
34490 +                __LZO_EXPORT1 _rettype __LZO_EXPORT2 __LZO_CDECL
34491 +#endif
34492 +#if !defined(LZO_EXTERN_CDECL)
34493 +#  define LZO_EXTERN_CDECL(_rettype)    __LZO_EXTERN_C LZO_PUBLIC_CDECL(_rettype)
34494 +#endif
34495 +
34496 +/* exported global variables (LZO currently uses no static variables and
34497 + * is fully thread safe) */
34498 +#if !defined(LZO_PUBLIC_VAR)
34499 +#  define LZO_PUBLIC_VAR(_type) \
34500 +                __LZO_EXPORT1 _type __LZO_EXPORT2 __LZO_DMODEL
34501 +#endif
34502 +#if !defined(LZO_EXTERN_VAR)
34503 +#  define LZO_EXTERN_VAR(_type)         extern LZO_PUBLIC_VAR(_type)
34504 +#endif
34505 +
34506 +
34507 +/***********************************************************************
34508 +// error codes and prototypes
34509 +************************************************************************/
34510 +
34511 +/* Error codes for the compression/decompression functions. Negative
34512 + * values are errors, positive values will be used for special but
34513 + * normal events.
34514 + */
34515 +#define LZO_E_OK                    0
34516 +#define LZO_E_ERROR                 (-1)
34517 +#define LZO_E_OUT_OF_MEMORY         (-2)    /* not used right now */
34518 +#define LZO_E_NOT_COMPRESSIBLE      (-3)    /* not used right now */
34519 +#define LZO_E_INPUT_OVERRUN         (-4)
34520 +#define LZO_E_OUTPUT_OVERRUN        (-5)
34521 +#define LZO_E_LOOKBEHIND_OVERRUN    (-6)
34522 +#define LZO_E_EOF_NOT_FOUND         (-7)
34523 +#define LZO_E_INPUT_NOT_CONSUMED    (-8)
34524 +
34525 +
34526 +/* lzo_init() should be the first function you call.
34527 + * Check the return code !
34528 + *
34529 + * lzo_init() is a macro to allow checking that the library and the
34530 + * compiler's view of various types are consistent.
34531 + */
34532 +#define lzo_init() __lzo_init2(LZO_VERSION,(int)sizeof(short),(int)sizeof(int),\
34533 +    (int)sizeof(long),(int)sizeof(lzo_uint32),(int)sizeof(lzo_uint),\
34534 +    (int)lzo_sizeof_dict_t,(int)sizeof(char *),(int)sizeof(lzo_voidp),\
34535 +    (int)sizeof(lzo_compress_t))
34536 +LZO_EXTERN(int) __lzo_init2(unsigned,int,int,int,int,int,int,int,int,int);
34537 +
34538 +/* version functions (useful for shared libraries) */
34539 +LZO_EXTERN(unsigned) lzo_version(void);
34540 +LZO_EXTERN(const char *) lzo_version_string(void);
34541 +LZO_EXTERN(const char *) lzo_version_date(void);
34542 +LZO_EXTERN(const lzo_charp) _lzo_version_string(void);
34543 +LZO_EXTERN(const lzo_charp) _lzo_version_date(void);
34544 +
34545 +/* string functions */
34546 +LZO_EXTERN(int)
34547 +lzo_memcmp(const lzo_voidp _s1, const lzo_voidp _s2, lzo_uint _len);
34548 +LZO_EXTERN(lzo_voidp)
34549 +lzo_memcpy(lzo_voidp _dest, const lzo_voidp _src, lzo_uint _len);
34550 +LZO_EXTERN(lzo_voidp)
34551 +lzo_memmove(lzo_voidp _dest, const lzo_voidp _src, lzo_uint _len);
34552 +LZO_EXTERN(lzo_voidp)
34553 +lzo_memset(lzo_voidp _s, int _c, lzo_uint _len);
34554 +
34555 +/* checksum functions */
34556 +LZO_EXTERN(lzo_uint32)
34557 +lzo_adler32(lzo_uint32 _adler, const lzo_byte *_buf, lzo_uint _len);
34558 +LZO_EXTERN(lzo_uint32)
34559 +lzo_crc32(lzo_uint32 _c, const lzo_byte *_buf, lzo_uint _len);
34560 +
34561 +/* misc. */
34562 +LZO_EXTERN(lzo_bool) lzo_assert(int _expr);
34563 +LZO_EXTERN(int) _lzo_config_check(void);
34564 +typedef union { lzo_bytep p; lzo_uint u; } __lzo_pu_u;
34565 +typedef union { lzo_bytep p; lzo_uint32 u32; } __lzo_pu32_u;
34566 +typedef union { void *vp; lzo_bytep bp; lzo_uint32 u32; long l; } lzo_align_t;
34567 +
34568 +/* align a char pointer on a boundary that is a multiple of `size' */
34569 +LZO_EXTERN(unsigned) __lzo_align_gap(const lzo_voidp _ptr, lzo_uint _size);
34570 +#define LZO_PTR_ALIGN_UP(_ptr,_size) \
34571 +    ((_ptr) + (lzo_uint) __lzo_align_gap((const lzo_voidp)(_ptr),(lzo_uint)(_size)))
34572 +
34573 +/* deprecated - only for backward compatibility */
34574 +#define LZO_ALIGN(_ptr,_size) LZO_PTR_ALIGN_UP(_ptr,_size)
34575 +
34576 +
34577 +#ifdef __cplusplus
34578 +} /* extern "C" */
34579 +#endif
34580 +
34581 +#endif /* already included */
34582 +
34583 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/compress/minilzo.c linux-2.6.8-rc3-a/fs/reiser4/plugin/compress/minilzo.c
34584 --- linux-2.6.8-rc3/fs/reiser4/plugin/compress/minilzo.c        1970-01-01 03:00:00.000000000 +0300
34585 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/compress/minilzo.c      2004-08-05 21:20:53.313611926 +0400
34586 @@ -0,0 +1,2947 @@
34587 +/* minilzo.c -- mini subset of the LZO real-time data compression library
34588 +   Adapted for reiser4 compression transform plugin.
34589 +
34590 +   This file is part of the LZO real-time data compression library.
34591 +
34592 +   Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
34593 +   Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
34594 +   Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
34595 +   Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
34596 +   Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
34597 +   Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
34598 +   Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
34599 +   All Rights Reserved.
34600 +
34601 +   The LZO library is free software; you can redistribute it and/or
34602 +   modify it under the terms of the GNU General Public License as
34603 +   published by the Free Software Foundation; either version 2 of
34604 +   the License, or (at your option) any later version.
34605 +
34606 +   The LZO library is distributed in the hope that it will be useful,
34607 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
34608 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34609 +   GNU General Public License for more details.
34610 +
34611 +   You should have received a copy of the GNU General Public License
34612 +   along with the LZO library; see the file COPYING.
34613 +   If not, write to the Free Software Foundation, Inc.,
34614 +   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
34615 +
34616 +   Markus F.X.J. Oberhumer
34617 +   <markus@oberhumer.com>
34618 +   http://www.oberhumer.com/opensource/lzo/
34619 + */
34620 +
34621 +/*
34622 + * NOTE:
34623 + *   the full LZO package can be found at
34624 + *   http://www.oberhumer.com/opensource/lzo/
34625 + */
34626 +
34627 +#include "../../debug.h" /* for reiser4 assert macro -edward */
34628 +
34629 +#define __LZO_IN_MINILZO
34630 +#define LZO_BUILD
34631 +
34632 +#ifdef MINILZO_HAVE_CONFIG_H
34633 +#  include <config.h>
34634 +#endif
34635 +
34636 +#undef LZO_HAVE_CONFIG_H
34637 +#include "minilzo.h"
34638 +
34639 +#if !defined(MINILZO_VERSION) || (MINILZO_VERSION != 0x1080)
34640 +#  error "version mismatch in miniLZO source files"
34641 +#endif
34642 +
34643 +#ifdef MINILZO_HAVE_CONFIG_H
34644 +#  define LZO_HAVE_CONFIG_H
34645 +#endif
34646 +
34647 +#if 0 /* -edward */
34648 +#if !defined(LZO_NO_SYS_TYPES_H)
34649 +#  include <sys/types.h>
34650 +#endif
34651 +#include <stdio.h>
34652 +#endif /* -edward */
34653 +
34654 +#ifndef __LZO_CONF_H
34655 +#define __LZO_CONF_H
34656 +
34657 +#if !defined(__LZO_IN_MINILZO)
34658 +#  ifndef __LZOCONF_H
34659 +#    include <lzoconf.h>
34660 +#  endif
34661 +#endif
34662 +
34663 +#if defined(__BOUNDS_CHECKING_ON)
34664 +#  include <unchecked.h>
34665 +#else
34666 +#  define BOUNDS_CHECKING_OFF_DURING(stmt)      stmt
34667 +#  define BOUNDS_CHECKING_OFF_IN_EXPR(expr)     (expr)
34668 +#endif
34669 +
34670 +#if 0 /* edward: disabled; NOTE(review): this group only ends at the LAST #endif below */
34671 +#if !defined(LZO_HAVE_CONFIG_H)
34672 +#  include <stddef.h>
34673 +#  include <string.h>
34674 +#  if !defined(NO_STDLIB_H)
34675 +#    include <stdlib.h>
34676 +#  endif
34677 +#endif /* edward; NOTE(review): pairs with "#if !defined(LZO_HAVE_CONFIG_H)", not with "#if 0" */
34678 +#  define HAVE_MEMCMP  /* NOTE(review): these four defines still sit inside the outer */
34679 +#  define HAVE_MEMCPY  /* "#if 0", so they are skipped here; unless they are defined  */
34680 +#  define HAVE_MEMMOVE /* elsewhere, the mem* names get remapped to lzo_mem* further  */
34681 +#  define HAVE_MEMSET  /* down - confirm whether that was the intent                  */
34682 +#if 0 /* edward */
34683 +#else /* NOTE(review): also skipped, because the first "#if 0" is still open */
34684 +#  include <sys/types.h>
34685 +#  if defined(HAVE_STDDEF_H)
34686 +#    include <stddef.h>
34687 +#  endif
34688 +#  if defined(STDC_HEADERS)
34689 +#    include <string.h>
34690 +#    include <stdlib.h>
34691 +#  endif
34692 +#endif /* pairs with the second "#if 0" */
34693 +#endif /* edward; pairs with the first "#if 0" */
34694 +
34695 +#if defined(__LZO_DOS16) || defined(__LZO_WIN16)
34696 +#  define HAVE_MALLOC_H
34697 +#  define HAVE_HALLOC
34698 +#endif
34699 +
34700 +#undef NDEBUG /* from here on NDEBUG is derived solely from LZO_DEBUG */
34701 +#if !defined(LZO_DEBUG)
34702 +#  define NDEBUG
34703 +#endif
34704 +#if defined(LZO_DEBUG) || !defined(NDEBUG) /* true only when LZO_DEBUG is defined */
34705 +#  if !defined(NO_STDIO_H)
34706 +#    include <stdio.h>
34707 +#  endif
34708 +#endif
34709 +# if 0 /* edward: <assert.h> is not used; debug.h is included above for the reiser4 assert macro */
34710 +#include <assert.h>
34711 +#endif /* edward */
34712 +
34713 +#if !defined(LZO_COMPILE_TIME_ASSERT) /* array size becomes -1 when expr is false, forcing a compile error */
34714 +#  define LZO_COMPILE_TIME_ASSERT(expr) \
34715 +       { typedef int __lzo_compile_time_assert_fail[1 - 2 * !(expr)]; }
34716 +#endif /* the macro expands to a braced block, so it is only usable inside a function body */
34717 +
34718 +#if !defined(LZO_UNUSED)
34719 +#  if 1
34720 +#    define LZO_UNUSED(var)     ((void)&var)
34721 +#  elif 0
34722 +#    define LZO_UNUSED(var)     { typedef int __lzo_unused[sizeof(var) ? 2 : 1]; }
34723 +#  else
34724 +#    define LZO_UNUSED(parm)    (parm = parm)
34725 +#  endif
34726 +#endif
34727 +
34728 +#if !defined(__inline__) && !defined(__GNUC__)
34729 +#  if defined(__cplusplus)
34730 +#    define __inline__      inline
34731 +#  else
34732 +#    define __inline__
34733 +#  endif
34734 +#endif
34735 +
34736 +#if defined(NO_MEMCMP) /* explicit opt-out of the native memcmp */
34737 +#  undef HAVE_MEMCMP
34738 +#endif
34739 +
34740 +#if !defined(HAVE_MEMCMP) /* without HAVE_MEMCMP every memcmp call is routed to lzo_memcmp */
34741 +#  undef memcmp
34742 +#  define memcmp    lzo_memcmp
34743 +#endif
34744 +#if !defined(HAVE_MEMCPY) /* likewise memcpy -> lzo_memcpy */
34745 +#  undef memcpy
34746 +#  define memcpy    lzo_memcpy
34747 +#endif
34748 +#if !defined(HAVE_MEMMOVE) /* likewise memmove -> lzo_memmove */
34749 +#  undef memmove
34750 +#  define memmove   lzo_memmove
34751 +#endif
34752 +#if !defined(HAVE_MEMSET) /* likewise memset -> lzo_memset */
34753 +#  undef memset
34754 +#  define memset    lzo_memset
34755 +#endif
34756 +
34757 +#if 0
34758 +#  define LZO_BYTE(x)       ((unsigned char) (x))
34759 +#else
34760 +#  define LZO_BYTE(x)       ((unsigned char) ((x) & 0xff))
34761 +#endif
34762 +
34763 +#define LZO_MAX(a,b)        ((a) >= (b) ? (a) : (b))
34764 +#define LZO_MIN(a,b)        ((a) <= (b) ? (a) : (b))
34765 +#define LZO_MAX3(a,b,c)     ((a) >= (b) ? LZO_MAX(a,c) : LZO_MAX(b,c))
34766 +#define LZO_MIN3(a,b,c)     ((a) <= (b) ? LZO_MIN(a,c) : LZO_MIN(b,c))
34767 +
34768 +#define lzo_sizeof(type)    ((lzo_uint) (sizeof(type)))
34769 +
34770 +#define LZO_HIGH(array)     ((lzo_uint) (sizeof(array)/sizeof(*(array))))
34771 +
34772 +#define LZO_SIZE(bits)      (1u << (bits))
34773 +#define LZO_MASK(bits)      (LZO_SIZE(bits) - 1)
34774 +
34775 +#define LZO_LSIZE(bits)     (1ul << (bits))
34776 +#define LZO_LMASK(bits)     (LZO_LSIZE(bits) - 1)
34777 +
34778 +#define LZO_USIZE(bits)     ((lzo_uint) 1 << (bits))
34779 +#define LZO_UMASK(bits)     (LZO_USIZE(bits) - 1)
34780 +
34781 +#define LZO_STYPE_MAX(b)    (((1l  << (8*(b)-2)) - 1l)  + (1l  << (8*(b)-2))) /* = 2^(8*b-1) - 1, built from two halves */
34782 +#define LZO_UTYPE_MAX(b)    (((1ul << (8*(b)-1)) - 1ul) + (1ul << (8*(b)-1))) /* = 2^(8*b) - 1, built from two halves */
34783 +
34784 +#if !defined(SIZEOF_UNSIGNED)
34785 +#  if (UINT_MAX == 0xffff)
34786 +#    define SIZEOF_UNSIGNED         2
34787 +#  elif (UINT_MAX == LZO_0xffffffffL)
34788 +#    define SIZEOF_UNSIGNED         4
34789 +#  elif (UINT_MAX >= LZO_0xffffffffL)
34790 +#    define SIZEOF_UNSIGNED         8
34791 +#  else
34792 +#    error "SIZEOF_UNSIGNED"
34793 +#  endif
34794 +#endif
34795 +
34796 +#if !defined(SIZEOF_UNSIGNED_LONG)
34797 +#  if (ULONG_MAX == LZO_0xffffffffL)
34798 +#    define SIZEOF_UNSIGNED_LONG    4
34799 +#  elif (ULONG_MAX >= LZO_0xffffffffL)
34800 +#    define SIZEOF_UNSIGNED_LONG    8
34801 +#  else
34802 +#    error "SIZEOF_UNSIGNED_LONG"
34803 +#  endif
34804 +#endif
34805 +
34806 +#if !defined(SIZEOF_SIZE_T)
34807 +#  define SIZEOF_SIZE_T             SIZEOF_UNSIGNED
34808 +#endif
34809 +#if !defined(SIZE_T_MAX)
34810 +#  define SIZE_T_MAX                LZO_UTYPE_MAX(SIZEOF_SIZE_T)
34811 +#endif
34812 +
34813 +#if 1 && defined(__LZO_i386) && (UINT_MAX == LZO_0xffffffffL)
34814 +#  if !defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX == 0xffff)
34815 +#    define LZO_UNALIGNED_OK_2
34816 +#  endif
34817 +#  if !defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX == LZO_0xffffffffL)
34818 +#    define LZO_UNALIGNED_OK_4
34819 +#  endif
34820 +#endif
34821 +
34822 +#if defined(LZO_UNALIGNED_OK_2) || defined(LZO_UNALIGNED_OK_4)
34823 +#  if !defined(LZO_UNALIGNED_OK)
34824 +#    define LZO_UNALIGNED_OK
34825 +#  endif
34826 +#endif
34827 +
34828 +#if defined(__LZO_NO_UNALIGNED)
34829 +#  undef LZO_UNALIGNED_OK
34830 +#  undef LZO_UNALIGNED_OK_2
34831 +#  undef LZO_UNALIGNED_OK_4
34832 +#endif
34833 +
34834 +#if defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX != 0xffff)
34835 +#  error "LZO_UNALIGNED_OK_2 must not be defined on this system"
34836 +#endif
34837 +#if defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
34838 +#  error "LZO_UNALIGNED_OK_4 must not be defined on this system"
34839 +#endif
34840 +
34841 +#if defined(__LZO_NO_ALIGNED)
34842 +#  undef LZO_ALIGNED_OK_4
34843 +#endif
34844 +
34845 +#if defined(LZO_ALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
34846 +#  error "LZO_ALIGNED_OK_4 must not be defined on this system"
34847 +#endif
34848 +
34849 +#define LZO_LITTLE_ENDIAN       1234
34850 +#define LZO_BIG_ENDIAN          4321
34851 +#define LZO_PDP_ENDIAN          3412
34852 +
34853 +#if !defined(LZO_BYTE_ORDER)
34854 +#  if defined(MFX_BYTE_ORDER)
34855 +#    define LZO_BYTE_ORDER      MFX_BYTE_ORDER
34856 +#  elif defined(__LZO_i386)
34857 +#    define LZO_BYTE_ORDER      LZO_LITTLE_ENDIAN
34858 +#  elif defined(BYTE_ORDER)
34859 +#    define LZO_BYTE_ORDER      BYTE_ORDER
34860 +#  elif defined(__BYTE_ORDER)
34861 +#    define LZO_BYTE_ORDER      __BYTE_ORDER
34862 +#  endif
34863 +#endif
34864 +
34865 +#if defined(LZO_BYTE_ORDER)
34866 +#  if (LZO_BYTE_ORDER != LZO_LITTLE_ENDIAN) && \
34867 +      (LZO_BYTE_ORDER != LZO_BIG_ENDIAN)
34868 +#    error "invalid LZO_BYTE_ORDER"
34869 +#  endif
34870 +#endif
34871 +
34872 +#if defined(LZO_UNALIGNED_OK) && !defined(LZO_BYTE_ORDER)
34873 +#  error "LZO_BYTE_ORDER is not defined"
34874 +#endif
34875 +
34876 +#define LZO_OPTIMIZE_GNUC_i386_IS_BUGGY /* defined unconditionally, so the block below never defines LZO_OPTIMIZE_GNUC_i386 */
34877 +
34878 +#if defined(NDEBUG) && !defined(LZO_DEBUG) && !defined(__LZO_CHECKER)
34879 +#  if defined(__GNUC__) && defined(__i386__)
34880 +#    if !defined(LZO_OPTIMIZE_GNUC_i386_IS_BUGGY)
34881 +#      define LZO_OPTIMIZE_GNUC_i386
34882 +#    endif
34883 +#  endif
34884 +#endif
34885 +
34886 +__LZO_EXTERN_C int __lzo_init_done;
34887 +__LZO_EXTERN_C const lzo_byte __lzo_copyright[];
34888 +LZO_EXTERN(const lzo_byte *) lzo_copyright(void);
34889 +__LZO_EXTERN_C const lzo_uint32 _lzo_crc32_table[256];
34890 +
34891 +#define _LZO_STRINGIZE(x)           #x
34892 +#define _LZO_MEXPAND(x)             _LZO_STRINGIZE(x)
34893 +
34894 +#define _LZO_CONCAT2(a,b)           a ## b
34895 +#define _LZO_CONCAT3(a,b,c)         a ## b ## c
34896 +#define _LZO_CONCAT4(a,b,c,d)       a ## b ## c ## d
34897 +#define _LZO_CONCAT5(a,b,c,d,e)     a ## b ## c ## d ## e
34898 +
34899 +#define _LZO_ECONCAT2(a,b)          _LZO_CONCAT2(a,b)
34900 +#define _LZO_ECONCAT3(a,b,c)        _LZO_CONCAT3(a,b,c)
34901 +#define _LZO_ECONCAT4(a,b,c,d)      _LZO_CONCAT4(a,b,c,d)
34902 +#define _LZO_ECONCAT5(a,b,c,d,e)    _LZO_CONCAT5(a,b,c,d,e)
34903 +
34904 +#if 0
34905 +
34906 +#define __LZO_IS_COMPRESS_QUERY(i,il,o,ol,w)    ((lzo_voidp)(o) == (w))
34907 +#define __LZO_QUERY_COMPRESS(i,il,o,ol,w,n,s) \
34908 +               (*ol = (n)*(s), LZO_E_OK)
34909 +
34910 +#define __LZO_IS_DECOMPRESS_QUERY(i,il,o,ol,w)  ((lzo_voidp)(o) == (w))
34911 +#define __LZO_QUERY_DECOMPRESS(i,il,o,ol,w,n,s) \
34912 +               (*ol = (n)*(s), LZO_E_OK)
34913 +
34914 +#define __LZO_IS_OPTIMIZE_QUERY(i,il,o,ol,w)    ((lzo_voidp)(o) == (w))
34915 +#define __LZO_QUERY_OPTIMIZE(i,il,o,ol,w,n,s) \
34916 +               (*ol = (n)*(s), LZO_E_OK)
34917 +
34918 +#endif
34919 +
34920 +#ifndef __LZO_PTR_H
34921 +#define __LZO_PTR_H
34922 +
34923 +#ifdef __cplusplus
34924 +extern "C" {
34925 +#endif
34926 +
34927 +#if defined(__LZO_DOS16) || defined(__LZO_WIN16)
34928 +#  include <dos.h>
34929 +#  if 1 && defined(__WATCOMC__)
34930 +#    include <i86.h>
34931 +     __LZO_EXTERN_C unsigned char _HShift;
34932 +#    define __LZO_HShift    _HShift
34933 +#  elif 1 && defined(_MSC_VER)
34934 +     __LZO_EXTERN_C unsigned short __near _AHSHIFT;
34935 +#    define __LZO_HShift    ((unsigned) &_AHSHIFT)
34936 +#  elif defined(__LZO_WIN16)
34937 +#    define __LZO_HShift    3
34938 +#  else
34939 +#    define __LZO_HShift    12
34940 +#  endif
34941 +#  if !defined(_FP_SEG) && defined(FP_SEG)
34942 +#    define _FP_SEG         FP_SEG
34943 +#  endif
34944 +#  if !defined(_FP_OFF) && defined(FP_OFF)
34945 +#    define _FP_OFF         FP_OFF
34946 +#  endif
34947 +#endif
34948 +
34949 +#if !defined(lzo_ptrdiff_t)
34950 +#  if (UINT_MAX >= LZO_0xffffffffL)
34951 +     typedef ptrdiff_t          lzo_ptrdiff_t;
34952 +#  else
34953 +     typedef long               lzo_ptrdiff_t;
34954 +#  endif
34955 +#endif
34956 +
34957 +#if !defined(__LZO_HAVE_PTR_T)
34958 +#  if defined(lzo_ptr_t)
34959 +#    define __LZO_HAVE_PTR_T
34960 +#  endif
34961 +#endif
34962 +#if !defined(__LZO_HAVE_PTR_T)
34963 +#  if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_LONG)
34964 +#    if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_LONG)
34965 +       typedef unsigned long    lzo_ptr_t;
34966 +       typedef long             lzo_sptr_t;
34967 +#      define __LZO_HAVE_PTR_T
34968 +#    endif
34969 +#  endif
34970 +#endif
34971 +#if !defined(__LZO_HAVE_PTR_T)
34972 +#  if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED)
34973 +#    if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED)
34974 +       typedef unsigned int     lzo_ptr_t;
34975 +       typedef int              lzo_sptr_t;
34976 +#      define __LZO_HAVE_PTR_T
34977 +#    endif
34978 +#  endif
34979 +#endif
34980 +#if !defined(__LZO_HAVE_PTR_T)
34981 +#  if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_SHORT)
34982 +#    if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_SHORT)
34983 +       typedef unsigned short   lzo_ptr_t;
34984 +       typedef short            lzo_sptr_t;
34985 +#      define __LZO_HAVE_PTR_T
34986 +#    endif
34987 +#  endif
34988 +#endif
34989 +#if !defined(__LZO_HAVE_PTR_T)
34990 +#  if defined(LZO_HAVE_CONFIG_H) || defined(SIZEOF_CHAR_P)
34991 +#    error "no suitable type for lzo_ptr_t"
34992 +#  else
34993 +     typedef unsigned long      lzo_ptr_t;
34994 +     typedef long               lzo_sptr_t;
34995 +#    define __LZO_HAVE_PTR_T
34996 +#  endif
34997 +#endif
34998 +
34999 +#if defined(__LZO_DOS16) || defined(__LZO_WIN16)
35000 +#define PTR(a)              ((lzo_bytep) (a))
35001 +#define PTR_ALIGNED_4(a)    ((_FP_OFF(a) & 3) == 0)
35002 +#define PTR_ALIGNED2_4(a,b) (((_FP_OFF(a) | _FP_OFF(b)) & 3) == 0)
35003 +#else
35004 +#define PTR(a)              ((lzo_ptr_t) (a))
35005 +#define PTR_LINEAR(a)       PTR(a)
35006 +#define PTR_ALIGNED_4(a)    ((PTR_LINEAR(a) & 3) == 0)
35007 +#define PTR_ALIGNED_8(a)    ((PTR_LINEAR(a) & 7) == 0)
35008 +#define PTR_ALIGNED2_4(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 3) == 0)
35009 +#define PTR_ALIGNED2_8(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 7) == 0)
35010 +#endif
35011 +
35012 +#define PTR_LT(a,b)         (PTR(a) < PTR(b))
35013 +#define PTR_GE(a,b)         (PTR(a) >= PTR(b))
35014 +#define PTR_DIFF(a,b)       ((lzo_ptrdiff_t) (PTR(a) - PTR(b)))
35015 +#define pd(a,b)             ((lzo_uint) ((a)-(b)))
35016 +
35017 +LZO_EXTERN(lzo_ptr_t)
35018 +__lzo_ptr_linear(const lzo_voidp ptr);
35019 +
35020 +typedef union
35021 +{
35022 +    char            a_char;
35023 +    unsigned char   a_uchar;
35024 +    short           a_short;
35025 +    unsigned short  a_ushort;
35026 +    int             a_int;
35027 +    unsigned int    a_uint;
35028 +    long            a_long;
35029 +    unsigned long   a_ulong;
35030 +    lzo_int         a_lzo_int;
35031 +    lzo_uint        a_lzo_uint;
35032 +    lzo_int32       a_lzo_int32;
35033 +    lzo_uint32      a_lzo_uint32;
35034 +    ptrdiff_t       a_ptrdiff_t;
35035 +    lzo_ptrdiff_t   a_lzo_ptrdiff_t;
35036 +    lzo_ptr_t       a_lzo_ptr_t;
35037 +    lzo_voidp       a_lzo_voidp;
35038 +    void *          a_void_p;
35039 +    lzo_bytep       a_lzo_bytep;
35040 +    lzo_bytepp      a_lzo_bytepp;
35041 +    lzo_uintp       a_lzo_uintp;
35042 +    lzo_uint *      a_lzo_uint_p;
35043 +    lzo_uint32p     a_lzo_uint32p;
35044 +    lzo_uint32 *    a_lzo_uint32_p;
35045 +    unsigned char * a_uchar_p;
35046 +    char *          a_char_p;
35047 +}
35048 +lzo_full_align_t;
35049 +
35050 +#ifdef __cplusplus
35051 +}
35052 +#endif
35053 +
35054 +#endif
35055 +
35056 +#define LZO_DETERMINISTIC
35057 +
35058 +#define LZO_DICT_USE_PTR
35059 +#if defined(__LZO_DOS16) || defined(__LZO_WIN16) || defined(__LZO_STRICT_16BIT)
35060 +#  undef LZO_DICT_USE_PTR
35061 +#endif
35062 +
35063 +#if defined(LZO_DICT_USE_PTR)
35064 +#  define lzo_dict_t    const lzo_bytep
35065 +#  define lzo_dict_p    lzo_dict_t __LZO_MMODEL *
35066 +#else
35067 +#  define lzo_dict_t    lzo_uint
35068 +#  define lzo_dict_p    lzo_dict_t __LZO_MMODEL *
35069 +#endif
35070 +
35071 +#if !defined(lzo_moff_t)
35072 +#define lzo_moff_t      lzo_uint
35073 +#endif
35074 +
35075 +#endif
35076 +
35077 +LZO_PUBLIC(lzo_ptr_t)
35078 +__lzo_ptr_linear(const lzo_voidp ptr)
35079 +{
35080 +    /* Express `ptr' as a plain integer address: segment shifted plus
35081 +     * offset on the 16-bit segmented targets, otherwise just the cast
35082 +     * performed by PTR_LINEAR(). */
35083 +
35084 +#if defined(__LZO_DOS16) || defined(__LZO_WIN16)
35085 +    return (((lzo_ptr_t)(_FP_SEG(ptr))) << (16 - __LZO_HShift)) + (_FP_OFF(ptr));
35086 +#else
35087 +    return PTR_LINEAR(ptr);
35088 +#endif
35089 +}
35090 +
35091 +LZO_PUBLIC(unsigned)
35092 +__lzo_align_gap(const lzo_voidp ptr, lzo_uint size)
35093 +{
35094 +    /*
35095 +     * Distance in bytes from `ptr' up to the next address that is a
35096 +     * multiple of `size'; zero when `ptr' already is one.  Rounding is
35097 +     * done by divide-and-multiply rather than by masking.
35098 +     */
35099 +    lzo_ptr_t p;    /* linear address of ptr */
35100 +    lzo_ptr_t s;    /* size - 1, the largest possible gap */
35101 +    lzo_ptr_t n;    /* resulting gap */
35102 +
35103 +    assert("lzo-01", size > 0);
35104 +
35105 +    p = __lzo_ptr_linear(ptr);
35106 +    s = (lzo_ptr_t) (size - 1);
35107 +    n = (((p + s) / size) * size) - p;
35108 +    assert("lzo-02", (long)n >= 0);
35109 +    assert("lzo-03", n <= s);
35110 +    return (unsigned)n;
35111 +}
35112 +
35113 +#ifndef __LZO_UTIL_H
35114 +#define __LZO_UTIL_H
35115 +
35116 +#ifndef __LZO_CONF_H
35117 +#endif
35118 +
35119 +#ifdef __cplusplus
35120 +extern "C" {
35121 +#endif
35122 +
35123 +#if 1 && defined(HAVE_MEMCPY)
35124 +#if !defined(__LZO_DOS16) && !defined(__LZO_WIN16)
35125 +
35126 +#define MEMCPY8_DS(dest,src,len) \
35127 +    memcpy(dest,src,len); \
35128 +    dest += len; \
35129 +    src += len
35130 +
35131 +#endif
35132 +#endif
35133 +
35134 +#if 0 && !defined(MEMCPY8_DS)
35135 +
35136 +#define MEMCPY8_DS(dest,src,len) \
35137 +    { do { \
35138 +       *dest++ = *src++; \
35139 +       *dest++ = *src++; \
35140 +       *dest++ = *src++; \
35141 +       *dest++ = *src++; \
35142 +       *dest++ = *src++; \
35143 +       *dest++ = *src++; \
35144 +       *dest++ = *src++; \
35145 +       *dest++ = *src++; \
35146 +       len -= 8; \
35147 +    } while (len > 0); }
35148 +
35149 +#endif
35150 +
35151 +#if !defined(MEMCPY8_DS) /* fallback: copy in unrolled groups of 8 bytes */
35152 +/* advances dest and src by (len)/8 * 8; needs len >= 8 (the body runs before the test and __l is an lzo_uint) */
35153 +#define MEMCPY8_DS(dest,src,len) \
35154 +    { register lzo_uint __l = (len) / 8; \
35155 +    do { \
35156 +       *dest++ = *src++; \
35157 +       *dest++ = *src++; \
35158 +       *dest++ = *src++; \
35159 +       *dest++ = *src++; \
35160 +       *dest++ = *src++; \
35161 +       *dest++ = *src++; \
35162 +       *dest++ = *src++; \
35163 +       *dest++ = *src++; \
35164 +    } while (--__l > 0); }
35165 +
35166 +#endif
35167 +/* forward byte-by-byte copy; modifies dest, src and len, and always copies at least one byte */
35168 +#define MEMCPY_DS(dest,src,len) \
35169 +    do *dest++ = *src++; \
35170 +    while (--len > 0)
35171 +/* identical expansion to MEMCPY_DS (also a forward copy) */
35172 +#define MEMMOVE_DS(dest,src,len) \
35173 +    do *dest++ = *src++; \
35174 +    while (--len > 0)
35175 +
35176 +#if 0 && defined(LZO_OPTIMIZE_GNUC_i386)
35177 +
35178 +#define BZERO8_PTR(s,l,n) \
35179 +__asm__ __volatile__( \
35180 +    "movl  %0,%%eax \n"             \
35181 +    "movl  %1,%%edi \n"             \
35182 +    "movl  %2,%%ecx \n"             \
35183 +    "cld \n"                        \
35184 +    "rep \n"                        \
35185 +    "stosl %%eax,(%%edi) \n"        \
35186 +    :               \
35187 +    :"g" (0),"g" (s),"g" (n)        \
35188 +    :"eax","edi","ecx", "memory", "cc" \
35189 +)
35190 +
35191 +#elif (LZO_UINT_MAX <= SIZE_T_MAX) && defined(HAVE_MEMSET)
35192 +
35193 +#if 1
35194 +#define BZERO8_PTR(s,l,n)   memset((s),0,(lzo_uint)(l)*(n))
35195 +#else
35196 +#define BZERO8_PTR(s,l,n)   memset((lzo_voidp)(s),0,(lzo_uint)(l)*(n))
35197 +#endif
35198 +
35199 +#else
35200 +
35201 +#define BZERO8_PTR(s,l,n) \
35202 +    lzo_memset((lzo_voidp)(s),0,(lzo_uint)(l)*(n))
35203 +
35204 +#endif
35205 +
35206 +#if 0
35207 +#if defined(__GNUC__) && defined(__i386__)
35208 +
35209 +unsigned char lzo_rotr8(unsigned char value, int shift); /* prototype for the inline definition that follows */
35210 +extern __inline__ unsigned char lzo_rotr8(unsigned char value, int shift)
35211 +{
35212 +    unsigned char result;
35213 +    /* Load value into the 8-bit result register, then rotate it right by the count held in %cl. */
35214 +    __asm__ __volatile__ ("movb %b1, %b0; rorb %b2, %b0"
35215 +                       : "=a"(result) : "g"(value), "c"(shift));
35216 +    return result;
35217 +}
35218 +
35219 +unsigned short lzo_rotr16(unsigned short value, int shift); /* prototype for the inline definition that follows */
35220 +extern __inline__ unsigned short lzo_rotr16(unsigned short value, int shift) /* rotate a 16-bit value right by shift bits */
35221 +{
35222 +    unsigned short result;
35223 +    /* movw/rorw need 16-bit register names (%w); %b would name 8-bit registers. The count stays in %cl. */
35224 +    __asm__ __volatile__ ("movw %w1, %w0; rorw %b2, %w0"
35225 +                       : "=a"(result) : "g"(value), "c"(shift));
35226 +    return result;
35227 +}
35228 +
35229 +#endif
35230 +#endif
35231 +
35232 +#ifdef __cplusplus
35233 +}
35234 +#endif
35235 +
35236 +#endif
35237 +
35238 +LZO_PUBLIC(lzo_bool)
35239 +lzo_assert(int expr)
35240 +{
35241 +    return expr != 0;   /* collapse any non-zero argument to exactly 1 */
35242 +}
35243 +
35244 +/* If you use the LZO library in a product, you *must* keep this
35245 + * copyright string in the executable of your product.
35246 + */
35247 +
35248 +const lzo_byte __lzo_copyright[] =
35249 +#if !defined(__LZO_IN_MINLZO) /* NOTE(review): spelled MINLZO here but __LZO_IN_MINILZO further down - confirm intent */
35250 +    LZO_VERSION_STRING; /* short form: the version string only */
35251 +#else /* long form: banner, build date and the list of active build options */
35252 +    "\n\n\n"
35253 +    "LZO real-time data compression library.\n"
35254 +    "Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer\n"
35255 +    "<markus.oberhumer@jk.uni-linz.ac.at>\n"
35256 +    "http://www.oberhumer.com/opensource/lzo/\n"
35257 +    "\n"
35258 +    "LZO version: v" LZO_VERSION_STRING ", " LZO_VERSION_DATE "\n"
35259 +    "LZO build date: " __DATE__ " " __TIME__ "\n\n"
35260 +    "LZO special compilation options:\n"
35261 +#ifdef __cplusplus
35262 +    " __cplusplus\n"
35263 +#endif
35264 +#if defined(__PIC__)
35265 +    " __PIC__\n"
35266 +#elif defined(__pic__)
35267 +    " __pic__\n"
35268 +#endif
35269 +#if (UINT_MAX < LZO_0xffffffffL)
35270 +    " 16BIT\n"
35271 +#endif
35272 +#if defined(__LZO_STRICT_16BIT)
35273 +    " __LZO_STRICT_16BIT\n"
35274 +#endif
35275 +#if (UINT_MAX > LZO_0xffffffffL)
35276 +    " UINT_MAX=" _LZO_MEXPAND(UINT_MAX) "\n"
35277 +#endif
35278 +#if (ULONG_MAX > LZO_0xffffffffL)
35279 +    " ULONG_MAX=" _LZO_MEXPAND(ULONG_MAX) "\n"
35280 +#endif
35281 +#if defined(LZO_BYTE_ORDER)
35282 +    " LZO_BYTE_ORDER=" _LZO_MEXPAND(LZO_BYTE_ORDER) "\n"
35283 +#endif
35284 +#if defined(LZO_UNALIGNED_OK_2)
35285 +    " LZO_UNALIGNED_OK_2\n"
35286 +#endif
35287 +#if defined(LZO_UNALIGNED_OK_4)
35288 +    " LZO_UNALIGNED_OK_4\n"
35289 +#endif
35290 +#if defined(LZO_ALIGNED_OK_4)
35291 +    " LZO_ALIGNED_OK_4\n"
35292 +#endif
35293 +#if defined(LZO_DICT_USE_PTR)
35294 +    " LZO_DICT_USE_PTR\n"
35295 +#endif
35296 +#if defined(__LZO_QUERY_COMPRESS)
35297 +    " __LZO_QUERY_COMPRESS\n"
35298 +#endif
35299 +#if defined(__LZO_QUERY_DECOMPRESS)
35300 +    " __LZO_QUERY_DECOMPRESS\n"
35301 +#endif
35302 +#if defined(__LZO_IN_MINILZO)
35303 +    " __LZO_IN_MINILZO\n"
35304 +#endif
35305 +    "\n\n"
35306 +    "$Id: LZO " LZO_VERSION_STRING " built " __DATE__ " " __TIME__
35307 +#if defined(__GNUC__) && defined(__VERSION__) /* append the compiler name when one of these is recognised */
35308 +    " by gcc " __VERSION__
35309 +#elif defined(__BORLANDC__)
35310 +    " by Borland C " _LZO_MEXPAND(__BORLANDC__)
35311 +#elif defined(_MSC_VER)
35312 +    " by Microsoft C " _LZO_MEXPAND(_MSC_VER)
35313 +#elif defined(__PUREC__)
35314 +    " by Pure C " _LZO_MEXPAND(__PUREC__)
35315 +#elif defined(__SC__)
35316 +    " by Symantec C " _LZO_MEXPAND(__SC__)
35317 +#elif defined(__TURBOC__)
35318 +    " by Turbo C " _LZO_MEXPAND(__TURBOC__)
35319 +#elif defined(__WATCOMC__)
35320 +    " by Watcom C " _LZO_MEXPAND(__WATCOMC__)
35321 +#endif
35322 +    " $\n"
35323 +    "$Copyright: LZO (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer $\n";
35324 +#endif
35325 +
35326 +LZO_PUBLIC(const lzo_byte *)
35327 +lzo_copyright(void)
35328 +{
35329 +    return __lzo_copyright; /* address of the file-scope copyright array */
35330 +}
35331 +
35332 +LZO_PUBLIC(unsigned)
35333 +lzo_version(void)
35334 +{
35335 +    return LZO_VERSION; /* value of the LZO_VERSION macro this file was built with */
35336 +}
35337 +
35338 +LZO_PUBLIC(const char *)
35339 +lzo_version_string(void)
35340 +{
35341 +    return LZO_VERSION_STRING; /* the LZO_VERSION_STRING literal compiled into this file */
35342 +}
35343 +
35344 +LZO_PUBLIC(const char *)
35345 +lzo_version_date(void)
35346 +{
35347 +    return LZO_VERSION_DATE; /* the LZO_VERSION_DATE literal compiled into this file */
35348 +}
35349 +
35350 +LZO_PUBLIC(const lzo_charp)
35351 +_lzo_version_string(void)
35352 +{
35353 +    return LZO_VERSION_STRING; /* same literal as lzo_version_string(), returned as const lzo_charp */
35354 +}
35355 +
35356 +LZO_PUBLIC(const lzo_charp)
35357 +_lzo_version_date(void)
35358 +{
35359 +    return LZO_VERSION_DATE; /* same literal as lzo_version_date(), returned as const lzo_charp */
35360 +}
35361 +
35362 +#define LZO_BASE 65521u /* modulus applied to both running sums in lzo_adler32() */
35363 +#define LZO_NMAX 5552 /* most bytes lzo_adler32() sums between two modulo reductions */
35364 +
35365 +#define LZO_DO1(buf,i)  {s1 += buf[i]; s2 += s1;} /* fold one byte; relies on s1 and s2 in the enclosing scope */
35366 +#define LZO_DO2(buf,i)  LZO_DO1(buf,i); LZO_DO1(buf,i+1); /* each wider step expands to two of the previous one */
35367 +#define LZO_DO4(buf,i)  LZO_DO2(buf,i); LZO_DO2(buf,i+2);
35368 +#define LZO_DO8(buf,i)  LZO_DO4(buf,i); LZO_DO4(buf,i+4);
35369 +#define LZO_DO16(buf,i) LZO_DO8(buf,i); LZO_DO8(buf,i+8); /* folds the 16 bytes buf[i] .. buf[i+15] */
35370 +
35371 +LZO_PUBLIC(lzo_uint32)
35372 +lzo_adler32(lzo_uint32 adler, const lzo_byte *buf, lzo_uint len)
35373 +{
35374 +    lzo_uint32 s1 = adler & 0xffff;         /* low 16 bits: running sum of the bytes */
35375 +    lzo_uint32 s2 = (adler >> 16) & 0xffff; /* high 16 bits: running sum of s1 */
35376 +    int chunk;
35377 +
35378 +    if (buf == NULL)
35379 +       return 1;
35380 +
35381 +    while (len > 0)
35382 +    {
35383 +       /* Sum at most LZO_NMAX bytes, 16 at a time then singly, before reducing. */
35384 +       chunk = len < LZO_NMAX ? (int) len : LZO_NMAX;
35385 +       len -= chunk;
35386 +       for (; chunk >= 16; chunk -= 16)
35387 +       {
35388 +           LZO_DO16(buf,0);
35389 +           buf += 16;
35390 +       }
35391 +       for (; chunk > 0; chunk--)
35392 +       {
35393 +           s1 += *buf++;
35394 +           s2 += s1;
35395 +       }
35396 +       s1 %= LZO_BASE;
35397 +       s2 %= LZO_BASE;
35398 +    }
35399 +    return (s2 << 16) | s1;
35400 +}
35401 +
35402 +LZO_PUBLIC(int)
35403 +lzo_memcmp(const lzo_voidp s1, const lzo_voidp s2, lzo_uint len)
35404 +{
35405 +#if (LZO_UINT_MAX <= SIZE_T_MAX) && defined(HAVE_MEMCMP)
35406 +    return memcmp(s1,s2,len);
35407 +#else
35408 +    /* Fallback: byte-wise comparison, returning the first non-zero difference. */
35409 +    const lzo_byte *lhs = (const lzo_byte *) s1;
35410 +    const lzo_byte *rhs = (const lzo_byte *) s2;
35411 +    lzo_uint i;
35412 +
35413 +    for (i = 0; i < len; i++)
35414 +    {
35415 +       const int diff = lhs[i] - rhs[i];
35416 +
35417 +       if (diff != 0)
35418 +           return diff;
35419 +    }
35420 +
35421 +    return 0;
35422 +#endif
35423 +}
35424 +
35425 +LZO_PUBLIC(lzo_voidp)
35426 +lzo_memcpy(lzo_voidp dest, const lzo_voidp src, lzo_uint len)
35427 +{
35428 +#if (LZO_UINT_MAX <= SIZE_T_MAX) && defined(HAVE_MEMCPY)
35429 +    return memcpy(dest,src,len);
35430 +#else
35431 +    lzo_byte *to = (lzo_byte *) dest;
35432 +    const lzo_byte *from = (const lzo_byte *) src;
35433 +    lzo_uint left;
35434 +
35435 +    /* Fallback: ascending byte copy; skipped when both pointers are equal. */
35436 +    if (to != from)
35437 +       for (left = len; left > 0; left--)
35438 +           *to++ = *from++;
35439 +    return dest;
35440 +#endif
35441 +}
35442 +
35443 +LZO_PUBLIC(lzo_voidp)
35444 +lzo_memmove(lzo_voidp dest, const lzo_voidp src, lzo_uint len)
35445 +{
35446 +#if (LZO_UINT_MAX <= SIZE_T_MAX) && defined(HAVE_MEMMOVE)
35447 +    return memmove(dest,src,len);
35448 +#else
35449 +    lzo_byte *to = (lzo_byte *) dest;
35450 +    const lzo_byte *from = (const lzo_byte *) src;
35451 +
35452 +    /* Empty range or identical pointers: nothing has to move. */
35453 +    if (len <= 0 || to == from)
35454 +       return dest;
35455 +
35456 +    if (to < from)
35457 +    {
35458 +       /* Destination lies below the source: copy in ascending order. */
35459 +       for (; len > 0; len--)
35460 +           *to++ = *from++;
35461 +       return dest;
35462 +    }
35463 +
35464 +    /* Destination lies above the source: copy backwards from the end. */
35465 +    to += len;
35466 +    from += len;
35467 +    for (; len > 0; len--)
35468 +       *--to = *--from;
35469 +    return dest;
35470 +#endif
35471 +}
35472 +
35473 +LZO_PUBLIC(lzo_voidp)
35474 +lzo_memset(lzo_voidp s, int c, lzo_uint len)
35475 +{
35476 +#if (LZO_UINT_MAX <= SIZE_T_MAX) && defined(HAVE_MEMSET)
35477 +    return memset(s,c,len);
35478 +#else
35479 +    lzo_byte *cursor = (lzo_byte *) s;
35480 +
35481 +    /* Fallback: store LZO_BYTE(c) into each of the len bytes in turn. */
35482 +    for (; len > 0; len--)
35483 +       *cursor++ = LZO_BYTE(c);
35484 +    return s;
35485 +#endif
35486 +}
35487 +
35488 +#if 0 /* disabled alternative that casts a value with only the top bit set */
35489 +#  define IS_SIGNED(type)       (((type) (1ul << (8 * sizeof(type) - 1))) < 0)
35490 +#  define IS_UNSIGNED(type)     (((type) (1ul << (8 * sizeof(type) - 1))) > 0)
35491 +#else
35492 +#  define IS_SIGNED(type)       (((type) (-1)) < ((type) 0)) /* -1 converted to type stays below 0 only if type is signed */
35493 +#  define IS_UNSIGNED(type)     (((type) (-1)) > ((type) 0)) /* -1 converted to type lands above 0 only if type is unsigned */
35494 +#endif
35495 +
35496 +#define IS_POWER_OF_2(x)        (((x) & ((x) - 1)) == 0) /* note: also true for x == 0 */
35497 +
35498 +static lzo_bool schedule_insns_bug(void);
35499 +static lzo_bool strength_reduce_bug(int *);
35500 +
35501 +#if 0 || defined(LZO_DEBUG) /* reporting variant, compiled only when LZO_DEBUG is defined */
35502 +#include <stdio.h>
35503 +static lzo_bool __lzo_assert_fail(const char *s, unsigned line) /* print the failed expression text and line; always returns 0 */
35504 +{
35505 +#if defined(__palmos__)
35506 +    printf("LZO assertion failed in line %u: '%s'\n",line,s);
35507 +#else
35508 +    fprintf(stderr,"LZO assertion failed in line %u: '%s'\n",line,s);
35509 +#endif
35510 +    return 0;
35511 +}
35512 +#  define __lzo_assert(x)   ((x) ? 1 : __lzo_assert_fail(#x,__LINE__)) /* 1 when x holds, otherwise report and yield 0 */
35513 +#else
35514 +#  define __lzo_assert(x)   ((x) ? 1 : 0) /* silent variant: reduce x to 1 or 0 */
35515 +#endif
35516 +
35517 +#undef COMPILE_TIME_ASSERT
35518 +#if 0 /* disabled: would evaluate each check at run time and fold it into a local r */
35519 +#  define COMPILE_TIME_ASSERT(expr)     r &= __lzo_assert(expr)
35520 +#else
35521 +#  define COMPILE_TIME_ASSERT(expr)     LZO_COMPILE_TIME_ASSERT(expr) /* active: forward to LZO_COMPILE_TIME_ASSERT (defined elsewhere) */
35522 +#endif
35523 +
35524 +static lzo_bool basic_integral_check(void) /* integer size, signedness and limit checks; returns 1 when all hold */
35525 +{
35526 +    lzo_bool r = 1;
35527 +
35528 +    COMPILE_TIME_ASSERT(CHAR_BIT == 8);
35529 +    COMPILE_TIME_ASSERT(sizeof(char) == 1);
35530 +    COMPILE_TIME_ASSERT(sizeof(short) >= 2);
35531 +    COMPILE_TIME_ASSERT(sizeof(long) >= 4);
35532 +    COMPILE_TIME_ASSERT(sizeof(int) >= sizeof(short));
35533 +    COMPILE_TIME_ASSERT(sizeof(long) >= sizeof(int));
35534 +
35535 +    COMPILE_TIME_ASSERT(sizeof(lzo_uint) == sizeof(lzo_int));
35536 +    COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == sizeof(lzo_int32));
35537 +
35538 +    COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= 4);
35539 +    COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= sizeof(unsigned));
35540 +#if defined(__LZO_STRICT_16BIT)
35541 +    COMPILE_TIME_ASSERT(sizeof(lzo_uint) == 2);
35542 +#else
35543 +    COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= 4);
35544 +    COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= sizeof(unsigned));
35545 +#endif
35546 +
35547 +#if (USHRT_MAX == 65535u) /* cross-check sizeof(short) against the range reported by USHRT_MAX */
35548 +    COMPILE_TIME_ASSERT(sizeof(short) == 2);
35549 +#elif (USHRT_MAX == LZO_0xffffffffL)
35550 +    COMPILE_TIME_ASSERT(sizeof(short) == 4);
35551 +#elif (USHRT_MAX >= LZO_0xffffffffL)
35552 +    COMPILE_TIME_ASSERT(sizeof(short) > 4);
35553 +#endif
35554 +#if 0 /* to make gcc happy -edward */
35555 +#if (UINT_MAX == 65535u)
35556 +    COMPILE_TIME_ASSERT(sizeof(int) == 2);
35557 +#elif (UINT_MAX == LZO_0xffffffffL)
35558 +    COMPILE_TIME_ASSERT(sizeof(int) == 4);
35559 +#elif (UINT_MAX >= LZO_0xffffffffL)
35560 +    COMPILE_TIME_ASSERT(sizeof(int) > 4);
35561 +#endif
35562 +#if (ULONG_MAX == 65535ul)
35563 +    COMPILE_TIME_ASSERT(sizeof(long) == 2);
35564 +#elif (ULONG_MAX == LZO_0xffffffffL)
35565 +    COMPILE_TIME_ASSERT(sizeof(long) == 4);
35566 +#elif (ULONG_MAX >= LZO_0xffffffffL)
35567 +    COMPILE_TIME_ASSERT(sizeof(long) > 4);
35568 +#endif
35569 +#if defined(SIZEOF_UNSIGNED)
35570 +    COMPILE_TIME_ASSERT(SIZEOF_UNSIGNED == sizeof(unsigned));
35571 +#endif
35572 +#if defined(SIZEOF_UNSIGNED_LONG)
35573 +    COMPILE_TIME_ASSERT(SIZEOF_UNSIGNED_LONG == sizeof(unsigned long));
35574 +#endif
35575 +#if defined(SIZEOF_UNSIGNED_SHORT)
35576 +    COMPILE_TIME_ASSERT(SIZEOF_UNSIGNED_SHORT == sizeof(unsigned short));
35577 +#endif
35578 +#if !defined(__LZO_IN_MINILZO)
35579 +#if defined(SIZEOF_SIZE_T)
35580 +    COMPILE_TIME_ASSERT(SIZEOF_SIZE_T == sizeof(size_t));
35581 +#endif
35582 +#endif
35583 +#endif /* -edward */
35584 +
35585 +    COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned char));
35586 +    COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned short));
35587 +    COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned));
35588 +    COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned long));
35589 +    COMPILE_TIME_ASSERT(IS_SIGNED(short));
35590 +    COMPILE_TIME_ASSERT(IS_SIGNED(int));
35591 +    COMPILE_TIME_ASSERT(IS_SIGNED(long));
35592 +
35593 +    COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint32));
35594 +    COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint));
35595 +    COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int32));
35596 +    COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int));
35597 +
35598 +    COMPILE_TIME_ASSERT(INT_MAX    == LZO_STYPE_MAX(sizeof(int)));
35599 +    COMPILE_TIME_ASSERT(UINT_MAX   == LZO_UTYPE_MAX(sizeof(unsigned)));
35600 +    COMPILE_TIME_ASSERT(LONG_MAX   == LZO_STYPE_MAX(sizeof(long)));
35601 +    COMPILE_TIME_ASSERT(ULONG_MAX  == LZO_UTYPE_MAX(sizeof(unsigned long)));
35602 +    //    COMPILE_TIME_ASSERT(SHRT_MAX   == LZO_STYPE_MAX(sizeof(short))); /* edward */
35603 +    COMPILE_TIME_ASSERT(USHRT_MAX  == LZO_UTYPE_MAX(sizeof(unsigned short)));
35604 +    COMPILE_TIME_ASSERT(LZO_UINT32_MAX == LZO_UTYPE_MAX(sizeof(lzo_uint32)));
35605 +    COMPILE_TIME_ASSERT(LZO_UINT_MAX   == LZO_UTYPE_MAX(sizeof(lzo_uint)));
35606 +#if !defined(__LZO_IN_MINILZO)
35607 +    COMPILE_TIME_ASSERT(SIZE_T_MAX     == LZO_UTYPE_MAX(sizeof(size_t)));
35608 +#endif
35609 +
35610 +    r &= __lzo_assert(LZO_BYTE(257) == 1); /* the one run-time check here: LZO_BYTE must reduce 257 to 1 */
35611 +
35612 +    return r;
35613 +}
35614 +
35615 +static lzo_bool basic_ptr_check(void) /* compile-time checks on pointer and ptrdiff sizes and signedness */
35616 +{
35617 +    lzo_bool r = 1; /* no visible statement below assigns r again, so 1 is what gets returned */
35618 +
35619 +    COMPILE_TIME_ASSERT(sizeof(char *) >= sizeof(int));
35620 +    COMPILE_TIME_ASSERT(sizeof(lzo_byte *) >= sizeof(char *));
35621 +
35622 +    COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_byte *));
35623 +    COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_voidpp));
35624 +    COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_bytepp));
35625 +    COMPILE_TIME_ASSERT(sizeof(lzo_voidp) >= sizeof(lzo_uint));
35626 +
35627 +    COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_voidp));
35628 +    COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_sptr_t));
35629 +    COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) >= sizeof(lzo_uint));
35630 +
35631 +    COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= 4);
35632 +    COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(ptrdiff_t));
35633 +
35634 +    COMPILE_TIME_ASSERT(sizeof(ptrdiff_t) >= sizeof(size_t));
35635 +    COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(lzo_uint));
35636 +
35637 +#if defined(SIZEOF_CHAR_P) /* cross-check configure-style size macros when they are defined */
35638 +    COMPILE_TIME_ASSERT(SIZEOF_CHAR_P == sizeof(char *));
35639 +#endif
35640 +#if defined(SIZEOF_PTRDIFF_T)
35641 +    COMPILE_TIME_ASSERT(SIZEOF_PTRDIFF_T == sizeof(ptrdiff_t));
35642 +#endif
35643 +
35644 +    COMPILE_TIME_ASSERT(IS_SIGNED(ptrdiff_t));
35645 +    COMPILE_TIME_ASSERT(IS_UNSIGNED(size_t));
35646 +    COMPILE_TIME_ASSERT(IS_SIGNED(lzo_ptrdiff_t));
35647 +    COMPILE_TIME_ASSERT(IS_SIGNED(lzo_sptr_t));
35648 +    COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_ptr_t));
35649 +    COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_moff_t));
35650 +
35651 +    return r;
35652 +}
35653 +
35654 +static lzo_bool ptr_check(void) /* run-time checks of alignment helpers, zero/all-ones patterns and BZERO8_PTR; 1 if all pass */
35655 +{
35656 +    lzo_bool r = 1;
35657 +    int i;
35658 +    char _wrkmem[10 * sizeof(lzo_byte *) + sizeof(lzo_full_align_t)]; /* room for 10 pointers plus alignment slack */
35659 +    lzo_bytep wrkmem;
35660 +    lzo_bytepp dict;
35661 +    unsigned char x[4 * sizeof(lzo_full_align_t)];
35662 +    long d;
35663 +    lzo_full_align_t a;
35664 +    lzo_full_align_t u;
35665 +
35666 +    for (i = 0; i < (int) sizeof(x); i++)
35667 +       x[i] = LZO_BYTE(i);
35668 +
35669 +    wrkmem = LZO_PTR_ALIGN_UP((lzo_byte *)_wrkmem,sizeof(lzo_full_align_t)); /* the resulting offset is verified just below */
35670 +
35671 +#if 0
35672 +    dict = (lzo_bytepp) wrkmem;
35673 +#else
35674 +
35675 +    u.a_lzo_bytep = wrkmem; dict = u.a_lzo_bytepp; /* convert through the union instead of a pointer cast */
35676 +#endif
35677 +
35678 +    d = (long) ((const lzo_bytep) dict - (const lzo_bytep) _wrkmem);
35679 +    r &= __lzo_assert(d >= 0);
35680 +    r &= __lzo_assert(d < (long) sizeof(lzo_full_align_t));
35681 +
35682 +    memset(&a,0,sizeof(a)); /* all-zero bytes must read back as a null pointer */
35683 +    r &= __lzo_assert(a.a_lzo_voidp == NULL);
35684 +
35685 +    memset(&a,0xff,sizeof(a)); /* all-ones bytes must read back as each unsigned type's maximum */
35686 +    r &= __lzo_assert(a.a_ushort == USHRT_MAX);
35687 +    r &= __lzo_assert(a.a_uint == UINT_MAX);
35688 +    r &= __lzo_assert(a.a_ulong == ULONG_MAX);
35689 +    r &= __lzo_assert(a.a_lzo_uint == LZO_UINT_MAX);
35690 +    r &= __lzo_assert(a.a_lzo_uint32 == LZO_UINT32_MAX);
35691 +
35692 +    if (r == 1)
35693 +    {
35694 +       for (i = 0; i < 8; i++) /* pointer-array indexing must agree with manual byte offsets */
35695 +           r &= __lzo_assert((const lzo_voidp) (&dict[i]) == (const lzo_voidp) (&wrkmem[i * sizeof(lzo_byte *)]));
35696 +    }
35697 +
35698 +    memset(&a,0,sizeof(a));
35699 +    r &= __lzo_assert(a.a_char_p == NULL);
35700 +    r &= __lzo_assert(a.a_lzo_bytep == NULL);
35701 +    r &= __lzo_assert(NULL == (void *)0);
35702 +    if (r == 1)
35703 +    {
35704 +       for (i = 0; i < 10; i++)
35705 +           dict[i] = wrkmem;
35706 +       BZERO8_PTR(dict+1,sizeof(dict[0]),8); /* must clear dict[1..8] and leave dict[0] and dict[9] untouched */
35707 +       r &= __lzo_assert(dict[0] == wrkmem);
35708 +       for (i = 1; i < 9; i++)
35709 +           r &= __lzo_assert(dict[i] == NULL);
35710 +       r &= __lzo_assert(dict[9] == wrkmem);
35711 +    }
35712 +
35713 +    if (r == 1)
35714 +    {
35715 +       unsigned k = 1;
35716 +       const unsigned n = (unsigned) sizeof(lzo_uint32);
35717 +       lzo_byte *p0;
35718 +       lzo_byte *p1;
35719 +
35720 +       k += __lzo_align_gap(&x[k],n); /* move k from offset 1 to the next n-aligned position in x */
35721 +       p0 = (lzo_bytep) &x[k];
35722 +#if defined(PTR_LINEAR)
35723 +       r &= __lzo_assert((PTR_LINEAR(p0) & (n-1)) == 0);
35724 +#else
35725 +       r &= __lzo_assert(n == 4);
35726 +       r &= __lzo_assert(PTR_ALIGNED_4(p0));
35727 +#endif
35728 +
35729 +       r &= __lzo_assert(k >= 1); /* together with k < 1+n below: the gap was in the range 0..n-1 */
35730 +       p1 = (lzo_bytep) &x[1];
35731 +       r &= __lzo_assert(PTR_GE(p0,p1));
35732 +
35733 +       r &= __lzo_assert(k < 1+n);
35734 +       p1 = (lzo_bytep) &x[1+n];
35735 +       r &= __lzo_assert(PTR_LT(p0,p1));
35736 +
35737 +       if (r == 1)
35738 +       {
35739 +           lzo_uint32 v0, v1;
35740 +#if 0
35741 +           v0 = * (lzo_uint32 *) &x[k];
35742 +           v1 = * (lzo_uint32 *) &x[k+n];
35743 +#else
35744 +
35745 +           u.a_uchar_p = &x[k]; /* read two lzo_uint32 values from x through the union's pointer members */
35746 +           v0 = *u.a_lzo_uint32_p;
35747 +           u.a_uchar_p = &x[k+n];
35748 +           v1 = *u.a_lzo_uint32_p;
35749 +#endif
35750 +           r &= __lzo_assert(v0 > 0);
35751 +           r &= __lzo_assert(v1 > 0);
35752 +       }
35753 +    }
35754 +
35755 +    return r;
35756 +}
35757 +
35758 +LZO_PUBLIC(int)
35759 +_lzo_config_check(void) /* run all configuration self-tests; LZO_E_OK if every check passes, else LZO_E_ERROR */
35760 +{
35761 +    lzo_bool r = 1;
35762 +    int i;
35763 +    union {
35764 +       lzo_uint32 a;
35765 +       unsigned short b;
35766 +       lzo_uint32 aa[4];
35767 +       unsigned char x[4*sizeof(lzo_full_align_t)];
35768 +    } u;
35769 +
35770 +    COMPILE_TIME_ASSERT( (int) ((unsigned char) ((signed char) -1)) == 255);
35771 +    COMPILE_TIME_ASSERT( (((unsigned char)128) << (int)(8*sizeof(int)-8)) < 0);
35772 +
35773 +#if 0
35774 +    r &= __lzo_assert((const void *)&u == (const void *)&u.a);
35775 +    r &= __lzo_assert((const void *)&u == (const void *)&u.b);
35776 +    r &= __lzo_assert((const void *)&u == (const void *)&u.x[0]);
35777 +    r &= __lzo_assert((const void *)&u == (const void *)&u.aa[0]);
35778 +#endif
35779 +
35780 +    r &= basic_integral_check();
35781 +    r &= basic_ptr_check();
35782 +    if (r != 1)
35783 +       return LZO_E_ERROR;
35784 +
35785 +    u.a = 0; u.b = 0;
35786 +    for (i = 0; i < (int) sizeof(u.x); i++) /* fill the union with the byte pattern 0, 1, 2, ... */
35787 +       u.x[i] = LZO_BYTE(i);
35788 +
35789 +#if defined(LZO_BYTE_ORDER)
35790 +    if (r == 1) /* the declared byte order must match how that pattern reads back as integers */
35791 +    {
35792 +#  if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
35793 +       lzo_uint32 a = (lzo_uint32) (u.a & LZO_0xffffffffL);
35794 +       unsigned short b = (unsigned short) (u.b & 0xffff);
35795 +       r &= __lzo_assert(a == 0x03020100L);
35796 +       r &= __lzo_assert(b == 0x0100);
35797 +#  elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
35798 +       lzo_uint32 a = u.a >> (8 * sizeof(u.a) - 32);
35799 +       unsigned short b = u.b >> (8 * sizeof(u.b) - 16);
35800 +       r &= __lzo_assert(a == 0x00010203L);
35801 +       r &= __lzo_assert(b == 0x0001);
35802 +#  else
35803 +#    error "invalid LZO_BYTE_ORDER"
35804 +#  endif
35805 +    }
35806 +#endif
35807 +
35808 +#if defined(LZO_UNALIGNED_OK_2)
35809 +    COMPILE_TIME_ASSERT(sizeof(short) == 2);
35810 +    if (r == 1) /* 16-bit loads at byte offsets 0..3 must return the expected byte pairs */
35811 +    {
35812 +       unsigned short b[4];
35813 +
35814 +       for (i = 0; i < 4; i++)
35815 +           b[i] = * (const unsigned short *) &u.x[i];
35816 +
35817 +#  if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
35818 +       r &= __lzo_assert(b[0] == 0x0100);
35819 +       r &= __lzo_assert(b[1] == 0x0201);
35820 +       r &= __lzo_assert(b[2] == 0x0302);
35821 +       r &= __lzo_assert(b[3] == 0x0403);
35822 +#  elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
35823 +       r &= __lzo_assert(b[0] == 0x0001);
35824 +       r &= __lzo_assert(b[1] == 0x0102);
35825 +       r &= __lzo_assert(b[2] == 0x0203);
35826 +       r &= __lzo_assert(b[3] == 0x0304);
35827 +#  endif
35828 +    }
35829 +#endif
35830 +
35831 +#if defined(LZO_UNALIGNED_OK_4)
35832 +    COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
35833 +    if (r == 1) /* 32-bit loads at byte offsets 0..3 must return the expected byte quads */
35834 +    {
35835 +       lzo_uint32 a[4];
35836 +
35837 +       for (i = 0; i < 4; i++)
35838 +           a[i] = * (const lzo_uint32 *) &u.x[i];
35839 +
35840 +#  if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
35841 +       r &= __lzo_assert(a[0] == 0x03020100L);
35842 +       r &= __lzo_assert(a[1] == 0x04030201L);
35843 +       r &= __lzo_assert(a[2] == 0x05040302L);
35844 +       r &= __lzo_assert(a[3] == 0x06050403L);
35845 +#  elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
35846 +       r &= __lzo_assert(a[0] == 0x00010203L);
35847 +       r &= __lzo_assert(a[1] == 0x01020304L);
35848 +       r &= __lzo_assert(a[2] == 0x02030405L);
35849 +       r &= __lzo_assert(a[3] == 0x03040506L);
35850 +#  endif
35851 +    }
35852 +#endif
35853 +
35854 +#if defined(LZO_ALIGNED_OK_4)
35855 +    COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
35856 +#endif
35857 +
35858 +    COMPILE_TIME_ASSERT(lzo_sizeof_dict_t == sizeof(lzo_dict_t));
35859 +
35860 +#if defined(__LZO_IN_MINLZO) /* NOTE(review): spelled MINLZO, unlike __LZO_IN_MINILZO used elsewhere - confirm intent */
35861 +    if (r == 1)
35862 +    {
35863 +       lzo_uint32 adler;
35864 +       adler = lzo_adler32(0, NULL, 0);
35865 +       adler = lzo_adler32(adler, lzo_copyright(), 200); /* checksum of the first 200 bytes of the copyright text */
35866 +       r &= __lzo_assert(adler == 0xc76f1751L);
35867 +    }
35868 +#endif
35869 +
35870 +    if (r == 1)
35871 +    {
35872 +       r &= __lzo_assert(!schedule_insns_bug());
35873 +    }
35874 +
35875 +    if (r == 1)
35876 +    {
35877 +       static int x[3];
35878 +       static unsigned xn = 3;
35879 +       register unsigned j;
35880 +
35881 +       for (j = 0; j < xn; j++) /* fills x with -3, -2, -1; strength_reduce_bug() verifies exactly that */
35882 +           x[j] = (int)j - 3;
35883 +       r &= __lzo_assert(!strength_reduce_bug(x));
35884 +    }
35885 +
35886 +    if (r == 1)
35887 +    {
35888 +       r &= ptr_check();
35889 +    }
35890 +
35891 +    return r == 1 ? LZO_E_OK : LZO_E_ERROR;
35892 +}
35893 +
35894 +static lzo_bool schedule_insns_bug(void) /* returns 1 if clone[0], read through q, comes back as 0 */
35895 +{
35896 +#if defined(__LZO_CHECKER)
35897 +    return 0; /* probe skipped when __LZO_CHECKER is defined */
35898 +#else
35899 +    const int clone[] = {1, 2, 0};
35900 +    const int *q;
35901 +    q = clone;
35902 +    return (*q) ? 0 : 1; /* clone[0] is 1, so correctly compiled code returns 0 */
35903 +#endif
35904 +}
35905 +
35906 +static lzo_bool strength_reduce_bug(int *x)
35907 +{
35908 +    return !(x[0] == -3 && x[1] == -2 && x[2] == -1);   /* 0 only when x holds exactly {-3, -2, -1} */
35909 +}
35910 +
35911 +#undef COMPILE_TIME_ASSERT
35912 +
35913 +int __lzo_init_done = 0; /* set to 1 by __lzo_init2() at the start of every call */
35914 +
35915 +LZO_PUBLIC(int)
35916 +__lzo_init2(unsigned v, int s1, int s2, int s3, int s4, int s5,
35917 +                       int s6, int s7, int s8, int s9)
35918 +{
35919 +    /*
35920 +     * Compare the caller's idea of the basic type sizes (s1..s9) with this
35921 +     * build, then run the configuration self-test.  Passing -1 for a size
35922 +     * skips that comparison.  Returns LZO_E_ERROR when v is 0 or a size
35923 +     * differs, otherwise whatever _lzo_config_check() returns.
35924 +     */
35925 +    /* The flag is raised unconditionally, before any check is made. */
35926 +    __lzo_init_done = 1;
35927 +
35928 +    if (v == 0)
35929 +       return LZO_E_ERROR;
35930 +
35931 +    if ((s1 != -1 && s1 != (int) sizeof(short)) ||
35932 +       (s2 != -1 && s2 != (int) sizeof(int)) ||
35933 +       (s3 != -1 && s3 != (int) sizeof(long)) ||
35934 +       (s4 != -1 && s4 != (int) sizeof(lzo_uint32)) ||
35935 +       (s5 != -1 && s5 != (int) sizeof(lzo_uint)) ||
35936 +       (s6 != -1 && s6 != (int) lzo_sizeof_dict_t) ||
35937 +       (s7 != -1 && s7 != (int) sizeof(char *)) ||
35938 +       (s8 != -1 && s8 != (int) sizeof(lzo_voidp)) ||
35939 +       (s9 != -1 && s9 != (int) sizeof(lzo_compress_t)))
35940 +       return LZO_E_ERROR;
35941 +
35942 +    return _lzo_config_check();
35943 +}
35944 +
35945 +#if !defined(__LZO_IN_MINILZO)
35946 +
35947 +LZO_EXTERN(int)
35948 +__lzo_init(unsigned v,int s1,int s2,int s3,int s4,int s5,int s6,int s7);
35949 +
35950 +LZO_PUBLIC(int)
35951 +__lzo_init(unsigned v,int s1,int s2,int s3,int s4,int s5,int s6,int s7)
35952 +{
35953 +    /* Accept v in 1..0x1010 only; s6/s7 here are passed as s8/s9 of __lzo_init2. */
35954 +    return (v >= 1 && v <= 0x1010)
35955 +       ? __lzo_init2(v,s1,s2,s3,s4,s5,-1,-1,s6,s7) : LZO_E_ERROR;
35956 +}
35957 +
35958 +#endif
35959 +
35960 +#define do_compress         _lzo1x_1_do_compress /* do_compress expands to this name from here on */
35961 +
35962 +#define LZO_NEED_DICT_H /* enables the section guarded by defined(LZO_NEED_DICT_H) further down */
35963 +#define D_BITS          14 /* number of dictionary index bits; D_SIZE and D_MASK are derived from it */
35964 +#define D_INDEX1(d,p)       d = DM((0x21*DX3(p,5,5,6)) >> 5) /* assigns d: hash of p[0..3], masked by DM() */
35965 +#define D_INDEX2(d,p)       d = (d & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f) /* re-derives d from its own value; p is not used */
35966 +
35967 +#ifndef __LZO_CONFIG1X_H
35968 +#define __LZO_CONFIG1X_H
35969 +
35970 +#if !defined(LZO1X) && !defined(LZO1Y) && !defined(LZO1Z)
35971 +#  define LZO1X
35972 +#endif
35973 +
35974 +#if !defined(__LZO_IN_MINILZO)
35975 +#include <lzo1x.h>
35976 +#endif
35977 +
35978 +#define LZO_EOF_CODE
35979 +#undef LZO_DETERMINISTIC
35980 +
35981 +#define M1_MAX_OFFSET   0x0400 /* NOTE(review): M1..M4 look like per-match-class limits - confirm against the compressor */
35982 +#ifndef M2_MAX_OFFSET
35983 +#define M2_MAX_OFFSET   0x0800 /* default; can be overridden before this point */
35984 +#endif
35985 +#define M3_MAX_OFFSET   0x4000
35986 +#define M4_MAX_OFFSET   0xbfff
35987 +
35988 +#define MX_MAX_OFFSET   (M1_MAX_OFFSET + M2_MAX_OFFSET) /* 0x0c00 with the default M2_MAX_OFFSET */
35989 +
35990 +#define M1_MIN_LEN      2
35991 +#define M1_MAX_LEN      2
35992 +#define M2_MIN_LEN      3
35993 +#ifndef M2_MAX_LEN
35994 +#define M2_MAX_LEN      8 /* default; can be overridden before this point */
35995 +#endif
35996 +#define M3_MIN_LEN      3
35997 +#define M3_MAX_LEN      33
35998 +#define M4_MIN_LEN      3
35999 +#define M4_MAX_LEN      9
36000 +
36001 +#define M1_MARKER       0
36002 +#define M2_MARKER       64
36003 +#define M3_MARKER       32
36004 +#define M4_MARKER       16
36005 +
36006 +#ifndef MIN_LOOKAHEAD
36007 +#define MIN_LOOKAHEAD       (M2_MAX_LEN + 1) /* 9 with the default M2_MAX_LEN */
36008 +#endif
36009 +
36010 +#if defined(LZO_NEED_DICT_H)
36011 +
36012 +#ifndef LZO_HASH
36013 +#define LZO_HASH            LZO_HASH_LZO_INCREMENTAL_B
36014 +#endif
36015 +#define DL_MIN_LEN          M2_MIN_LEN
36016 +
36017 +#ifndef __LZO_DICT_H
36018 +#define __LZO_DICT_H
36019 +
36020 +#ifdef __cplusplus
36021 +extern "C" {
36022 +#endif
36023 +
36024 +#if !defined(D_BITS) && defined(DBITS) /* accept the alternative spelling DBITS */
36025 +#  define D_BITS        DBITS
36026 +#endif
36027 +#if !defined(D_BITS)
36028 +#  error "D_BITS is not defined"
36029 +#endif
36030 +#if (D_BITS < 16) /* below 16 bits use LZO_SIZE/LZO_MASK, otherwise the LZO_USIZE/LZO_UMASK forms */
36031 +#  define D_SIZE        LZO_SIZE(D_BITS)
36032 +#  define D_MASK        LZO_MASK(D_BITS)
36033 +#else
36034 +#  define D_SIZE        LZO_USIZE(D_BITS)
36035 +#  define D_MASK        LZO_UMASK(D_BITS)
36036 +#endif
36037 +#define D_HIGH          ((D_MASK >> 1) + 1)
36038 +
36039 +#if !defined(DD_BITS)
36040 +#  define DD_BITS       0 /* default: no DD bits, so DL_BITS below equals D_BITS */
36041 +#endif
36042 +#define DD_SIZE         LZO_SIZE(DD_BITS)
36043 +#define DD_MASK         LZO_MASK(DD_BITS)
36044 +
36045 +#if !defined(DL_BITS)
36046 +#  define DL_BITS       (D_BITS - DD_BITS)
36047 +#endif
36048 +#if (DL_BITS < 16)
36049 +#  define DL_SIZE       LZO_SIZE(DL_BITS)
36050 +#  define DL_MASK       LZO_MASK(DL_BITS)
36051 +#else
36052 +#  define DL_SIZE       LZO_USIZE(DL_BITS)
36053 +#  define DL_MASK       LZO_UMASK(DL_BITS)
36054 +#endif
36055 +
36056 +#if (D_BITS != DL_BITS + DD_BITS) /* the three bit counts must stay consistent and within range */
36057 +#  error "D_BITS does not match"
36058 +#endif
36059 +#if (D_BITS < 8 || D_BITS > 18)
36060 +#  error "invalid D_BITS"
36061 +#endif
36062 +#if (DL_BITS < 8 || DL_BITS > 20)
36063 +#  error "invalid DL_BITS"
36064 +#endif
36065 +#if (DD_BITS < 0 || DD_BITS > 6)
36066 +#  error "invalid DD_BITS"
36067 +#endif
36068 +
36069 +#if !defined(DL_MIN_LEN)
36070 +#  define DL_MIN_LEN    3
36071 +#endif
36072 +#if !defined(DL_SHIFT)
36073 +#  define DL_SHIFT      ((DL_BITS + (DL_MIN_LEN - 1)) / DL_MIN_LEN) /* DL_BITS / DL_MIN_LEN, rounded up */
36074 +#endif
36075 +
36076 +#define LZO_HASH_GZIP                   1 /* identifiers that LZO_HASH is compared against below */
36077 +#define LZO_HASH_GZIP_INCREMENTAL       2
36078 +#define LZO_HASH_LZO_INCREMENTAL_A      3
36079 +#define LZO_HASH_LZO_INCREMENTAL_B      4
36080 +
36081 +#if !defined(LZO_HASH) /* one of the four strategies must have been selected by now */
36082 +#  error "choose a hashing strategy"
36083 +#endif
36084 +
36085 +#if (DL_MIN_LEN == 3) /* three-byte variants: mix p[0..2] by shifting and XOR */
36086 +#  define _DV2_A(p,shift1,shift2) \
36087 +       (((( (lzo_uint32)((p)[0]) << shift1) ^ (p)[1]) << shift2) ^ (p)[2])
36088 +#  define _DV2_B(p,shift1,shift2) \
36089 +       (((( (lzo_uint32)((p)[2]) << shift1) ^ (p)[1]) << shift2) ^ (p)[0])
36090 +#  define _DV3_B(p,shift1,shift2,shift3) \
36091 +       ((_DV2_B((p)+1,shift1,shift2) << (shift3)) ^ (p)[0])
36092 +#elif (DL_MIN_LEN == 2) /* NOTE(review): _DV2_B below reads p[1] and p[2] although DL_MIN_LEN is 2 - confirm */
36093 +#  define _DV2_A(p,shift1,shift2) \
36094 +       (( (lzo_uint32)(p[0]) << shift1) ^ p[1])
36095 +#  define _DV2_B(p,shift1,shift2) \
36096 +       (( (lzo_uint32)(p[1]) << shift1) ^ p[2])
36097 +#else
36098 +#  error "invalid DL_MIN_LEN"
36099 +#endif
36100 +#define _DV_A(p,shift)      _DV2_A(p,shift,shift) /* same shift used for both steps */
36101 +#define _DV_B(p,shift)      _DV2_B(p,shift,shift)
36102 +#define DA2(p,s1,s2) /* DA*, DS*, DX*: the same nesting combined with +, - and ^ respectively */ \
36103 +       (((((lzo_uint32)((p)[2]) << (s2)) + (p)[1]) << (s1)) + (p)[0])
36104 +#define DS2(p,s1,s2) \
36105 +       (((((lzo_uint32)((p)[2]) << (s2)) - (p)[1]) << (s1)) - (p)[0])
36106 +#define DX2(p,s1,s2) \
36107 +       (((((lzo_uint32)((p)[2]) << (s2)) ^ (p)[1]) << (s1)) ^ (p)[0])
36108 +#define DA3(p,s1,s2,s3) ((DA2((p)+1,s2,s3) << (s1)) + (p)[0]) /* the *3 forms cover four bytes, p[0..3] */
36109 +#define DS3(p,s1,s2,s3) ((DS2((p)+1,s2,s3) << (s1)) - (p)[0])
36110 +#define DX3(p,s1,s2,s3) ((DX2((p)+1,s2,s3) << (s1)) ^ (p)[0])
36111 +#define DMS(v,s)        ((lzo_uint) (((v) & (D_MASK >> (s))) << (s))) /* mask v with D_MASK >> s, then shift left by s */
36112 +#define DM(v)           DMS(v,0) /* v & D_MASK */
36113 +
36114 +#if (LZO_HASH == LZO_HASH_GZIP) /* non-incremental: the index is recomputed from p each time */
36115 +#  define _DINDEX(dv,p)     (_DV_A((p),DL_SHIFT))
36116 +
36117 +#elif (LZO_HASH == LZO_HASH_GZIP_INCREMENTAL)
36118 +#  define __LZO_HASH_INCREMENTAL
36119 +#  define DVAL_FIRST(dv,p)  dv = _DV_A((p),DL_SHIFT)
36120 +#  define DVAL_NEXT(dv,p)   dv = (((dv) << DL_SHIFT) ^ p[2])
36121 +#  define _DINDEX(dv,p)     (dv)
36122 +#  define DVAL_LOOKAHEAD    DL_MIN_LEN
36123 +
36124 +#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_A)
36125 +#  define __LZO_HASH_INCREMENTAL
36126 +#  define DVAL_FIRST(dv,p)  dv = _DV_A((p),5)
36127 +#  define DVAL_NEXT(dv,p) /* expands to two statements: remove p[-1]'s contribution, then mix in p[2] */ \
36128 +               dv ^= (lzo_uint32)(p[-1]) << (2*5); dv = (((dv) << 5) ^ p[2])
36129 +#  define _DINDEX(dv,p)     ((0x9f5f * (dv)) >> 5)
36130 +#  define DVAL_LOOKAHEAD    DL_MIN_LEN
36131 +
36132 +#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_B)
36133 +#  define __LZO_HASH_INCREMENTAL
36134 +#  define DVAL_FIRST(dv,p)  dv = _DV_B((p),5)
36135 +#  define DVAL_NEXT(dv,p) /* expands to two statements, so it is unsafe as an unbraced if/else body */ \
36136 +               dv ^= p[-1]; dv = (((dv) >> 5) ^ ((lzo_uint32)(p[2]) << (2*5)))
36137 +#  define _DINDEX(dv,p)     ((0x9f5f * (dv)) >> 5)
36138 +#  define DVAL_LOOKAHEAD    DL_MIN_LEN
36139 +
36140 +#else
36141 +#  error "choose a hashing strategy"
36142 +#endif
36143 +
36144 +#ifndef DINDEX
36145 +#define DINDEX(dv,p)        ((lzo_uint)((_DINDEX(dv,p)) & DL_MASK) << DD_BITS) /* mask with DL_MASK, then shift left by DD_BITS */
36146 +#endif
36147 +#if !defined(DINDEX1) && defined(D_INDEX1) /* map the D_INDEXn spellings onto DINDEXn when only the former exist */
36148 +#define DINDEX1             D_INDEX1
36149 +#endif
36150 +#if !defined(DINDEX2) && defined(D_INDEX2)
36151 +#define DINDEX2             D_INDEX2
36152 +#endif
36153 +
36154 +#if !defined(__LZO_HASH_INCREMENTAL) /* non-incremental strategies: the DVAL_* hooks become no-ops */
36155 +#  define DVAL_FIRST(dv,p)  ((void) 0)
36156 +#  define DVAL_NEXT(dv,p)   ((void) 0)
36157 +#  define DVAL_LOOKAHEAD    0
36158 +#endif
36159 +
36160 +#if !defined(DVAL_ASSERT)
36161 +#if defined(__LZO_HASH_INCREMENTAL) && !defined(NDEBUG)
36162 +static void DVAL_ASSERT(lzo_uint32 dv, const lzo_byte *p)
36163 +{
36164 +    lzo_uint32 df;
36165 +    DVAL_FIRST(df,(p));
36166 +    assert(DINDEX(dv,p) == DINDEX(df,p));
36167 +}
36168 +#else
36169 +#  define DVAL_ASSERT(dv,p) ((void) 0)
36170 +#endif
36171 +#endif
36172 +
36173 +#if defined(LZO_DICT_USE_PTR) /* dictionary slots hold raw pointers into the input in this branch, offsets from the input start in the other */
36174 +#  define DENTRY(p,in)                          (p) /* value stored for position p: the pointer itself */
36175 +#  define GINDEX(m_pos,m_off,dict,dindex,in)    m_pos = dict[dindex] /* load slot dindex into m_pos; m_off is left untouched */
36176 +#else
36177 +#  define DENTRY(p,in)                          ((lzo_uint) ((p)-(in))) /* value stored for position p: its distance from in */
36178 +#  define GINDEX(m_pos,m_off,dict,dindex,in)    m_off = dict[dindex] /* load slot dindex into m_off; m_pos is left untouched */
36179 +#endif
36180 +
36181 +#if (DD_BITS == 0) /* one slot per hash index: the drun argument is ignored */
36182 +
36183 +#  define UPDATE_D(dict,drun,dv,p,in)       dict[ DINDEX(dv,p) ] = DENTRY(p,in) /* store p under the index derived from dv/p */
36184 +#  define UPDATE_I(dict,drun,index,p,in)    dict[index] = DENTRY(p,in) /* store p under an index the caller already computed */
36185 +#  define UPDATE_P(ptr,drun,p,in)           (ptr)[0] = DENTRY(p,in) /* store p through a slot pointer */
36186 +
36187 +#else /* several slots per index: drun selects the slot, is post-incremented and wrapped with DD_MASK */
36188 +
36189 +#  define UPDATE_D(dict,drun,dv,p,in)   \
36190 +       dict[ DINDEX(dv,p) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
36191 +#  define UPDATE_I(dict,drun,index,p,in)    \
36192 +       dict[ (index) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
36193 +#  define UPDATE_P(ptr,drun,p,in)   \
36194 +       (ptr) [ drun++ ] = DENTRY(p,in); drun &= DD_MASK /* NOTE(review): the three macros of this branch expand to two statements each and modify drun, so drun must be an lvalue; do_compress passes the literal 0 */
36195 +
36196 +#endif
36197 +
36198 +#if defined(LZO_DICT_USE_PTR) /* LZO_CHECK_MPOS_*: nonzero means the dictionary candidate is rejected; as a side effect m_off (and m_pos in the offset branch) is assigned */
36199 +
36200 +#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \
36201 +       (m_pos == NULL || (m_off = (lzo_moff_t) (ip - m_pos)) > max_offset) /* rejects NULL slots and distances above max_offset; m_off becomes ip - m_pos */
36202 +
36203 +#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \
36204 +    (BOUNDS_CHECKING_OFF_IN_EXPR( \
36205 +       (PTR_LT(m_pos,in) || \
36206 +        (m_off = (lzo_moff_t) PTR_DIFF(ip,m_pos)) <= 0 || \
36207 +         m_off > max_offset) )) /* rejects pointers before in, distances <= 0 and distances above max_offset */
36208 +
36209 +#else
36210 +
36211 +#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \
36212 +       (m_off == 0 || \
36213 +        ((m_off = (lzo_moff_t) ((ip)-(in)) - m_off) > max_offset) || \
36214 +        (m_pos = (ip) - (m_off), 0) ) /* rejects slot value 0, converts the stored input offset into a backward distance from ip and rejects it above max_offset; the final comma expression always yields 0 and only assigns m_pos */
36215 +
36216 +#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \
36217 +       ((lzo_moff_t) ((ip)-(in)) <= m_off || \
36218 +        ((m_off = (lzo_moff_t) ((ip)-(in)) - m_off) > max_offset) || \
36219 +        (m_pos = (ip) - (m_off), 0) ) /* as the DET form, but the first test rejects stored offsets at or beyond the current position instead of testing for 0 */
36220 +
36221 +#endif
36222 +
36223 +#if defined(LZO_DETERMINISTIC) /* default checker; note that do_compress in this file names LZO_CHECK_MPOS_NON_DET directly */
36224 +#  define LZO_CHECK_MPOS    LZO_CHECK_MPOS_DET
36225 +#else
36226 +#  define LZO_CHECK_MPOS    LZO_CHECK_MPOS_NON_DET
36227 +#endif
36228 +
36229 +#ifdef __cplusplus
36230 +}
36231 +#endif
36232 +
36233 +#endif
36234 +
36235 +#endif
36236 +
36237 +#endif
36238 +
36239 +#define DO_COMPRESS     lzo1x_1_compress /* name given to the public entry point defined after do_compress */
36240 +
36241 +static /* do_compress: match-finding loop; writes the encoded stream to out and its size to *out_len */
36242 +lzo_uint do_compress     ( const lzo_byte *in , lzo_uint  in_len, /* in, in_len: the bytes to compress */
36243 +                                lzo_byte *out, lzo_uintp out_len, /* NOTE(review): no bound on out is checked anywhere in this function */
36244 +                                lzo_voidp wrkmem ) /* wrkmem is used as the match dictionary; returns pd(in_end,ii), the input tail not yet emitted (pd is defined outside this chunk - presumably a pointer difference) */
36245 +{
36246 +#if 0 && defined(__GNUC__) && defined(__i386__) /* disabled register pinning */
36247 +    register const lzo_byte *ip __asm__("%esi");
36248 +#else
36249 +    register const lzo_byte *ip; /* current search position */
36250 +#endif
36251 +    lzo_byte *op; /* output write cursor */
36252 +    const lzo_byte * const in_end = in + in_len;
36253 +    const lzo_byte * const ip_end = in + in_len - M2_MAX_LEN - 5; /* the search loop stops once ip reaches this point, M2_MAX_LEN + 5 bytes before the end */
36254 +    const lzo_byte *ii; /* start of the literal run that has not been written yet */
36255 +    lzo_dict_p const dict = (lzo_dict_p) wrkmem;
36256 +
36257 +    op = out;
36258 +    ip = in;
36259 +    ii = ip;
36260 +
36261 +    ip += 4; /* searching starts at the fifth input byte */
36262 +    for (;;)
36263 +    {
36264 +#if 0 && defined(__GNUC__) && defined(__i386__)
36265 +       register const lzo_byte *m_pos __asm__("%edi");
36266 +#else
36267 +       register const lzo_byte *m_pos; /* candidate match position */
36268 +#endif
36269 +       lzo_moff_t m_off; /* distance from ip back to m_pos once a check macro has run */
36270 +       lzo_uint m_len;
36271 +       lzo_uint dindex;
36272 +
36273 +       DINDEX1(dindex,ip); /* first dictionary probe */
36274 +       GINDEX(m_pos,m_off,dict,dindex,in);
36275 +       if (LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,M4_MAX_OFFSET))
36276 +           goto literal;
36277 +#if 1
36278 +       if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3]) /* near candidates are tried at once; far ones must also agree on byte 3 */
36279 +           goto try_match;
36280 +       DINDEX2(dindex,ip); /* second probe */
36281 +#endif
36282 +       GINDEX(m_pos,m_off,dict,dindex,in);
36283 +       if (LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,M4_MAX_OFFSET))
36284 +           goto literal;
36285 +       if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
36286 +           goto try_match;
36287 +       goto literal;
36288 +
36289 +try_match:
36290 +#if 1 && defined(LZO_UNALIGNED_OK_2)
36291 +       if (* (const lzo_ushortp) m_pos != * (const lzo_ushortp) ip) /* compare the first two bytes with one 16-bit load */
36292 +#else
36293 +       if (m_pos[0] != ip[0] || m_pos[1] != ip[1])
36294 +#endif
36295 +       {
36296 +       } /* first two bytes differ: nothing to do here, control falls through to literal */
36297 +       else
36298 +       {
36299 +           if (m_pos[2] == ip[2]) /* three bytes agree: this is a match */
36300 +           {
36301 +#if 0 /* disabled heuristics; they use a variable lit that is not declared in this function */
36302 +               if (m_off <= M2_MAX_OFFSET)
36303 +                   goto match;
36304 +               if (lit <= 3)
36305 +                   goto match;
36306 +               if (lit == 3)
36307 +               {
36308 +                   assert(op - 2 > out); op[-2] |= LZO_BYTE(3);
36309 +                   *op++ = *ii++; *op++ = *ii++; *op++ = *ii++;
36310 +                   goto code_match;
36311 +               }
36312 +               if (m_pos[3] == ip[3])
36313 +#endif
36314 +                   goto match;
36315 +           }
36316 +           else
36317 +           {
36318 +#if 0 /* disabled two-byte match path; also refers to the undeclared lit */
36319 +#if 0
36320 +               if (m_off <= M1_MAX_OFFSET && lit > 0 && lit <= 3)
36321 +#else
36322 +               if (m_off <= M1_MAX_OFFSET && lit == 3)
36323 +#endif
36324 +               {
36325 +                   register lzo_uint t;
36326 +
36327 +                   t = lit;
36328 +                   assert(op - 2 > out); op[-2] |= LZO_BYTE(t);
36329 +                   do *op++ = *ii++; while (--t > 0);
36330 +                   assert(ii == ip);
36331 +                   m_off -= 1;
36332 +                   *op++ = LZO_BYTE(M1_MARKER | ((m_off & 3) << 2));
36333 +                   *op++ = LZO_BYTE(m_off >> 2);
36334 +                   ip += 2;
36335 +                   goto match_done;
36336 +               }
36337 +#endif
36338 +           }
36339 +       }
36340 +
36341 +literal: /* no usable match: record ip in the dictionary and advance one byte; ii stays put so the byte joins the pending literal run */
36342 +       UPDATE_I(dict,0,dindex,ip,in); /* literal 0 as drun: only the DD_BITS == 0 form of UPDATE_I accepts that */
36343 +       ++ip;
36344 +       if (ip >= ip_end)
36345 +           break;
36346 +       continue;
36347 +
36348 +match: /* a match of at least three bytes starts at ip */
36349 +       UPDATE_I(dict,0,dindex,ip,in);
36350 +       if (pd(ip,ii) > 0) /* write the pending literals [ii, ip) before the match */
36351 +       {
36352 +           register lzo_uint t = pd(ip,ii);
36353 +
36354 +           if (t <= 3) /* 1..3 literals: the count is OR-ed into the byte two before op */
36355 +           {
36356 +               assert("lzo-04", op - 2 > out);
36357 +               op[-2] |= LZO_BYTE(t);
36358 +           }
36359 +           else if (t <= 18) /* 4..18 literals: one byte holding t - 3 */
36360 +               *op++ = LZO_BYTE(t - 3);
36361 +           else /* longer runs: a zero byte, one more zero byte per further 255, then the remainder */
36362 +           {
36363 +               register lzo_uint tt = t - 18;
36364 +
36365 +               *op++ = 0;
36366 +               while (tt > 255)
36367 +               {
36368 +                   tt -= 255;
36369 +                   *op++ = 0;
36370 +               }
36371 +               assert("lzo-05", tt > 0);
36372 +               *op++ = LZO_BYTE(tt);
36373 +           }
36374 +           do *op++ = *ii++; while (--t > 0); /* copy the literal bytes themselves */
36375 +       }
36376 +
36377 +       assert("lzo-06", ii == ip);
36378 +       ip += 3; /* the first three bytes are already known to match */
36379 +       if (m_pos[3] != *ip++ || m_pos[4] != *ip++ || m_pos[5] != *ip++ || /* test bytes 3..8 in order; ip moves past every byte tested */
36380 +           m_pos[6] != *ip++ || m_pos[7] != *ip++ || m_pos[8] != *ip++
36381 +#ifdef LZO1Y
36382 +           || m_pos[ 9] != *ip++ || m_pos[10] != *ip++ || m_pos[11] != *ip++
36383 +           || m_pos[12] != *ip++ || m_pos[13] != *ip++ || m_pos[14] != *ip++
36384 +#endif
36385 +          )
36386 +       {
36387 +           --ip; /* step back over the byte that failed to match */
36388 +           m_len = ip - ii;
36389 +           assert("lzo-07", m_len >= 3); assert("lzo-08", m_len <= M2_MAX_LEN);
36390 +
36391 +           if (m_off <= M2_MAX_OFFSET) /* short match at a near offset: two-byte code */
36392 +           {
36393 +               m_off -= 1;
36394 +#if defined(LZO1X)
36395 +               *op++ = LZO_BYTE(((m_len - 1) << 5) | ((m_off & 7) << 2)); /* length and low 3 offset bits share the first byte */
36396 +               *op++ = LZO_BYTE(m_off >> 3);
36397 +#elif defined(LZO1Y)
36398 +               *op++ = LZO_BYTE(((m_len + 1) << 4) | ((m_off & 3) << 2));
36399 +               *op++ = LZO_BYTE(m_off >> 2);
36400 +#endif
36401 +           }
36402 +           else if (m_off <= M3_MAX_OFFSET) /* short match at a medium offset: M3 code, offset bytes written at m3_m4_offset */
36403 +           {
36404 +               m_off -= 1;
36405 +               *op++ = LZO_BYTE(M3_MARKER | (m_len - 2));
36406 +               goto m3_m4_offset;
36407 +           }
36408 +           else /* short match at a far offset */
36409 +#if defined(LZO1X)
36410 +           {
36411 +               m_off -= 0x4000;
36412 +               assert("lzo-09", m_off > 0); assert("lzo-10", m_off <= 0x7fff);
36413 +               *op++ = LZO_BYTE(M4_MARKER |
36414 +                                ((m_off & 0x4000) >> 11) | (m_len - 2)); /* bit 14 of the reduced offset goes into bit 3 of the opcode */
36415 +               goto m3_m4_offset;
36416 +           }
36417 +#elif defined(LZO1Y)
36418 +               goto m4_match;
36419 +#endif
36420 +       }
36421 +       else /* every byte tested above matched: extend the match byte by byte */
36422 +       {
36423 +           {
36424 +               const lzo_byte *end = in_end;
36425 +               const lzo_byte *m = m_pos + M2_MAX_LEN + 1;
36426 +               while (ip < end && *m == *ip) /* stops at the first mismatch or at the end of the input */
36427 +                   m++, ip++;
36428 +               m_len = (ip - ii);
36429 +           }
36430 +           assert("lzo-11", m_len > M2_MAX_LEN);
36431 +
36432 +           if (m_off <= M3_MAX_OFFSET)
36433 +           {
36434 +               m_off -= 1;
36435 +               if (m_len <= 33) /* length still fits in the opcode byte */
36436 +                   *op++ = LZO_BYTE(M3_MARKER | (m_len - 2));
36437 +               else
36438 +               {
36439 +                   m_len -= 33;
36440 +                   *op++ = M3_MARKER | 0; /* zero length field: the length follows at m3_m4_len */
36441 +                   goto m3_m4_len;
36442 +               }
36443 +           }
36444 +           else
36445 +           {
36446 +#if defined(LZO1Y)
36447 +m4_match:
36448 +#endif
36449 +               m_off -= 0x4000;
36450 +               assert("lzo-12", m_off > 0); assert("lzo-13", m_off <= 0x7fff);
36451 +               if (m_len <= M4_MAX_LEN)
36452 +                   *op++ = LZO_BYTE(M4_MARKER |
36453 +                                    ((m_off & 0x4000) >> 11) | (m_len - 2));
36454 +               else
36455 +               {
36456 +                   m_len -= M4_MAX_LEN;
36457 +                   *op++ = LZO_BYTE(M4_MARKER | ((m_off & 0x4000) >> 11));
36458 +m3_m4_len: /* extended length shared by M3 and M4: one zero byte per 255, then the remainder */
36459 +                   while (m_len > 255)
36460 +                   {
36461 +                       m_len -= 255;
36462 +                       *op++ = 0;
36463 +                   }
36464 +                   assert("lzo-14", m_len > 0);
36465 +                   *op++ = LZO_BYTE(m_len);
36466 +               }
36467 +           }
36468 +
36469 +m3_m4_offset: /* offset trailer shared by M3 and M4 */
36470 +           *op++ = LZO_BYTE((m_off & 63) << 2); /* low two bits stay clear; the literal path above ORs a 1..3 count into this byte through op[-2] */
36471 +           *op++ = LZO_BYTE(m_off >> 6);
36472 +       }
36473 +
36474 +#if 0
36475 +match_done:
36476 +#endif
36477 +       ii = ip; /* nothing pending: the next literal run starts right after the match */
36478 +       if (ip >= ip_end)
36479 +           break;
36480 +    }
36481 +
36482 +    *out_len = op - out; /* bytes produced so far; the caller appends the tail and the end marker */
36483 +    return pd(in_end,ii);
36484 +}
36485 +
36486 +LZO_PUBLIC(int) /* DO_COMPRESS (defined above as lzo1x_1_compress): compress in_len bytes at in into out and store the encoded size in *out_len */
36487 +DO_COMPRESS      ( const lzo_byte *in , lzo_uint  in_len,
36488 +                        lzo_byte *out, lzo_uintp out_len, /* NOTE(review): the size of out is never checked here; the caller must provide enough room */
36489 +                        lzo_voidp wrkmem ) /* wrkmem is handed to do_compress, which uses it as its dictionary */
36490 +{
36491 +    lzo_byte *op = out;
36492 +    lzo_uint t; /* number of trailing input bytes still to be written as literals */
36493 +
36494 +#if defined(__LZO_QUERY_COMPRESS)
36495 +    if (__LZO_IS_COMPRESS_QUERY(in,in_len,out,out_len,wrkmem))
36496 +       return __LZO_QUERY_COMPRESS(in,in_len,out,out_len,wrkmem,D_SIZE,lzo_sizeof(lzo_dict_t));
36497 +#endif
36498 +
36499 +    if (in_len <= M2_MAX_LEN + 5) /* too short for the match loop: the whole input becomes one literal run */
36500 +       t = in_len;
36501 +    else
36502 +    {
36503 +       t = do_compress(in,in_len,op,out_len,wrkmem); /* t = input tail that do_compress left unemitted */
36504 +       op += *out_len;
36505 +    }
36506 +
36507 +    if (t > 0)
36508 +    {
36509 +       const lzo_byte *ii = in + in_len - t; /* first byte of the tail */
36510 +
36511 +       if (op == out && t <= 238) /* nothing written yet: the stream opens with the byte 17 + t */
36512 +           *op++ = LZO_BYTE(17 + t);
36513 +       else if (t <= 3) /* short tail: count OR-ed into the byte two before op */
36514 +           op[-2] |= LZO_BYTE(t);
36515 +       else if (t <= 18) /* one byte holding t - 3 */
36516 +           *op++ = LZO_BYTE(t - 3);
36517 +       else /* a zero byte, one more zero byte per further 255, then the remainder */
36518 +       {
36519 +           lzo_uint tt = t - 18;
36520 +
36521 +           *op++ = 0;
36522 +           while (tt > 255)
36523 +           {
36524 +               tt -= 255;
36525 +               *op++ = 0;
36526 +           }
36527 +           assert("lzo-15", tt > 0);
36528 +           *op++ = LZO_BYTE(tt);
36529 +       }
36530 +       do *op++ = *ii++; while (--t > 0); /* copy the tail bytes */
36531 +    }
36532 +
36533 +    *op++ = M4_MARKER | 1; /* three-byte terminator: the decompressor in this file takes a far match with zero offset as end of stream */
36534 +    *op++ = 0;
36535 +    *op++ = 0;
36536 +
36537 +    *out_len = op - out; /* total encoded size including the terminator */
36538 +    return LZO_E_OK; /* apart from the optional query hook above there is no other return value */
36539 +}
36540 +
36541 +#undef do_compress
36542 +#undef DO_COMPRESS
36543 +#undef LZO_HASH
36544 +
36545 +#undef LZO_TEST_DECOMPRESS_OVERRUN /* this and the next three lines switch every overrun check off for the first decompressor instance */
36546 +#undef LZO_TEST_DECOMPRESS_OVERRUN_INPUT
36547 +#undef LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT
36548 +#undef LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
36549 +#undef DO_DECOMPRESS
36550 +#define DO_DECOMPRESS       lzo1x_decompress /* name of the function generated from the body that follows */
36551 +
36552 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN) /* master switch: gives the three specific checks their defaults when they are not set */
36553 +#  if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
36554 +#    define LZO_TEST_DECOMPRESS_OVERRUN_INPUT       2
36555 +#  endif
36556 +#  if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
36557 +#    define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT      2
36558 +#  endif
36559 +#  if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
36560 +#    define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
36561 +#  endif
36562 +#endif
36563 +
36564 +#undef TEST_IP /* drop helper macros left over from any earlier instantiation */
36565 +#undef TEST_OP
36566 +#undef TEST_LOOKBEHIND
36567 +#undef NEED_IP
36568 +#undef NEED_OP
36569 +#undef HAVE_TEST_IP
36570 +#undef HAVE_TEST_OP
36571 +#undef HAVE_NEED_IP
36572 +#undef HAVE_NEED_OP
36573 +#undef HAVE_ANY_IP
36574 +#undef HAVE_ANY_OP
36575 +
36576 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
36577 +#  if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
36578 +#    define TEST_IP             (ip < ip_end) /* loop-condition test: input not yet exhausted */
36579 +#  endif
36580 +#  if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
36581 +#    define NEED_IP(x) \
36582 +           if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x))  goto input_overrun /* level 2: leave through input_overrun unless x more input bytes are available */
36583 +#  endif
36584 +#endif
36585 +
36586 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
36587 +#  if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
36588 +#    define TEST_OP             (op <= op_end) /* loop-condition test: output cursor not past the end */
36589 +#  endif
36590 +#  if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
36591 +#    undef TEST_OP /* level 2 drops the loop-condition test and relies on NEED_OP alone */
36592 +#    define NEED_OP(x) \
36593 +           if ((lzo_uint)(op_end - op) < (lzo_uint)(x))  goto output_overrun /* leave through output_overrun unless x more output bytes fit */
36594 +#  endif
36595 +#endif
36596 +
36597 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
36598 +#  define TEST_LOOKBEHIND(m_pos,out)    if (m_pos < out) goto lookbehind_overrun /* reject match sources that start before the output buffer */
36599 +#else
36600 +#  define TEST_LOOKBEHIND(m_pos,op)     ((void) 0) /* check compiled out */
36601 +#endif
36602 +
36603 +#if !defined(LZO_EOF_CODE) && !defined(TEST_IP) /* LZO_EOF_CODE is set outside this chunk; when it is absent the input-end test is forced on */
36604 +#  define TEST_IP               (ip < ip_end)
36605 +#endif
36606 +
36607 +#if defined(TEST_IP) /* from here on: record which checks are real (HAVE_*) and give the missing ones always-true or no-op defaults */
36608 +#  define HAVE_TEST_IP
36609 +#else
36610 +#  define TEST_IP               1
36611 +#endif
36612 +#if defined(TEST_OP)
36613 +#  define HAVE_TEST_OP
36614 +#else
36615 +#  define TEST_OP               1
36616 +#endif
36617 +
36618 +#if defined(NEED_IP)
36619 +#  define HAVE_NEED_IP
36620 +#else
36621 +#  define NEED_IP(x)            ((void) 0)
36622 +#endif
36623 +#if defined(NEED_OP)
36624 +#  define HAVE_NEED_OP
36625 +#else
36626 +#  define NEED_OP(x)            ((void) 0)
36627 +#endif
36628 +
36629 +#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
36630 +#  define HAVE_ANY_IP
36631 +#endif
36632 +#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
36633 +#  define HAVE_ANY_OP /* the function body declares op_end only when this is defined */
36634 +#endif
36635 +
36636 +#undef __COPY4
36637 +#define __COPY4(dst,src)    * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src) /* copies one lzo_uint32 through pointer casts */
36638 +
36639 +#undef COPY4
36640 +#if defined(LZO_UNALIGNED_OK_4) /* COPY4 exists only when one of the two *_OK_4 switches is defined */
36641 +#  define COPY4(dst,src)    __COPY4(dst,src)
36642 +#elif defined(LZO_ALIGNED_OK_4)
36643 +#  define COPY4(dst,src)    __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
36644 +#endif
36645 +
36646 +#if defined(DO_DECOMPRESS) /* the signature is emitted only when an instance name has been chosen */
36647 +LZO_PUBLIC(int) /* DO_DECOMPRESS: decode the stream of in_len bytes at in into out; on every return path below *out_len holds the number of bytes written */
36648 +DO_DECOMPRESS  ( const lzo_byte *in , lzo_uint  in_len,
36649 +                      lzo_byte *out, lzo_uintp out_len, /* *out_len is read as the capacity of out only when output checks are compiled in (HAVE_ANY_OP) */
36650 +                      lzo_voidp wrkmem ) /* wrkmem is not used, see LZO_UNUSED below */
36651 +#endif
36652 +{
36653 +    register lzo_byte *op; /* output write cursor */
36654 +    register const lzo_byte *ip; /* input read cursor */
36655 +    register lzo_uint t; /* current opcode, then a run or match length derived from it */
36656 +#if defined(COPY_DICT) /* variant for a preset dictionary; dict and dict_len are not declared in this function - NOTE(review): presumably supplied by a different signature */
36657 +    lzo_uint m_off;
36658 +    const lzo_byte *dict_end;
36659 +#else
36660 +    register const lzo_byte *m_pos; /* source of the match being copied, inside the output already produced */
36661 +#endif
36662 +
36663 +    const lzo_byte * const ip_end = in + in_len;
36664 +#if defined(HAVE_ANY_OP)
36665 +    lzo_byte * const op_end = out + *out_len;
36666 +#endif
36667 +#if defined(LZO1Z)
36668 +    lzo_uint last_m_off = 0;
36669 +#endif
36670 +
36671 +    LZO_UNUSED(wrkmem);
36672 +
36673 +#if defined(__LZO_QUERY_DECOMPRESS)
36674 +    if (__LZO_IS_DECOMPRESS_QUERY(in,in_len,out,out_len,wrkmem))
36675 +       return __LZO_QUERY_DECOMPRESS(in,in_len,out,out_len,wrkmem,0,0);
36676 +#endif
36677 +
36678 +#if defined(COPY_DICT)
36679 +    if (dict)
36680 +    {
36681 +       if (dict_len > M4_MAX_OFFSET) /* only the last M4_MAX_OFFSET bytes of the dictionary are kept */
36682 +       {
36683 +           dict += dict_len - M4_MAX_OFFSET;
36684 +           dict_len = M4_MAX_OFFSET;
36685 +       }
36686 +       dict_end = dict + dict_len;
36687 +    }
36688 +    else
36689 +    {
36690 +       dict_len = 0;
36691 +       dict_end = NULL;
36692 +    }
36693 +#endif
36694 +
36695 +    *out_len = 0;
36696 +
36697 +    op = out;
36698 +    ip = in;
36699 +
36700 +    if (*ip > 17) /* a first byte above 17 encodes an initial literal run of (byte - 17); NOTE(review): in[0] is read before any check on in_len */
36701 +    {
36702 +       t = *ip++ - 17;
36703 +       if (t < 4) /* 1..3 literals take the short path used after a match */
36704 +           goto match_next;
36705 +       assert("lzo-16", t > 0); NEED_OP(t); NEED_IP(t+1);
36706 +       do *op++ = *ip++; while (--t > 0);
36707 +       goto first_literal_run;
36708 +    }
36709 +
36710 +    while (TEST_IP && TEST_OP)
36711 +    {
36712 +       t = *ip++; /* next opcode */
36713 +       if (t >= 16)
36714 +           goto match;
36715 +       if (t == 0) /* extended literal length: every zero byte adds 255 */
36716 +       {
36717 +           NEED_IP(1);
36718 +           while (*ip == 0)
36719 +           {
36720 +               t += 255;
36721 +               ip++;
36722 +               NEED_IP(1);
36723 +           }
36724 +           t += 15 + *ip++;
36725 +       }
36726 +       assert("lzo-17", t > 0); NEED_OP(t+3); NEED_IP(t+4); /* t + 3 literal bytes are copied below */
36727 +#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
36728 +#if !defined(LZO_UNALIGNED_OK_4)
36729 +       if (PTR_ALIGNED2_4(op,ip))
36730 +       {
36731 +#endif
36732 +       COPY4(op,ip); /* word-wise path: 4 bytes here plus t - 1 below gives t + 3 in total */
36733 +       op += 4; ip += 4;
36734 +       if (--t > 0)
36735 +       {
36736 +           if (t >= 4)
36737 +           {
36738 +               do {
36739 +                   COPY4(op,ip);
36740 +                   op += 4; ip += 4; t -= 4;
36741 +               } while (t >= 4);
36742 +               if (t > 0) do *op++ = *ip++; while (--t > 0);
36743 +           }
36744 +           else
36745 +               do *op++ = *ip++; while (--t > 0);
36746 +       }
36747 +#if !defined(LZO_UNALIGNED_OK_4)
36748 +       }
36749 +       else
36750 +#endif
36751 +#endif
36752 +#if !defined(LZO_UNALIGNED_OK_4)
36753 +       {
36754 +           *op++ = *ip++; *op++ = *ip++; *op++ = *ip++; /* byte-wise path: 3 bytes here plus t in the loop */
36755 +           do *op++ = *ip++; while (--t > 0);
36756 +       }
36757 +#endif
36758 +
36759 +first_literal_run: /* the opcode that follows a literal run */
36760 +
36761 +       t = *ip++;
36762 +       if (t >= 16)
36763 +           goto match;
36764 +#if defined(COPY_DICT)
36765 +#if defined(LZO1Z)
36766 +       m_off = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
36767 +       last_m_off = m_off;
36768 +#else
36769 +       m_off = (1 + M2_MAX_OFFSET) + (t >> 2) + (*ip++ << 2);
36770 +#endif
36771 +       NEED_OP(3);
36772 +       t = 3; COPY_DICT(t,m_off)
36773 +#else
36774 +#if defined(LZO1Z)
36775 +       t = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
36776 +       m_pos = op - t;
36777 +       last_m_off = t;
36778 +#else
36779 +       m_pos = op - (1 + M2_MAX_OFFSET); /* opcode below 16 right after literals: 3-byte match at distance 1 + M2_MAX_OFFSET + (t >> 2) + (next byte << 2) */
36780 +       m_pos -= t >> 2;
36781 +       m_pos -= *ip++ << 2;
36782 +#endif
36783 +       TEST_LOOKBEHIND(m_pos,out); NEED_OP(3);
36784 +       *op++ = *m_pos++; *op++ = *m_pos++; *op++ = *m_pos;
36785 +#endif
36786 +       goto match_done;
36787 +
36788 +       while (TEST_IP && TEST_OP) /* every visible path into this loop is a goto to one of the labels in its body */
36789 +       {
36790 +match:
36791 +           if (t >= 64) /* opcodes 64..255: short match, length and low offset bits packed into the opcode */
36792 +           {
36793 +#if defined(COPY_DICT)
36794 +#if defined(LZO1X)
36795 +               m_off = 1 + ((t >> 2) & 7) + (*ip++ << 3);
36796 +               t = (t >> 5) - 1;
36797 +#elif defined(LZO1Y)
36798 +               m_off = 1 + ((t >> 2) & 3) + (*ip++ << 2);
36799 +               t = (t >> 4) - 3;
36800 +#elif defined(LZO1Z)
36801 +               m_off = t & 0x1f;
36802 +               if (m_off >= 0x1c)
36803 +                   m_off = last_m_off;
36804 +               else
36805 +               {
36806 +                   m_off = 1 + (m_off << 6) + (*ip++ >> 2);
36807 +                   last_m_off = m_off;
36808 +               }
36809 +               t = (t >> 5) - 1;
36810 +#endif
36811 +#else
36812 +#if defined(LZO1X)
36813 +               m_pos = op - 1; /* distance = 1 + bits 2..4 of the opcode + (next byte << 3) */
36814 +               m_pos -= (t >> 2) & 7;
36815 +               m_pos -= *ip++ << 3;
36816 +               t = (t >> 5) - 1; /* copy_match copies t + 2 bytes */
36817 +#elif defined(LZO1Y)
36818 +               m_pos = op - 1;
36819 +               m_pos -= (t >> 2) & 3;
36820 +               m_pos -= *ip++ << 2;
36821 +               t = (t >> 4) - 3;
36822 +#elif defined(LZO1Z)
36823 +               {
36824 +                   lzo_uint off = t & 0x1f;
36825 +                   m_pos = op;
36826 +                   if (off >= 0x1c) /* reuse the previous match offset */
36827 +                   {
36828 +                       assert(last_m_off > 0); /* NOTE(review): one-argument assert, unlike the labelled asserts in the rest of this function */
36829 +                       m_pos -= last_m_off;
36830 +                   }
36831 +                   else
36832 +                   {
36833 +                       off = 1 + (off << 6) + (*ip++ >> 2);
36834 +                       m_pos -= off;
36835 +                       last_m_off = off;
36836 +                   }
36837 +               }
36838 +               t = (t >> 5) - 1;
36839 +#endif
36840 +               TEST_LOOKBEHIND(m_pos,out); assert("lzo-18", t > 0); NEED_OP(t+3-1);
36841 +               goto copy_match;
36842 +#endif
36843 +           }
36844 +           else if (t >= 32) /* opcodes 32..63: length in the low 5 bits, 0 meaning an extended length follows */
36845 +           {
36846 +               t &= 31;
36847 +               if (t == 0)
36848 +               {
36849 +                   NEED_IP(1);
36850 +                   while (*ip == 0) /* every zero byte adds 255 */
36851 +                   {
36852 +                       t += 255;
36853 +                       ip++;
36854 +                       NEED_IP(1);
36855 +                   }
36856 +                   t += 31 + *ip++;
36857 +               }
36858 +#if defined(COPY_DICT)
36859 +#if defined(LZO1Z)
36860 +               m_off = 1 + (ip[0] << 6) + (ip[1] >> 2);
36861 +               last_m_off = m_off;
36862 +#else
36863 +               m_off = 1 + (ip[0] >> 2) + (ip[1] << 6);
36864 +#endif
36865 +#else
36866 +#if defined(LZO1Z)
36867 +               {
36868 +                   lzo_uint off = 1 + (ip[0] << 6) + (ip[1] >> 2);
36869 +                   m_pos = op - off;
36870 +                   last_m_off = off;
36871 +               }
36872 +#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
36873 +               m_pos = op - 1;
36874 +               m_pos -= (* (const lzo_ushortp) ip) >> 2; /* one 16-bit load in place of the two byte reads of the generic branch */
36875 +#else
36876 +               m_pos = op - 1; /* distance = 1 + (ip[0] >> 2) + (ip[1] << 6) */
36877 +               m_pos -= (ip[0] >> 2) + (ip[1] << 6);
36878 +#endif
36879 +#endif
36880 +               ip += 2; /* step over the two offset bytes */
36881 +           }
36882 +           else if (t >= 16) /* opcodes 16..31: far match; bit 3 of the opcode contributes 0x4000 to the distance */
36883 +           {
36884 +#if defined(COPY_DICT)
36885 +               m_off = (t & 8) << 11;
36886 +#else
36887 +               m_pos = op;
36888 +               m_pos -= (t & 8) << 11;
36889 +#endif
36890 +               t &= 7; /* length in the low 3 bits, 0 meaning an extended length follows */
36891 +               if (t == 0)
36892 +               {
36893 +                   NEED_IP(1);
36894 +                   while (*ip == 0)
36895 +                   {
36896 +                       t += 255;
36897 +                       ip++;
36898 +                       NEED_IP(1);
36899 +                   }
36900 +                   t += 7 + *ip++;
36901 +               }
36902 +#if defined(COPY_DICT)
36903 +#if defined(LZO1Z)
36904 +               m_off += (ip[0] << 6) + (ip[1] >> 2);
36905 +#else
36906 +               m_off += (ip[0] >> 2) + (ip[1] << 6);
36907 +#endif
36908 +               ip += 2;
36909 +               if (m_off == 0)
36910 +                   goto eof_found;
36911 +               m_off += 0x4000;
36912 +#if defined(LZO1Z)
36913 +               last_m_off = m_off;
36914 +#endif
36915 +#else
36916 +#if defined(LZO1Z)
36917 +               m_pos -= (ip[0] << 6) + (ip[1] >> 2);
36918 +#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
36919 +               m_pos -= (* (const lzo_ushortp) ip) >> 2;
36920 +#else
36921 +               m_pos -= (ip[0] >> 2) + (ip[1] << 6);
36922 +#endif
36923 +               ip += 2;
36924 +               if (m_pos == op) /* zero distance here is the end-of-stream marker */
36925 +                   goto eof_found;
36926 +               m_pos -= 0x4000; /* real far matches lie at least 0x4000 bytes back */
36927 +#if defined(LZO1Z)
36928 +               last_m_off = op - m_pos;
36929 +#endif
36930 +#endif
36931 +           }
36932 +           else /* opcodes 0..15 reached from match_next: 2-byte match at distance 1 + (t >> 2) + (next byte << 2) */
36933 +           {
36934 +#if defined(COPY_DICT)
36935 +#if defined(LZO1Z)
36936 +               m_off = 1 + (t << 6) + (*ip++ >> 2);
36937 +               last_m_off = m_off;
36938 +#else
36939 +               m_off = 1 + (t >> 2) + (*ip++ << 2);
36940 +#endif
36941 +               NEED_OP(2);
36942 +               t = 2; COPY_DICT(t,m_off)
36943 +#else
36944 +#if defined(LZO1Z)
36945 +               t = 1 + (t << 6) + (*ip++ >> 2);
36946 +               m_pos = op - t;
36947 +               last_m_off = t;
36948 +#else
36949 +               m_pos = op - 1;
36950 +               m_pos -= t >> 2;
36951 +               m_pos -= *ip++ << 2;
36952 +#endif
36953 +               TEST_LOOKBEHIND(m_pos,out); NEED_OP(2);
36954 +               *op++ = *m_pos++; *op++ = *m_pos;
36955 +#endif
36956 +               goto match_done;
36957 +           }
36958 +
36959 +#if defined(COPY_DICT)
36960 +
36961 +           NEED_OP(t+3-1);
36962 +           t += 3-1; COPY_DICT(t,m_off)
36963 +
36964 +#else
36965 +
36966 +           TEST_LOOKBEHIND(m_pos,out); assert("lzo-19", t > 0); NEED_OP(t+3-1); /* t + 2 bytes will be copied from m_pos */
36967 +#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
36968 +#if !defined(LZO_UNALIGNED_OK_4)
36969 +           if (t >= 2 * 4 - (3 - 1) && PTR_ALIGNED2_4(op,m_pos))
36970 +           {
36971 +               assert((op - m_pos) >= 4); /* NOTE(review): one-argument assert, unlike the labelled asserts in the rest of this function */
36972 +#else
36973 +           if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) /* word-wise copy needs at least 8 bytes to copy and source and destination at least 4 bytes apart */
36974 +           {
36975 +#endif
36976 +               COPY4(op,m_pos);
36977 +               op += 4; m_pos += 4; t -= 4 - (3 - 1);
36978 +               do {
36979 +                   COPY4(op,m_pos);
36980 +                   op += 4; m_pos += 4; t -= 4;
36981 +               } while (t >= 4);
36982 +               if (t > 0) do *op++ = *m_pos++; while (--t > 0);
36983 +           }
36984 +           else
36985 +#endif
36986 +           {
36987 +copy_match: /* byte-by-byte copy of t + 2 bytes in forward order */
36988 +               *op++ = *m_pos++; *op++ = *m_pos++;
36989 +               do *op++ = *m_pos++; while (--t > 0);
36990 +           }
36991 +
36992 +#endif
36993 +
36994 +match_done: /* the low two bits of an already consumed byte give the number of literals that follow the match */
36995 +#if defined(LZO1Z)
36996 +           t = ip[-1] & 3;
36997 +#else
36998 +           t = ip[-2] & 3;
36999 +#endif
37000 +           if (t == 0) /* none: leave the inner loop and read a fresh opcode in the outer one */
37001 +               break;
37002 +
37003 +match_next: /* copy 1..3 literals, then read the next opcode and let the inner loop dispatch on it */
37004 +           assert("lzo-20", t > 0); NEED_OP(t); NEED_IP(t+1);
37005 +           do *op++ = *ip++; while (--t > 0);
37006 +           t = *ip++;
37007 +       }
37008 +    }
37009 +
37010 +#if defined(HAVE_TEST_IP) || defined(HAVE_TEST_OP) /* reached when a compiled-in loop test fails before an end marker is seen */
37011 +    *out_len = op - out;
37012 +    return LZO_E_EOF_NOT_FOUND;
37013 +#endif
37014 +
37015 +eof_found:
37016 +    assert("lzo-21", t == 1);
37017 +    *out_len = op - out;
37018 +    return (ip == ip_end ? LZO_E_OK : /* success only when the marker sits exactly at the end of the input */
37019 +          (ip < ip_end  ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN));
37020 +
37021 +#if defined(HAVE_NEED_IP)
37022 +input_overrun: /* target of NEED_IP */
37023 +    *out_len = op - out;
37024 +    return LZO_E_INPUT_OVERRUN;
37025 +#endif
37026 +
37027 +#if defined(HAVE_NEED_OP)
37028 +output_overrun: /* target of NEED_OP */
37029 +    *out_len = op - out;
37030 +    return LZO_E_OUTPUT_OVERRUN;
37031 +#endif
37032 +
37033 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
37034 +lookbehind_overrun: /* target of TEST_LOOKBEHIND */
37035 +    *out_len = op - out;
37036 +    return LZO_E_LOOKBEHIND_OVERRUN;
37037 +#endif
37038 +}
37039 +
37040 +#define LZO_TEST_DECOMPRESS_OVERRUN /* master switch on: the instance generated next gets the overrun checks */
37041 +#undef DO_DECOMPRESS
37042 +#define DO_DECOMPRESS       lzo1x_decompress_safe /* name of the checked instance */
37043 +
37044 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN) /* default the three specific checks when they are not set */
37045 +#  if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
37046 +#    define LZO_TEST_DECOMPRESS_OVERRUN_INPUT       2
37047 +#  endif
37048 +#  if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
37049 +#    define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT      2
37050 +#  endif
37051 +#  if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
37052 +#    define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
37053 +#  endif
37054 +#endif
37055 +
37056 +#undef TEST_IP /* drop the helper macros left by the previous instantiation */
37057 +#undef TEST_OP
37058 +#undef TEST_LOOKBEHIND
37059 +#undef NEED_IP
37060 +#undef NEED_OP
37061 +#undef HAVE_TEST_IP
37062 +#undef HAVE_TEST_OP
37063 +#undef HAVE_NEED_IP
37064 +#undef HAVE_NEED_OP
37065 +#undef HAVE_ANY_IP
37066 +#undef HAVE_ANY_OP
37067 +
37068 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
37069 +#  if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
37070 +#    define TEST_IP             (ip < ip_end) /* loop-condition test: input not yet exhausted */
37071 +#  endif
37072 +#  if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
37073 +#    define NEED_IP(x) \
37074 +           if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x))  goto input_overrun /* leave through input_overrun unless x more input bytes are available */
37075 +#  endif
37076 +#endif
37077 +
37078 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
37079 +#  if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
37080 +#    define TEST_OP             (op <= op_end)
37081 +#  endif
37082 +#  if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
37083 +#    undef TEST_OP /* level 2 drops the loop-condition test and relies on NEED_OP alone */
37084 +#    define NEED_OP(x) \
37085 +           if ((lzo_uint)(op_end - op) < (lzo_uint)(x))  goto output_overrun /* leave through output_overrun unless x more output bytes fit */
37086 +#  endif
37087 +#endif
37088 +
37089 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
37090 +#  define TEST_LOOKBEHIND(m_pos,out)    if (m_pos < out) goto lookbehind_overrun /* reject match sources that start before the output buffer */
37091 +#else
37092 +#  define TEST_LOOKBEHIND(m_pos,op)     ((void) 0)
37093 +#endif
37094 +
37095 +#if !defined(LZO_EOF_CODE) && !defined(TEST_IP) /* LZO_EOF_CODE is set outside this chunk */
37096 +#  define TEST_IP               (ip < ip_end)
37097 +#endif
37098 +
37099 +#if defined(TEST_IP) /* from here on: record which checks are real (HAVE_*) and give the missing ones always-true or no-op defaults */
37100 +#  define HAVE_TEST_IP
37101 +#else
37102 +#  define TEST_IP               1
37103 +#endif
37104 +#if defined(TEST_OP)
37105 +#  define HAVE_TEST_OP
37106 +#else
37107 +#  define TEST_OP               1
37108 +#endif
37109 +
37110 +#if defined(NEED_IP)
37111 +#  define HAVE_NEED_IP
37112 +#else
37113 +#  define NEED_IP(x)            ((void) 0)
37114 +#endif
37115 +#if defined(NEED_OP)
37116 +#  define HAVE_NEED_OP
37117 +#else
37118 +#  define NEED_OP(x)            ((void) 0)
37119 +#endif
37120 +
37121 +#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
37122 +#  define HAVE_ANY_IP
37123 +#endif
37124 +#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
37125 +#  define HAVE_ANY_OP /* the function body declares op_end only when this is defined */
37126 +#endif
37127 +
37128 +#undef __COPY4
37129 +#define __COPY4(dst,src)    * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src) /* copies one lzo_uint32 through pointer casts */
37130 +
37131 +#undef COPY4
37132 +#if defined(LZO_UNALIGNED_OK_4) /* COPY4 exists only when one of the two *_OK_4 switches is defined */
37133 +#  define COPY4(dst,src)    __COPY4(dst,src)
37134 +#elif defined(LZO_ALIGNED_OK_4)
37135 +#  define COPY4(dst,src)    __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
37136 +#endif
37137 +/* Decompressor body template (the function name comes from DO_DECOMPRESS). Decodes in[0..in_len) into out, stores the number of bytes written in *out_len and returns an LZO_E_* code. */
37138 +#if defined(DO_DECOMPRESS)
37139 +LZO_PUBLIC(int)
37140 +DO_DECOMPRESS  ( const lzo_byte *in , lzo_uint  in_len,
37141 +                      lzo_byte *out, lzo_uintp out_len,
37142 +                      lzo_voidp wrkmem )
37143 +#endif
37144 +{
37145 +    register lzo_byte *op;
37146 +    register const lzo_byte *ip;
37147 +    register lzo_uint t;
37148 +#if defined(COPY_DICT)
37149 +    lzo_uint m_off;
37150 +    const lzo_byte *dict_end;
37151 +#else
37152 +    register const lzo_byte *m_pos;
37153 +#endif
37154 +
37155 +    const lzo_byte * const ip_end = in + in_len;
37156 +#if defined(HAVE_ANY_OP)
37157 +    lzo_byte * const op_end = out + *out_len; /* on entry *out_len is read as the capacity of out */
37158 +#endif
37159 +#if defined(LZO1Z)
37160 +    lzo_uint last_m_off = 0;
37161 +#endif
37162 +
37163 +    LZO_UNUSED(wrkmem);
37164 +
37165 +#if defined(__LZO_QUERY_DECOMPRESS)
37166 +    if (__LZO_IS_DECOMPRESS_QUERY(in,in_len,out,out_len,wrkmem))
37167 +       return __LZO_QUERY_DECOMPRESS(in,in_len,out,out_len,wrkmem,0,0);
37168 +#endif
37169 +/* With a dictionary, only its last M4_MAX_OFFSET bytes are kept; a NULL dictionary is treated as empty. */
37170 +#if defined(COPY_DICT)
37171 +    if (dict)
37172 +    {
37173 +       if (dict_len > M4_MAX_OFFSET)
37174 +       {
37175 +           dict += dict_len - M4_MAX_OFFSET;
37176 +           dict_len = M4_MAX_OFFSET;
37177 +       }
37178 +       dict_end = dict + dict_len;
37179 +    }
37180 +    else
37181 +    {
37182 +       dict_len = 0;
37183 +       dict_end = NULL;
37184 +    }
37185 +#endif
37186 +
37187 +    *out_len = 0;
37188 +
37189 +    op = out;
37190 +    ip = in;
37191 +/* A first byte above 17 encodes an initial literal run of (byte - 17) bytes. */
37192 +    if (*ip > 17)
37193 +    {
37194 +       t = *ip++ - 17;
37195 +       if (t < 4)
37196 +           goto match_next;
37197 +       assert("lzo-22", t > 0); NEED_OP(t); NEED_IP(t+1);
37198 +       do *op++ = *ip++; while (--t > 0);
37199 +       goto first_literal_run;
37200 +    }
37201 +/* Outer loop: each pass reads an instruction byte t; t >= 16 is a match, smaller values give a literal run length. */
37202 +    while (TEST_IP && TEST_OP)
37203 +    {
37204 +       t = *ip++;
37205 +       if (t >= 16)
37206 +           goto match;
37207 +       if (t == 0) /* extended length: every following 0 byte adds 255, the terminating byte adds 15 + its value */
37208 +       {
37209 +           NEED_IP(1);
37210 +           while (*ip == 0)
37211 +           {
37212 +               t += 255;
37213 +               ip++;
37214 +               NEED_IP(1);
37215 +           }
37216 +           t += 15 + *ip++;
37217 +       }
37218 +       assert("lzo-23", t > 0); NEED_OP(t+3); NEED_IP(t+4); /* both copy variants below move t + 3 literal bytes */
37219 +#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
37220 +#if !defined(LZO_UNALIGNED_OK_4)
37221 +       if (PTR_ALIGNED2_4(op,ip))
37222 +       {
37223 +#endif
37224 +       COPY4(op,ip);
37225 +       op += 4; ip += 4;
37226 +       if (--t > 0)
37227 +       {
37228 +           if (t >= 4)
37229 +           {
37230 +               do {
37231 +                   COPY4(op,ip);
37232 +                   op += 4; ip += 4; t -= 4;
37233 +               } while (t >= 4);
37234 +               if (t > 0) do *op++ = *ip++; while (--t > 0);
37235 +           }
37236 +           else
37237 +               do *op++ = *ip++; while (--t > 0);
37238 +       }
37239 +#if !defined(LZO_UNALIGNED_OK_4)
37240 +       }
37241 +       else
37242 +#endif
37243 +#endif
37244 +#if !defined(LZO_UNALIGNED_OK_4)
37245 +       {
37246 +           *op++ = *ip++; *op++ = *ip++; *op++ = *ip++;
37247 +           do *op++ = *ip++; while (--t > 0);
37248 +       }
37249 +#endif
37250 +/* Directly after a literal run, a byte below 16 encodes a 3-byte match that starts at least 1 + M2_MAX_OFFSET bytes back. */
37251 +first_literal_run:
37252 +
37253 +       t = *ip++;
37254 +       if (t >= 16)
37255 +           goto match;
37256 +#if defined(COPY_DICT)
37257 +#if defined(LZO1Z)
37258 +       m_off = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
37259 +       last_m_off = m_off;
37260 +#else
37261 +       m_off = (1 + M2_MAX_OFFSET) + (t >> 2) + (*ip++ << 2);
37262 +#endif
37263 +       NEED_OP(3);
37264 +       t = 3; COPY_DICT(t,m_off)
37265 +#else
37266 +#if defined(LZO1Z)
37267 +       t = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
37268 +       m_pos = op - t;
37269 +       last_m_off = t;
37270 +#else
37271 +       m_pos = op - (1 + M2_MAX_OFFSET);
37272 +       m_pos -= t >> 2;
37273 +       m_pos -= *ip++ << 2;
37274 +#endif
37275 +       TEST_LOOKBEHIND(m_pos,out); NEED_OP(3);
37276 +       *op++ = *m_pos++; *op++ = *m_pos++; *op++ = *m_pos;
37277 +#endif
37278 +       goto match_done;
37279 +/* Match loop: first entered through the `match' label by goto; t holds the instruction byte being decoded. */
37280 +       while (TEST_IP && TEST_OP)
37281 +       {
37282 +match:
37283 +           if (t >= 64) /* match length and part of the offset are packed into t itself */
37284 +           {
37285 +#if defined(COPY_DICT)
37286 +#if defined(LZO1X)
37287 +               m_off = 1 + ((t >> 2) & 7) + (*ip++ << 3);
37288 +               t = (t >> 5) - 1;
37289 +#elif defined(LZO1Y)
37290 +               m_off = 1 + ((t >> 2) & 3) + (*ip++ << 2);
37291 +               t = (t >> 4) - 3;
37292 +#elif defined(LZO1Z)
37293 +               m_off = t & 0x1f;
37294 +               if (m_off >= 0x1c)
37295 +                   m_off = last_m_off;
37296 +               else
37297 +               {
37298 +                   m_off = 1 + (m_off << 6) + (*ip++ >> 2);
37299 +                   last_m_off = m_off;
37300 +               }
37301 +               t = (t >> 5) - 1;
37302 +#endif
37303 +#else
37304 +#if defined(LZO1X)
37305 +               m_pos = op - 1;
37306 +               m_pos -= (t >> 2) & 7;
37307 +               m_pos -= *ip++ << 3;
37308 +               t = (t >> 5) - 1;
37309 +#elif defined(LZO1Y)
37310 +               m_pos = op - 1;
37311 +               m_pos -= (t >> 2) & 3;
37312 +               m_pos -= *ip++ << 2;
37313 +               t = (t >> 4) - 3;
37314 +#elif defined(LZO1Z)
37315 +               {
37316 +                   lzo_uint off = t & 0x1f;
37317 +                   m_pos = op;
37318 +                   if (off >= 0x1c)
37319 +                   {
37320 +                       assert(last_m_off > 0);
37321 +                       m_pos -= last_m_off;
37322 +                   }
37323 +                   else
37324 +                   {
37325 +                       off = 1 + (off << 6) + (*ip++ >> 2);
37326 +                       m_pos -= off;
37327 +                       last_m_off = off;
37328 +                   }
37329 +               }
37330 +               t = (t >> 5) - 1;
37331 +#endif
37332 +               TEST_LOOKBEHIND(m_pos,out); assert("lzo-24", t > 0); NEED_OP(t+3-1);
37333 +               goto copy_match;
37334 +#endif
37335 +           }
37336 +           else if (t >= 32) /* length in the low 5 bits (0 = extended), offset taken from the next two bytes */
37337 +           {
37338 +               t &= 31;
37339 +               if (t == 0)
37340 +               {
37341 +                   NEED_IP(1);
37342 +                   while (*ip == 0)
37343 +                   {
37344 +                       t += 255;
37345 +                       ip++;
37346 +                       NEED_IP(1);
37347 +                   }
37348 +                   t += 31 + *ip++;
37349 +               }
37350 +#if defined(COPY_DICT)
37351 +#if defined(LZO1Z)
37352 +               m_off = 1 + (ip[0] << 6) + (ip[1] >> 2);
37353 +               last_m_off = m_off;
37354 +#else
37355 +               m_off = 1 + (ip[0] >> 2) + (ip[1] << 6);
37356 +#endif
37357 +#else
37358 +#if defined(LZO1Z)
37359 +               {
37360 +                   lzo_uint off = 1 + (ip[0] << 6) + (ip[1] >> 2);
37361 +                   m_pos = op - off;
37362 +                   last_m_off = off;
37363 +               }
37364 +#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
37365 +               m_pos = op - 1;
37366 +               m_pos -= (* (const lzo_ushortp) ip) >> 2;
37367 +#else
37368 +               m_pos = op - 1;
37369 +               m_pos -= (ip[0] >> 2) + (ip[1] << 6);
37370 +#endif
37371 +#endif
37372 +               ip += 2;
37373 +           }
37374 +           else if (t >= 16) /* length in the low 3 bits (0 = extended); bit 3 contributes 0x4000 to the offset; a zero offset is the end-of-stream marker */
37375 +           {
37376 +#if defined(COPY_DICT)
37377 +               m_off = (t & 8) << 11;
37378 +#else
37379 +               m_pos = op;
37380 +               m_pos -= (t & 8) << 11;
37381 +#endif
37382 +               t &= 7;
37383 +               if (t == 0)
37384 +               {
37385 +                   NEED_IP(1);
37386 +                   while (*ip == 0)
37387 +                   {
37388 +                       t += 255;
37389 +                       ip++;
37390 +                       NEED_IP(1);
37391 +                   }
37392 +                   t += 7 + *ip++;
37393 +               }
37394 +#if defined(COPY_DICT)
37395 +#if defined(LZO1Z)
37396 +               m_off += (ip[0] << 6) + (ip[1] >> 2);
37397 +#else
37398 +               m_off += (ip[0] >> 2) + (ip[1] << 6);
37399 +#endif
37400 +               ip += 2;
37401 +               if (m_off == 0)
37402 +                   goto eof_found;
37403 +               m_off += 0x4000;
37404 +#if defined(LZO1Z)
37405 +               last_m_off = m_off;
37406 +#endif
37407 +#else
37408 +#if defined(LZO1Z)
37409 +               m_pos -= (ip[0] << 6) + (ip[1] >> 2);
37410 +#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
37411 +               m_pos -= (* (const lzo_ushortp) ip) >> 2;
37412 +#else
37413 +               m_pos -= (ip[0] >> 2) + (ip[1] << 6);
37414 +#endif
37415 +               ip += 2;
37416 +               if (m_pos == op)
37417 +                   goto eof_found;
37418 +               m_pos -= 0x4000;
37419 +#if defined(LZO1Z)
37420 +               last_m_off = op - m_pos;
37421 +#endif
37422 +#endif
37423 +           }
37424 +           else /* t < 16 here: a 2-byte match, copied in place */
37425 +           {
37426 +#if defined(COPY_DICT)
37427 +#if defined(LZO1Z)
37428 +               m_off = 1 + (t << 6) + (*ip++ >> 2);
37429 +               last_m_off = m_off;
37430 +#else
37431 +               m_off = 1 + (t >> 2) + (*ip++ << 2);
37432 +#endif
37433 +               NEED_OP(2);
37434 +               t = 2; COPY_DICT(t,m_off)
37435 +#else
37436 +#if defined(LZO1Z)
37437 +               t = 1 + (t << 6) + (*ip++ >> 2);
37438 +               m_pos = op - t;
37439 +               last_m_off = t;
37440 +#else
37441 +               m_pos = op - 1;
37442 +               m_pos -= t >> 2;
37443 +               m_pos -= *ip++ << 2;
37444 +#endif
37445 +               TEST_LOOKBEHIND(m_pos,out); NEED_OP(2);
37446 +               *op++ = *m_pos++; *op++ = *m_pos;
37447 +#endif
37448 +               goto match_done;
37449 +           }
37450 +/* Copy the match: every variant below moves t + 2 bytes in total. */
37451 +#if defined(COPY_DICT)
37452 +
37453 +           NEED_OP(t+3-1);
37454 +           t += 3-1; COPY_DICT(t,m_off)
37455 +
37456 +#else
37457 +
37458 +           TEST_LOOKBEHIND(m_pos,out); assert("lzo-25", t > 0); NEED_OP(t+3-1);
37459 +#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
37460 +#if !defined(LZO_UNALIGNED_OK_4)
37461 +           if (t >= 2 * 4 - (3 - 1) && PTR_ALIGNED2_4(op,m_pos))
37462 +           {
37463 +               assert((op - m_pos) >= 4);
37464 +#else
37465 +           if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4)
37466 +           {
37467 +#endif
37468 +               COPY4(op,m_pos);
37469 +               op += 4; m_pos += 4; t -= 4 - (3 - 1);
37470 +               do {
37471 +                   COPY4(op,m_pos);
37472 +                   op += 4; m_pos += 4; t -= 4;
37473 +               } while (t >= 4);
37474 +               if (t > 0) do *op++ = *m_pos++; while (--t > 0);
37475 +           }
37476 +           else
37477 +#endif
37478 +           {
37479 +copy_match:
37480 +               *op++ = *m_pos++; *op++ = *m_pos++;
37481 +               do *op++ = *m_pos++; while (--t > 0);
37482 +           }
37483 +
37484 +#endif
37485 +/* After a match the two low bits of ip[-2] (ip[-1] for LZO1Z) give the number of literal bytes that follow; 0 leaves the match loop. */
37486 +match_done:
37487 +#if defined(LZO1Z)
37488 +           t = ip[-1] & 3;
37489 +#else
37490 +           t = ip[-2] & 3;
37491 +#endif
37492 +           if (t == 0)
37493 +               break;
37494 +
37495 +match_next:
37496 +           assert("lzo-26", t > 0); NEED_OP(t); NEED_IP(t+1);
37497 +           do *op++ = *ip++; while (--t > 0);
37498 +           t = *ip++; /* next instruction byte, decoded at `match' on the following pass */
37499 +       }
37500 +    }
37501 +/* Only reachable when a loop test failed, i.e. the stream ended before an end-of-stream marker was decoded. */
37502 +#if defined(HAVE_TEST_IP) || defined(HAVE_TEST_OP)
37503 +    *out_len = op - out;
37504 +    return LZO_E_EOF_NOT_FOUND;
37505 +#endif
37506 +/* End-of-stream marker decoded: LZO_E_OK only if the input was consumed exactly up to ip_end. */
37507 +eof_found:
37508 +    assert("lzo-27", t == 1);
37509 +    *out_len = op - out;
37510 +    return (ip == ip_end ? LZO_E_OK :
37511 +          (ip < ip_end  ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN));
37512 +
37513 +#if defined(HAVE_NEED_IP)
37514 +input_overrun:
37515 +    *out_len = op - out;
37516 +    return LZO_E_INPUT_OVERRUN;
37517 +#endif
37518 +
37519 +#if defined(HAVE_NEED_OP)
37520 +output_overrun:
37521 +    *out_len = op - out;
37522 +    return LZO_E_OUTPUT_OVERRUN;
37523 +#endif
37524 +
37525 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
37526 +lookbehind_overrun:
37527 +    *out_len = op - out;
37528 +    return LZO_E_LOOKBEHIND_OVERRUN;
37529 +#endif
37530 +}
37531 +
37532 +/***** End of minilzo.c *****/
37533 +
37534 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/compress/minilzo.h linux-2.6.8-rc3-a/fs/reiser4/plugin/compress/minilzo.h
37535 --- linux-2.6.8-rc3/fs/reiser4/plugin/compress/minilzo.h        1970-01-01 03:00:00.000000000 +0300
37536 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/compress/minilzo.h      2004-08-05 21:20:53.395594634 +0400
37537 @@ -0,0 +1,100 @@
37538 +/* minilzo.h -- mini subset of the LZO real-time data compression library
37539 +
37540 +   This file is part of the LZO real-time data compression library.
37541 +
37542 +   Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
37543 +   Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
37544 +   Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
37545 +   Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
37546 +   Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
37547 +   Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
37548 +   Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
37549 +   All Rights Reserved.
37550 +
37551 +   The LZO library is free software; you can redistribute it and/or
37552 +   modify it under the terms of the GNU General Public License as
37553 +   published by the Free Software Foundation; either version 2 of
37554 +   the License, or (at your option) any later version.
37555 +
37556 +   The LZO library is distributed in the hope that it will be useful,
37557 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
37558 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
37559 +   GNU General Public License for more details.
37560 +
37561 +   You should have received a copy of the GNU General Public License
37562 +   along with the LZO library; see the file COPYING.
37563 +   If not, write to the Free Software Foundation, Inc.,
37564 +   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
37565 +
37566 +   Markus F.X.J. Oberhumer
37567 +   <markus@oberhumer.com>
37568 +   http://www.oberhumer.com/opensource/lzo/
37569 + */
37570 +
37571 +/*
37572 + * NOTE:
37573 + *   the full LZO package can be found at
37574 + *   http://www.oberhumer.com/opensource/lzo/
37575 + */
37576 +
37577 +
37578 +#ifndef __MINILZO_H
37579 +#define __MINILZO_H
37580 +
37581 +#define MINILZO_VERSION         0x1080
37582 +
37583 +#ifdef __LZOCONF_H
37584 +#  error "you cannot use both LZO and miniLZO"
37585 +#endif
37586 +
37587 +#undef LZO_HAVE_CONFIG_H
37588 +#include "lzoconf.h"
37589 +
37590 +#if !defined(LZO_VERSION) || (LZO_VERSION != MINILZO_VERSION)
37591 +#  error "version mismatch in header files"
37592 +#endif
37593 +
37594 +
37595 +#ifdef __cplusplus
37596 +extern "C" {
37597 +#endif
37598 +
37599 +
37600 +/***********************************************************************
37601 +//
37602 +************************************************************************/
37603 +
37604 +/* Memory required for the wrkmem parameter.
37605 + * When the required size is 0, you can also pass a NULL pointer.
37606 + */
37607 +/* LZO1X_MEM_COMPRESS is an alias for the LZO1X-1 requirement: 16384 entries of lzo_sizeof_dict_t bytes each. */
37608 +#define LZO1X_MEM_COMPRESS      LZO1X_1_MEM_COMPRESS
37609 +#define LZO1X_1_MEM_COMPRESS    ((lzo_uint32) (16384L * lzo_sizeof_dict_t))
37610 +#define LZO1X_MEM_DECOMPRESS    (0)
37611 +
37612 +
37613 +/* compression; wrkmem has to provide LZO1X_1_MEM_COMPRESS bytes */
37614 +LZO_EXTERN(int)
37615 +lzo1x_1_compress        ( const lzo_byte *src, lzo_uint  src_len,
37616 +                                lzo_byte *dst, lzo_uintp dst_len,
37617 +                                lzo_voidp wrkmem );
37618 +
37619 +/* decompression; LZO1X_MEM_DECOMPRESS is 0, so wrkmem may be NULL */
37620 +LZO_EXTERN(int)
37621 +lzo1x_decompress        ( const lzo_byte *src, lzo_uint  src_len,
37622 +                                lzo_byte *dst, lzo_uintp dst_len,
37623 +                                lzo_voidp wrkmem /* NOT USED */ );
37624 +
37625 +/* safe decompression with overrun testing; wrkmem may be NULL here as well */
37626 +LZO_EXTERN(int)
37627 +lzo1x_decompress_safe   ( const lzo_byte *src, lzo_uint  src_len,
37628 +                                lzo_byte *dst, lzo_uintp dst_len,
37629 +                                lzo_voidp wrkmem /* NOT USED */ );
37630 +
37631 +
37632 +#ifdef __cplusplus
37633 +} /* extern "C" */
37634 +#endif
37635 +
37636 +#endif /* already included */
37637 +
37638 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/cryptcompress.c linux-2.6.8-rc3-a/fs/reiser4/plugin/cryptcompress.c
37639 --- linux-2.6.8-rc3/fs/reiser4/plugin/cryptcompress.c   1970-01-01 03:00:00.000000000 +0300
37640 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/cryptcompress.c 2004-08-05 21:20:52.801719897 +0400
37641 @@ -0,0 +1,3087 @@
37642 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README
37643 +
37644 +This file contains all cluster operations and methods of the reiser4
37645 +cryptcompress object plugin (see http://www.namesys.com/cryptcompress_design.html
37646 +for details).
37647 +The list of cryptcompress specific EA:
37648 +
37649 +                 Incore inode                               Disk stat-data
37650 +********************************************************************************************
37651 +* data structure       *         field        * data structure       *          field      *
37652 +********************************************************************************************
37653 +* plugin_set           *file plugin id        * reiser4_plugin_stat  *file plugin id       *
37654 +*                      *crypto plugin id      *                      *crypto plugin id     *
37655 +*                      *digest plugin id      *                      *digest plugin id     *
37656 +*                      *compression plugin id *                      *compression plugin id*
37657 +********************************************************************************************
37658 +* crypto_stat_t        *      keysize         * reiser4_crypto_stat  *      keysize        *
37659 +*                      *      keyid           *                      *      keyid          *
37660 +********************************************************************************************
37661 +* cluster_stat_t       *      cluster_shift   * reiser4_cluster_stat *      cluster_shift  *
37662 +********************************************************************************************
37663 +* cryptcompress_info_t *      crypto_tfm      *                      *                     *
37664 +********************************************************************************************
37665 +*/
37666 +#include "../debug.h"
37667 +#include "../inode.h"
37668 +#include "../jnode.h"
37669 +#include "../tree.h"
37670 +#include "../page_cache.h"
37671 +#include "../readahead.h"
37672 +#include "../forward.h"
37673 +#include "../super.h"
37674 +#include "../context.h"
37675 +#include "../cluster.h"
37676 +#include "../seal.h"
37677 +#include "plugin.h"
37678 +#include "object.h"
37679 +#include "file/funcs.h"
37680 +
37681 +#include <asm/scatterlist.h>
37682 +#include <linux/writeback.h>
37683 +#include <linux/pagemap.h>
37684 +#include <linux/crypto.h>
37685 +#include <linux/swap.h>
37686 +
37687 +int do_readpage_ctail(reiser4_cluster_t *, struct page * page);
37688 +int ctail_read_cluster (reiser4_cluster_t *, struct inode *, int);
37689 +reiser4_key * append_cluster_key_ctail(const coord_t *, reiser4_key *);
37690 +int setattr_reserve(reiser4_tree *);
37691 +int reserve_cut_iteration(reiser4_tree *);
37692 +int writepage_ctail(struct page *);
37693 +int truncate_jnodes_range(struct inode *inode, unsigned long from, int count);
37694 +int cut_file_items(struct inode *inode, loff_t new_size, int update_sd, loff_t cur_size, int mode);
37695 +int delete_object(struct inode *inode, int mode);
37696 +__u8 cluster_shift_by_coord(const coord_t * coord);
37697 +int ctail_make_unprepped_cluster(reiser4_cluster_t * clust, struct inode * inode);
37698 +unsigned long clust_by_coord(const coord_t * coord);
37699 +int hint_is_set(const hint_t *hint);
37700 +reiser4_plugin * get_default_plugin(pset_member memb);
37701 +
37702 +/* Return the cryptcompress-specific portion of @inode: a pointer to the cryptcompress_info member of file_plugin_data inside the inode's reiser4 data; nothing is allocated. */
37703 +reiser4_internal cryptcompress_info_t *
37704 +cryptcompress_inode_data(const struct inode * inode)
37705 +{
37706 +       return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
37707 +}
37708 +
37709 +/* plugin->u.file.init_inode_data: reset the cryptcompress part of @inode */
37710 +reiser4_internal void
37711 +init_inode_data_cryptcompress(struct inode *inode,
37712 +                             reiser4_object_create_data *crd, int create)
37713 +{
37714 +       /* neither @crd nor @create is consulted here */
37715 +       cryptcompress_info_t * data = cryptcompress_inode_data(inode);
37716 +
37717 +       assert("edward-685", data != NULL);
37718 +       /* wipe the whole structure so that every field starts out as zero */
37719 +       xmemset(data, 0, sizeof *data);
37720 +}
37721 +/* Sanity check: 1 when the cluster shift is within MAX_CLUSTER_SHIFT and neither transform slot is occupied; otherwise asserts and returns 0. */
37722 +reiser4_internal int
37723 +crc_inode_ok(struct inode * inode)
37724 +{
37725 +       int shift_ok = (reiser4_inode_data(inode)->cluster_shift <= MAX_CLUSTER_SHIFT);
37726 +       cryptcompress_info_t * crc = cryptcompress_inode_data(inode);
37727 +       if (!shift_ok || crc->tfm[CRYPTO_TFM] != NULL ||
37728 +           crc->tfm[DIGEST_TFM] != NULL) {
37729 +               /* inconsistent state: trip the assertion, else report failure */
37730 +               assert("edward-686", 0);
37731 +               return 0;
37732 +       }
37733 +       return 1;
37734 +}
37735 +/* Return the crypto_stat_t pointer stored in the `crypt' field of @inode's reiser4 data; the pointer itself is not checked here. */
37736 +reiser4_internal crypto_stat_t * inode_crypto_stat (struct inode * inode)
37737 +{
37738 +       assert("edward-90", inode != NULL);
37739 +       assert("edward-91", reiser4_inode_data(inode) != NULL);
37740 +       return (reiser4_inode_data(inode)->crypt);
37741 +}
37742 +
37743 +/* NOTE-EDWARD: Do not use crypto without digest. Allocates both transforms for @inode through the plugins named in @data; 0 or the failing alloc() result. */
37744 +static int
37745 +alloc_crypto_tfm(struct inode * inode, crypto_data_t * data)
37746 +{
37747 +       int err;
37748 +       crypto_plugin * cplug = crypto_plugin_by_id(data->cra);
37749 +       digest_plugin * dplug = digest_plugin_by_id(data->dia);
37750 +
37751 +       assert("edward-414", dplug != NULL);
37752 +       assert("edward-415", cplug != NULL);
37753 +       /* the digest transform goes first, the crypto transform second */
37754 +       err = dplug->alloc(inode);
37755 +       if (err != 0)
37756 +               return err;
37757 +       err = cplug->alloc(inode);
37758 +       if (err == 0)
37759 +               return 0;
37760 +       /* second allocation failed: give the digest transform back */
37761 +       dplug->free(inode);
37762 +       return err;
37763 +}
37764 +/* Release the transforms of @inode through the free() methods of its crypto and digest plugins; does nothing when inode_get_crypto() returns NULL. */
37765 +static void
37766 +free_crypto_tfm(struct inode * inode)
37767 +{
37768 +       reiser4_inode * info;
37769 +
37770 +       assert("edward-410", inode != NULL);
37771 +
37772 +       info = reiser4_inode_data(inode); /* NOTE(review): assigned but never read below */
37773 +
37774 +       if (!inode_get_crypto(inode))
37775 +               return;
37776 +
37777 +       assert("edward-411", inode_crypto_plugin(inode));
37778 +       assert("edward-763", inode_digest_plugin(inode));
37779 +       /* crypto transform first, digest transform second */
37780 +       inode_crypto_plugin(inode)->free(inode);
37781 +       inode_digest_plugin(inode)->free(inode);
37782 +}
37783 +/* Allocate a crypto_stat_t for @inode, fill stat->keyid with the digest (inode's digest transform) of the key id given in @data, and store it in the inode's `crypt' field. Returns 0 or -ENOMEM. */
37784 +static int
37785 +attach_crypto_stat(struct inode * inode, crypto_data_t * data)
37786 +{
37787 +       __u8 * txt;
37788 +
37789 +       crypto_stat_t * stat;
37790 +       struct scatterlist sg;
37791 +       struct crypto_tfm * dtfm;
37792 +
37793 +       assert("edward-690", inode_get_crypto(inode));
37794 +       assert("edward-766", inode_get_digest(inode));
37795 +
37796 +       dtfm =  inode_get_digest(inode);
37797 +
37798 +       stat = reiser4_kmalloc(sizeof(*stat), GFP_KERNEL);
37799 +       if (!stat)
37800 +               return -ENOMEM;
37801 +       /* the keyid buffer is sized to hold one digest of dtfm */
37802 +       stat->keyid = reiser4_kmalloc((size_t)crypto_tfm_alg_digestsize(dtfm), GFP_KERNEL);
37803 +       if (!stat->keyid) {
37804 +               reiser4_kfree(stat);
37805 +               return -ENOMEM;
37806 +       }
37807 +       txt = reiser4_kmalloc(data->keyid_size, GFP_KERNEL); /* temporary copy of the caller's key id, freed before returning */
37808 +       if (!txt) {
37809 +               reiser4_kfree(stat->keyid);
37810 +               reiser4_kfree(stat);
37811 +               return -ENOMEM;
37812 +       }
37813 +       xmemcpy(txt, data->keyid, data->keyid_size);
37814 +       sg.page = virt_to_page (txt);
37815 +       sg.offset = offset_in_page (txt);
37816 +       sg.length = data->keyid_size;
37817 +       /* stat->keyid := digest of the single scatterlist entry describing txt */
37818 +       crypto_digest_init (dtfm);
37819 +       crypto_digest_update (dtfm, &sg, 1);
37820 +       crypto_digest_final (dtfm, stat->keyid);
37821 +
37822 +       reiser4_inode_data(inode)->crypt = stat;
37823 +       reiser4_kfree(txt);
37824 +
37825 +       return 0;
37826 +}
37827 +/* Free the crypto_stat_t of @object together with its keyid buffer; returns early when inode_get_crypto() is NULL. The inode's `crypt' pointer is not reset here. */
37828 +static void
37829 +detach_crypto_stat(struct inode * object)
37830 +{
37831 +       crypto_stat_t * stat;
37832 +
37833 +       stat = inode_crypto_stat(object);
37834 +       /* NOTE(review): crc_inode_ok() demands both tfm slots be NULL; if inode_get_crypto() reads tfm[CRYPTO_TFM], the early return below is always taken - confirm */
37835 +       assert("edward-691", crc_inode_ok(object));
37836 +
37837 +       if (!inode_get_crypto(object))
37838 +               return;
37839 +
37840 +       assert("edward-412", stat != NULL);
37841 +
37842 +       reiser4_kfree(stat->keyid);
37843 +       reiser4_kfree(stat);
37844 +}
37845 +/* Fill @data with the default crypto parameters: all fields zero except the ids of the default crypto and digest plugins. */
37846 +static void
37847 +init_default_crypto(crypto_data_t * data)
37848 +{
37849 +       assert("edward-692", data != NULL);
37850 +       /* begin with an all-zero descriptor */
37851 +       xmemset(data, 0, sizeof *data);
37852 +       /* then record which plugins are the defaults */
37853 +       data->cra = get_default_plugin(PSET_CRYPTO)->h.id;
37854 +       data->dia = get_default_plugin(PSET_DIGEST)->h.id;
37855 +       /* void function: nothing to hand back */
37856 +}
37857 +/* Fill @data with the default compression parameters: everything zero except coa, which gets the id of the default compression plugin. */
37858 +static void
37859 +init_default_compression(compression_data_t * data)
37860 +{
37861 +       assert("edward-693", data != NULL);
37862 +
37863 +       xmemset(data, 0, sizeof(*data));
37864 +
37865 +       data->coa = get_default_plugin(PSET_COMPRESSION)->h.id;
37866 +}
37867 +/* Store the default cluster shift (DEFAULT_CLUSTER_SHIFT) in *@data. */
37868 +static void
37869 +init_default_cluster(cluster_data_t * data)
37870 +{
37871 +       assert("edward-694", data != NULL);
37872 +
37873 +       *data = DEFAULT_CLUSTER_SHIFT;
37874 +}
37875 +
37876 +/* Install crypto on @object: 1) fill crypto specific part of inode (plugins, transforms, key; defaults are used when @data is NULL),
37877 +   2) set inode crypto stat which is supposed to be saved in stat-data. Returns 0 on success or when no crypto transform is in use, else an error code. */
37878 +static int
37879 +inode_set_crypto(struct inode * object, crypto_data_t * data)
37880 +{
37881 +       int result;
37882 +       crypto_data_t def;
37883 +       struct crypto_tfm * tfm;
37884 +       crypto_plugin * cplug;
37885 +       digest_plugin * dplug;
37886 +       reiser4_inode * info = reiser4_inode_data(object);
37887 +
37888 +       if (!data) {
37889 +               init_default_crypto(&def);
37890 +               data = &def;
37891 +       }
37892 +       cplug = crypto_plugin_by_id(data->cra);
37893 +       dplug = digest_plugin_by_id(data->dia);
37894 +
37895 +       plugin_set_crypto(&info->pset, cplug);
37896 +       plugin_set_digest(&info->pset, dplug);
37897 +
37898 +       result = alloc_crypto_tfm(object, data);
37899 +       if (result) /* non-zero means the transforms could not be allocated: stop here */
37900 +               return result;
37901 +
37902 +       if (!inode_get_crypto(object))
37903 +               /* nothing to do anymore */
37904 +               return 0;
37905 +
37906 +       assert("edward-416", data != NULL);
37907 +       assert("edward-414", dplug != NULL);
37908 +       assert("edward-415", cplug != NULL);
37909 +       assert("edward-417", data->key!= NULL);
37910 +       assert("edward-88", data->keyid != NULL);
37911 +       assert("edward-83", data->keyid_size != 0);
37912 +       assert("edward-89", data->keysize != 0);
37913 +
37914 +       tfm = inode_get_tfm(object, CRYPTO_TFM);
37915 +       assert("edward-695", tfm != NULL);
37916 +       /* install the secret key into the freshly allocated crypto transform */
37917 +       result = cplug->setkey(tfm, data->key, data->keysize);
37918 +       if (result) {
37919 +               free_crypto_tfm(object);
37920 +               return result;
37921 +       }
37922 +       assert ("edward-34", !inode_get_flag(object, REISER4_SECRET_KEY_INSTALLED));
37923 +       inode_set_flag(object, REISER4_SECRET_KEY_INSTALLED);
37924 +
37925 +       info->extmask |= (1 << CRYPTO_STAT);
37926 +       /* build the crypto stat (digest of the key id) that goes to stat-data */
37927 +       result = attach_crypto_stat(object, data);
37928 +       if (result)
37929 +               goto error;
37930 +
37931 +       info->plugin_mask |= (1 << PSET_CRYPTO) | (1 << PSET_DIGEST);
37932 +
37933 +       return 0;
37934 + error:
37935 +       free_crypto_tfm(object);
37936 +       inode_clr_flag(object, REISER4_SECRET_KEY_INSTALLED);
37937 +       return result;
37938 +}
37939 +/* Select the compression plugin of @object from @data->coa (default parameters when @data is NULL) and flag PSET_COMPRESSION in the plugin mask. */
37940 +static void
37941 +inode_set_compression(struct inode * object, compression_data_t * data)
37942 +{
37943 +       compression_data_t fallback;
37944 +       reiser4_inode * r4_inode = reiser4_inode_data(object);
37945 +
37946 +       if (data == NULL) {
37947 +               /* the caller supplied no parameters: use the default ones */
37948 +               init_default_compression(&fallback);
37949 +               data = &fallback;
37950 +       }
37951 +       plugin_set_compression(&r4_inode->pset,
37952 +                              compression_plugin_by_id(data->coa));
37953 +       r4_inode->plugin_mask |= (1 << PSET_COMPRESSION);
37954 +}
37955 +/* Record the cluster shift of @object (the default one, with a log message, when @data is NULL) and flag CLUSTER_STAT in the extension mask. Always returns 0. */
37956 +static int
37957 +inode_set_cluster(struct inode * object, cluster_data_t * data)
37958 +{
37959 +       cluster_data_t fallback;
37960 +       reiser4_inode * r4_inode;
37961 +
37962 +       assert("edward-696", object != NULL);
37963 +       r4_inode = reiser4_inode_data(object);
37964 +
37965 +       if (data == NULL) {
37966 +               /* NOTE-EDWARD:
37967 +                  this is necessary parameter for cryptcompress objects! */
37968 +               printk("edward-418, create_cryptcompress: default cluster size (4K) was assigned\n");
37969 +               init_default_cluster(&fallback);
37970 +               data = &fallback;
37971 +       }
37972 +       assert("edward-697", *data <= MAX_CLUSTER_SHIFT);
37973 +
37974 +       /* remember the shift and note that a cluster stat has to be saved */
37975 +       r4_inode->cluster_shift = *data;
37976 +       r4_inode->extmask |= (1 << CLUSTER_STAT);
37977 +
37978 +       /* there is no failure path: the result is always 0 */
37979 +       return 0;
37980 +}
37981 +
37982 +/* plugin->create() method for crypto-compressed files
37983 +
37984 +. set the file bit in the plugin mask
37985 +. attach crypto info (inode_set_crypto() falls back to defaults if @data->crypto is NULL)
37986 +. attach compression info (defaults if @data->compression is NULL) and cluster info
37987 +. save everything in disk stat-data via write_sd_by_inode_common()
37988 +Returns 0 on success; on failure the crypto transforms and crypto stat are released and the error code is returned. */
37989 +reiser4_internal int
37990 +create_cryptcompress(struct inode *object, struct inode *parent, reiser4_object_create_data * data)
37991 +{
37992 +       int result;
37993 +       reiser4_inode * info;
37994 +
37995 +       assert("edward-23", object != NULL);
37996 +       assert("edward-24", parent != NULL);
37997 +       assert("edward-30", data != NULL);
37998 +       assert("edward-26", inode_get_flag(object, REISER4_NO_SD));
37999 +       assert("edward-27", data->id == CRC_FILE_PLUGIN_ID);
38000 +
38001 +       info = reiser4_inode_data(object);
38002 +
38003 +       assert("edward-29", info != NULL);
38004 +
38005 +       /* set file bit */
38006 +       info->plugin_mask |= (1 << PSET_FILE);
38007 +
38008 +       /* set crypto */
38009 +       result = inode_set_crypto(object, data->crypto);
38010 +       if (result)
38011 +               goto error;
38012 +
38013 +       /* set compression */
38014 +       inode_set_compression(object, data->compression);
38015 +
38016 +       /* set cluster info */
38017 +       result = inode_set_cluster(object, data->cluster);
38018 +       if (result)
38019 +               goto error;
38020 +       /* set plugin mask */
38021 +       info->extmask |= (1 << PLUGIN_STAT);
38022 +
38023 +       /* save everything in disk stat-data */
38024 +       result = write_sd_by_inode_common(object);
38025 +       if (!result)
38026 +               return 0;
38027 +       /* save() method failed, release attached crypto info */
38028 +       inode_clr_flag(object, REISER4_CRYPTO_STAT_LOADED);
38029 +       inode_clr_flag(object, REISER4_CLUSTER_KNOWN);
38030 + error: /* common unwind: also reached when setting crypto or cluster info failed */
38031 +       free_crypto_tfm(object);
38032 +       detach_crypto_stat(object);
38033 +       inode_clr_flag(object, REISER4_SECRET_KEY_INSTALLED);
38034 +       return result;
38035 +}
38036 +
38037 +reiser4_internal int open_cryptcompress(struct inode * inode, struct file * file)
38038 +{
38039 +       /* FIXME-EDWARD: should be powered by key management; for now this only asserts that @inode uses the CRC file plugin, @file is not used */
38040 +       assert("edward-698", inode_file_plugin(inode) == file_plugin_by_id(CRC_FILE_PLUGIN_ID));
38041 +       return 0; /* always succeeds */
38042 +}
38043 +
38044 +/* plugin->destroy_inode(): free the crypto transform, detach the crypto stat if it was loaded, and clear the cluster/crypto state flags of @inode */
38045 +reiser4_internal void
38046 +destroy_inode_cryptcompress(struct inode * inode)
38047 +{
38048 +       assert("edward-802", inode_file_plugin(inode) == file_plugin_by_id(CRC_FILE_PLUGIN_ID)); /* must be a cryptcompress file, same check as in open_cryptcompress() */
38049 +       assert("edward-803", !is_bad_inode(inode) && is_inode_loaded(inode));
38050 +       assert("edward-804", inode_get_flag(inode, REISER4_CLUSTER_KNOWN));
38051 +
38052 +       free_crypto_tfm(inode);
38053 +       if (inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) /* detach only what was actually loaded */
38054 +               detach_crypto_stat(inode);
38055 +       inode_clr_flag(inode, REISER4_CLUSTER_KNOWN);
38056 +       inode_clr_flag(inode, REISER4_CRYPTO_STAT_LOADED);
38057 +       inode_clr_flag(inode, REISER4_SECRET_KEY_INSTALLED);
38058 +}
38059 +
38060 +static int
38061 +save_len_cryptcompress_plugin(struct inode * inode, reiser4_plugin * plugin) /* ->save_len of cryptcompress_plugin_ops */
38062 +{
38063 +       assert("edward-457", inode != NULL);
38064 +       assert("edward-458", plugin != NULL);
38065 +       assert("edward-459", plugin->h.id == CRC_FILE_PLUGIN_ID);
38066 +       return 0; /* the reported length is always 0 */
38067 +}
38068 +
38069 +reiser4_internal int
38070 +load_cryptcompress_plugin(struct inode * inode, reiser4_plugin * plugin, char **area, int *len) /* ->load of cryptcompress_plugin_ops; @plugin, @area and @len are not used */
38071 +{
38072 +       assert("edward-455", inode != NULL);
38073 +       assert("edward-456", (reiser4_inode_data(inode)->pset != NULL));
38074 +
38075 +       plugin_set_file(&reiser4_inode_data(inode)->pset, file_plugin_by_id(CRC_FILE_PLUGIN_ID)); /* install the CRC file plugin in the inode's plugin set; any result is ignored */
38076 +       return 0; /* always succeeds */
38077 +}
38078 +
38079 +static int
38080 +change_crypto_file(struct inode * inode, reiser4_plugin * plugin) /* ->change of cryptcompress_plugin_ops; both arguments are unused */
38081 +{
38082 +       /* cannot change object plugin of already existing object */
38083 +       return RETERR(-EINVAL);
38084 +}
38085 +
38086 +struct reiser4_plugin_ops cryptcompress_plugin_ops = { /* plugin operations table for the cryptcompress file plugin */
38087 +       .load      = load_cryptcompress_plugin, /* installs the CRC file plugin in the inode's plugin set */
38088 +       .save_len  = save_len_cryptcompress_plugin, /* always reports 0 */
38089 +       .save      = NULL, /* no save method is provided */
38090 +       .alignment = 8,
38091 +       .change    = change_crypto_file /* always refuses with RETERR(-EINVAL) */
38092 +};
38093 +
38094 +/* returns translated offset: @src_off itself when the inode has no crypto info or @src_off is the offset of max_key(), otherwise the result of the crypto plugin's ->scale() */
38095 +reiser4_internal loff_t inode_scaled_offset (struct inode * inode,
38096 +                                            const loff_t src_off /* input offset */)
38097 +{
38098 +       assert("edward-97", inode != NULL);
38099 +
38100 +       if (!inode_get_crypto(inode) || src_off == get_key_offset(max_key()))
38101 +               return src_off; /* nothing to scale */
38102 +
38103 +       return inode_crypto_plugin(inode)->scale(inode, crypto_blocksize(inode), src_off);
38104 +}
38105 +
38106 +/* returns disk cluster size, i.e. inode_cluster_size() passed through inode_scaled_offset(); the cluster size must already be known for @inode */
38107 +reiser4_internal size_t
38108 +inode_scaled_cluster_size (struct inode * inode)
38109 +{
38110 +       assert("edward-110", inode != NULL);
38111 +       assert("edward-111", inode_get_flag(inode, REISER4_CLUSTER_KNOWN));
38112 +
38113 +       return inode_scaled_offset(inode, inode_cluster_size(inode));
38114 +}
38115 +
38116 +reiser4_internal void reiser4_cluster_init (reiser4_cluster_t * clust){ /* reset cluster handle @clust to its initial state */
38117 +       assert("edward-84", clust != NULL);
38118 +       xmemset(clust, 0, sizeof *clust); /* zero every field of the handle */
38119 +       clust->stat = DATA_CLUSTER; /* the only field given a non-zero initial value here */
38120 +}
38121 +
38122 +/* release cluster's data: free @clust->buf if one is attached and reset the pointer; @clust->bsize is left as it was */
38123 +reiser4_internal void
38124 +release_cluster_buf(reiser4_cluster_t * clust)
38125 +{
38126 +       assert("edward-121", clust != NULL);
38127 +
38128 +       if (clust->buf) {
38129 +               assert("edward-615", clust->bsize != 0); /* an attached buffer is expected to have a recorded size */
38130 +               reiser4_kfree(clust->buf);
38131 +               clust->buf = NULL; /* so a repeated call is a no-op */
38132 +       }
38133 +}
38134 +
38135 +reiser4_internal void
38136 +put_cluster_data(reiser4_cluster_t * clust) /* free the cluster buffer, then wipe the whole handle */
38137 +{
38138 +       assert("edward-435", clust != NULL);
38139 +
38140 +       release_cluster_buf(clust);
38141 +       xmemset(clust, 0, sizeof *clust); /* zeroes all fields, ->stat included */
38142 +}
38143 +
38144 +/* returns true if we don't need to read new cluster from disk; the only thing checked is that the handle has a buffer attached */
38145 +reiser4_internal int cluster_is_uptodate (reiser4_cluster_t * clust)
38146 +{
38147 +       assert("edward-126", clust != NULL);
38148 +       return (clust->buf != NULL);
38149 +}
38150 +
38151 +/* return true if the cluster contains specified page, i.e. pg_to_clust() of the page index equals @clust->index; the cluster size must be known for @inode */
38152 +reiser4_internal int
38153 +page_of_cluster(struct page * page, reiser4_cluster_t * clust, struct inode * inode)
38154 +{
38155 +       assert("edward-162", page != NULL);
38156 +       assert("edward-163", clust != NULL);
38157 +       assert("edward-164", inode != NULL);
38158 +       assert("edward-165", inode_get_flag(inode, REISER4_CLUSTER_KNOWN));
38159 +
38160 +       return (pg_to_clust(page->index, inode) == clust->index);
38161 +}
38162 +
38163 +reiser4_internal int count_to_nrpages(unsigned count) /* presumably the number of pages needed to hold @count bytes (off_to_pg() is defined elsewhere) */
38164 +{
38165 +       return (!count ? 0 : off_to_pg(count - 1) + 1); /* 0 for an empty range, else page number of the last byte plus one */
38166 +}
38167 +
38168 +static int
38169 +new_cluster(reiser4_cluster_t * clust, struct inode * inode) /* true if clust_to_off() of the cluster index is not below the current inode->i_size */
38170 +{
38171 +       return (clust_to_off(clust->index, inode) >= inode->i_size);
38172 +}
38173 +
38174 +/* set minimal number of cluster pages (start from first one)
38175 +   which cover hole and user's data: count_to_nrpages(off + count + delta), or 0 if count + delta is 0 */
38176 +static void
38177 +set_nrpages_by_frame(reiser4_cluster_t * clust)
38178 +{
38179 +       assert("edward-180", clust != NULL);
38180 +
38181 +       if (clust->count + clust->delta == 0) {
38182 +               /* nothing to write - nothing to read */
38183 +               clust->nr_pages = 0;
38184 +               return;
38185 +       }
38186 +       clust->nr_pages = count_to_nrpages(clust->off + clust->count + clust->delta);
38187 +}
38188 +
38189 +/* cluster index should be valid; sets @clust->nr_pages to count_to_nrpages() of fsize_to_count(@clust, @inode) */
38190 +reiser4_internal void
38191 +set_nrpages_by_inode(reiser4_cluster_t * clust, struct inode * inode)
38192 +{
38193 +       assert("edward-785", clust != NULL);
38194 +       assert("edward-786", inode != NULL);
38195 +
38196 +       clust->nr_pages = count_to_nrpages(fsize_to_count(clust, inode));
38197 +}
38198 +
38199 +/* plugin->key_by_inode(): build @key for offset @off of @inode; always returns 0 */
38200 +/* see plugin/plugin.h for details */
38201 +reiser4_internal int
38202 +key_by_inode_cryptcompress(struct inode *inode, loff_t off, reiser4_key * key)
38203 +{
38204 +       assert("edward-64", inode != 0);
38205 +       assert("edward-112", ergo(off != get_key_offset(max_key()), !off_to_cloff(off, inode)));
38206 +       /* don't come here with other offsets: @off is either the max key offset or has zero off_to_cloff() */
38207 +
38208 +       key_by_inode_and_offset_common(inode, 0, key); /* build the key with offset 0 first */
38209 +       set_key_offset(key, (__u64) (!inode_crypto_stat(inode) ? off : inode_scaled_offset(inode, off))); /* the offset is scaled only when the inode has a crypto stat */
38210 +       return 0;
38211 +}
38212 +
38213 +/* plugin->flow_by_inode: fill @f from the arguments and, except for writes from user space, set its key via key_by_inode_cryptcompress() */
38214 +reiser4_internal int
38215 +flow_by_inode_cryptcompress(struct inode *inode /* file to build flow for */ ,
38216 +                           char *buf /* user level buffer */ ,
38217 +                           int user    /* 1 if @buf is of user space, 0 - if it is
38218 +                                          kernel space */ ,
38219 +                           loff_t size /* buffer size */ ,
38220 +                           loff_t off /* offset to start io from */ ,
38221 +                           rw_op op /* READ or WRITE */ ,
38222 +                           flow_t * f /* resulting flow */)
38223 +{
38224 +       assert("edward-436", f != NULL);
38225 +       assert("edward-149", inode != NULL);
38226 +       assert("edward-150", inode_file_plugin(inode) != NULL);
38227 +       assert("edward-151", inode_file_plugin(inode)->key_by_inode == key_by_inode_cryptcompress);
38228 +
38229 +
38230 +       f->length = size;
38231 +       f->data = buf;
38232 +       f->user = user;
38233 +       f->op = op;
38234 +
38235 +       if (op == WRITE_OP && user == 1)
38236 +               return 0; /* f->key is left untouched in this case */
38237 +       return key_by_inode_cryptcompress(inode, off, &f->key);
38238 +}
38239 +
38240 +reiser4_internal int
38241 +hint_prev_cluster(reiser4_cluster_t * clust) /* true if the hint's coord is valid and off_to_clust() of the hint offset equals @clust->index */
38242 +{
38243 +       assert("edward-699", clust != NULL);
38244 +       assert("edward-700", clust->hint != NULL);
38245 +
38246 +       if (!clust->hint->coord.valid)
38247 +               return 0; /* an invalid coord never matches */
38248 +       assert("edward-701", clust->file != NULL);
38249 +       assert("edward-702", clust->file->f_dentry != NULL);
38250 +       assert("edward-703", clust->file->f_dentry->d_inode != NULL);
38251 +
38252 +       return (clust->index == off_to_clust(clust->hint->offset, clust->file->f_dentry->d_inode));
38253 +}
38254 +
38255 +static int
38256 +crc_hint_validate(hint_t *hint, const reiser4_key *key, int check_key, znode_lock_mode lock_mode) /* returns 0 if the hint can be used, -E_REPEAT if it is unset or was set with another lock mode, else the result of seal_validate() */
38257 +{
38258 +       assert("edward-704", hint != NULL);
38259 +
38260 +       if (hint->coord.valid) {
38261 +               assert("edward-705", znode_is_any_locked(hint->coord.base_coord.node));
38262 +               return 0; /* coord is already valid, nothing to re-validate */
38263 +       }
38264 +       assert("edward-706", hint->coord.lh->owner == NULL);
38265 +       if (!hint || !hint_is_set(hint) || hint->mode != lock_mode) /* NOTE(review): the !hint test cannot fire, @hint was already dereferenced above */
38266 +               /* hint either not set or set by different operation */
38267 +               return RETERR(-E_REPEAT);
38268 +
38269 +#if 0
38270 +       if (check_key && get_key_offset(key) != hint->offset)
38271 +               /* hint is set for different key */
38272 +               return RETERR(-E_REPEAT);
38273 +#endif
38274 +       assert("edward-707", schedulable()); /* with the block above disabled, @check_key is not used in this function */
38275 +
38276 +       return seal_validate(&hint->seal, &hint->coord.base_coord, key,
38277 +                            hint->level, hint->coord.lh, FIND_MAX_NOT_MORE_THAN, lock_mode, ZNODE_LOCK_LOPRI);
38278 +}
38279 +
38280 +static inline void
38281 +crc_validate_extended_coord(uf_coord_t *uf_coord, loff_t offset) /* let the item plugin found at the coord initialize the coord extension for @offset */
38282 +{
38283 +       //      assert("edward-764", uf_coord->valid == 0);
38284 +       assert("edward-708", item_plugin_by_coord(&uf_coord->base_coord)->s.file.init_coord_extension);
38285 +
38286 +       item_plugin_by_coord(&uf_coord->base_coord)->s.file.init_coord_extension(uf_coord, offset);
38287 +}
38288 +
38289 +static inline void
38290 +crc_invalidate_extended_coord(uf_coord_t *uf_coord) /* drop the cached item plugin of the base coord and mark the extended coord invalid */
38291 +{
38292 +       coord_clear_iplug(&uf_coord->base_coord);
38293 +       uf_coord->valid = 0;
38294 +}
38295 +
38296 +static int all_but_offset_key_eq(const reiser4_key *k1, const reiser4_key *k2) /* true if @k1 and @k2 agree in locality, type, band, ordering and objectid; the offset is not compared */
38297 +{
38298 +       return (get_key_locality(k1) == get_key_locality(k2) &&
38299 +               get_key_type(k1) == get_key_type(k2) &&
38300 +               get_key_band(k1) == get_key_band(k2) &&
38301 +               get_key_ordering(k1) == get_key_ordering(k2) &&
38302 +               get_key_objectid(k1) == get_key_objectid(k2));
38303 +}
38304 +
38305 +/* Search a disk cluster item: first try the coord cached in @hint, and fall back to a tree lookup (coord_by_key) when the hint cannot be used.
38306 +   If result is not cbk_errored current znode is locked */
38307 +reiser4_internal int
38308 +find_cluster_item(hint_t * hint, /* coord, lh, seal */
38309 +                 const reiser4_key *key, /* key of next cluster item to read */
38310 +                 int check_key,
38311 +                 znode_lock_mode lock_mode /* which lock */,
38312 +                 ra_info_t *ra_info,
38313 +                 lookup_bias bias)
38314 +{
38315 +       int result;
38316 +       reiser4_key ikey;
38317 +       coord_t * coord = &hint->coord.base_coord;
38318 +
38319 +       assert("edward-152", hint != NULL);
38320 +
38321 +       if (hint->coord.valid) {
38322 +               assert("edward-709", znode_is_any_locked(coord->node));
38323 +               if (coord->between == BEFORE_ITEM) {
38324 +                       if (equal_to_rdk(coord->node, key)) {
38325 +                               result = goto_right_neighbor(coord, hint->coord.lh);
38326 +                               if (result == -E_NO_NEIGHBOR) { /* NOTE(review): other error codes from goto_right_neighbor() are not checked, @result is overwritten by zload() below */
38327 +                                       crc_invalidate_extended_coord(&hint->coord);
38328 +                                       return CBK_COORD_NOTFOUND;
38329 +                               }
38330 +                       }
38331 +                       coord->between = AT_UNIT;
38332 +                       result = zload(coord->node);
38333 +                       if (result)
38334 +                               return result;
38335 +                       /* check current item */
38336 +
38337 +                       if(!coord_is_existing_item(coord)) {
38338 +                               /* FIXME-EDWARD: This was the last item
38339 +                                  of the object */
38340 +                               crc_invalidate_extended_coord(&hint->coord);
38341 +                               zrelse(coord->node);
38342 +                               longterm_unlock_znode(hint->coord.lh);
38343 +                               goto traverse_tree;
38344 +                       }
38345 +                       item_key_by_coord(coord, &ikey);
38346 +                       if (!all_but_offset_key_eq(key, &ikey)) { /* the item at the coord has a different key apart from the offset */
38347 +                               unset_hint(hint);
38348 +                               zrelse(coord->node);
38349 +                               return CBK_COORD_NOTFOUND;
38350 +                       }
38351 +                       if (get_key_offset(key) == get_key_offset(&ikey)) {
38352 +                               zrelse(coord->node);
38353 +                               return CBK_COORD_FOUND;
38354 +                       }
38355 +                       //assert("edward-765", get_key_offset(key) > get_key_offset(&ikey));
38356 +                       zrelse(coord->node);
38357 +                       return CBK_COORD_NOTFOUND;
38358 +               }
38359 +               else {
38360 +                       assert("edward-710", coord->between == AT_UNIT);
38361 +
38362 +                       /* repeat check with updated @key */
38363 +                       result = zload(coord->node);
38364 +                       if (result)
38365 +                               return result;
38366 +                       item_key_by_coord(coord, &ikey);
38367 +                       assert("edward-711", all_but_offset_key_eq(key, &ikey));
38368 +
38369 +                       if (get_key_offset(key) == get_key_offset(&ikey)) {
38370 +                               zrelse(coord->node);
38371 +                               return CBK_COORD_FOUND;
38372 +                       }
38373 +                       zrelse(coord->node);
38374 +                       /* status is not changed, perhaps this is a hole */
38375 +                       return CBK_COORD_NOTFOUND;
38376 +               }
38377 +       }
38378 +       else {
38379 +               /* extended coord is invalid */
38380 +               result = crc_hint_validate(hint, key, check_key, lock_mode);
38381 +               if (result)
38382 +                       goto traverse_tree; /* hint is unusable, do a full lookup */
38383 +
38384 +               assert("edward-712", znode_is_any_locked(coord->node));
38385 +
38386 +               /* hint is valid, extended coord is invalid */
38387 +               if (check_key) {
38388 +                       coord->between = AT_UNIT;
38389 +                       return CBK_COORD_FOUND;
38390 +               }
38391 +               return CBK_COORD_NOTFOUND;
38392 +       }
38393 +       assert("edward-713", hint->coord.lh->owner == NULL); /* not reachable by falling through: both branches above return or jump to traverse_tree */
38394 + traverse_tree:
38395 +
38396 +       assert("edward-714", schedulable());
38397 +
38398 +       coord_init_zero(coord); /* start the lookup from a clean coord */
38399 +       hint->coord.valid = 0;
38400 +       return  coord_by_key(current_tree, key, coord, hint->coord.lh, lock_mode,
38401 +                            bias, LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, ra_info);
38402 +}
38403 +
38404 +/* This represents reiser4 crypto alignment policy.
38405 +   Returns the size > 0 of aligning overhead, if we should align/cut,
38406 +   returns 0, if we shouldn't (alignment assumes appending an overhead of the size > 0) */
38407 +static int
38408 +crypto_overhead(size_t len /* advised length; 0 means use @clust->len */,
38409 +               reiser4_cluster_t * clust,
38410 +               struct inode * inode, rw_op rw /* WRITE_OP: compute the overhead to append; READ_OP: read back the overhead to cut */)
38411 +{
38412 +       size_t size = 0;
38413 +       int result = 0;
38414 +       int oh;
38415 +
38416 +       assert("edward-486", clust != 0);
38417 +
38418 +       if (!inode_get_crypto(inode) || !inode_crypto_plugin(inode)->align_cluster)
38419 +               return 0; /* no crypto info or no alignment method: nothing to align or cut */
38420 +       /* honour the advised length when given (it used to be ignored, leaving size == 0), otherwise use the cluster length */
38421 +       size = (len ? len : clust->len);
38422 +
38423 +       assert("edward-615", size != 0);
38424 +       assert("edward-489", crypto_blocksize(inode) != 0);
38425 +
38426 +       switch (rw) {
38427 +       case WRITE_OP: /* align */
38428 +               assert("edward-488", size <= inode_cluster_size(inode));
38429 +
38430 +               oh = size % crypto_blocksize(inode);
38431 +
38432 +               if (!oh && size == fsize_to_count(clust, inode))
38433 +                       /* cluster don't need alignment and didn't get compressed */
38434 +                       return 0;
38435 +               result = (crypto_blocksize(inode) - oh); /* a whole extra block when size is already block-aligned */
38436 +               break;
38437 +       case READ_OP: /* cut */
38438 +               assert("edward-490", size <= inode_scaled_cluster_size(inode));
38439 +               if (size >= inode_scaled_offset(inode, fsize_to_count(clust, inode)))
38440 +                       /* cluster didn't get aligned */
38441 +                       return 0;
38442 +               assert("edward-491", clust->buf != NULL);
38443 +
38444 +               result = *(clust->buf + size - 1); /* the overhead is kept in the last byte of the data */
38445 +               break;
38446 +       default:
38447 +               impossible("edward-493", "bad option for getting alignment");
38448 +       }
38449 +       return result;
38450 +}
38451 +
38452 +/* swap the pairs (@clust->buf, @clust->bsize) and (@buf, @bufsize): afterwards the cluster handle holds the caller's buffer and the caller holds the cluster's old one */
38453 +static void
38454 +alternate_buffers(reiser4_cluster_t * clust, __u8 ** buf, size_t * bufsize)
38455 +{
38456 +       __u8 * tmp_buf;
38457 +       size_t tmp_size;
38458 +
38459 +       assert("edward-405", bufsize != NULL);
38460 +       assert("edward-406", *bufsize != 0);
38461 +
38462 +       tmp_buf = *buf;
38463 +       tmp_size = *bufsize;
38464 +
38465 +       *buf = clust->buf;
38466 +       *bufsize = clust->bsize;
38467 +
38468 +       clust->buf = tmp_buf;
38469 +       clust->bsize = tmp_size;
38470 +}
38471 +
38472 +/* maximal aligning overhead which can be appended
38473 +   to the flow before encryption if any: one crypto block, or 0 when the inode has no crypto info or its crypto plugin has no align_cluster method */
38474 +reiser4_internal unsigned
38475 +max_crypto_overhead(struct inode * inode)
38476 +{
38477 +       if (!inode_get_crypto(inode) || !inode_crypto_plugin(inode)->align_cluster)
38478 +               return 0;
38479 +       return crypto_blocksize(inode);
38480 +}
38481 +
38482 +reiser4_internal unsigned
38483 +compress_overhead(struct inode * inode, int in_len) /* whatever the inode's compression plugin reports via ->overrun() for an input of @in_len bytes */
38484 +{
38485 +       return inode_compression_plugin(inode)->overrun(in_len);
38486 +}
38487 +
38488 +/* The following two functions represent reiser4 compression policy; this one: compress only if a compression plugin other than "none" is set and the cluster holds at least MIN_SIZE_FOR_COMPRESSION bytes */
38489 +static int
38490 +try_compress(reiser4_cluster_t * clust, struct inode * inode)
38491 +{
38492 +       return (inode_compression_plugin(inode) != compression_plugin_by_id(NONE_COMPRESSION_ID)) &&
38493 +               (clust->count >= MIN_SIZE_FOR_COMPRESSION);
38494 +}
38495 +
38496 +static int
38497 +try_encrypt(reiser4_cluster_t * clust, struct inode * inode) /* encrypt whenever the inode has crypto info; @clust is not consulted */
38498 +{
38499 +       return inode_get_crypto(inode) != NULL;
38500 +}
38501 +
38502 +/* If this is true, we don't use copy on clustering, and page cluster will be
38503 +   flushed a bit later, during deflate_cluster(). This should be highly useful
38504 +   when PAGE_CACHE_SIZE is much more than 4K */
38505 +static int
38506 +delay_flush_pgcluster(reiser4_cluster_t * clust, struct inode * inode)
38507 +{
38508 +       return (clust->count <= PAGE_CACHE_SIZE) && (try_compress(clust, inode) || try_encrypt(clust, inode)); /* data fits in one page and will be compressed or encrypted */
38509 +}
38510 +
38511 +/* Decide by the lengths of compressed and decompressed cluster, should we save or should
38512 +   we discard the result of compression. The policy is that the length of compressed then
38513 +   encrypted cluster including _all_ appended infrastructure should be _less_ than its length
38514 +   before compression. */
38515 +static int
38516 +save_compressed(reiser4_cluster_t * clust, struct inode * inode)
38517 +{
38518 +       /* NOTE: Actually we use max_crypto_overhead instead of precise overhead
38519 +          (a bit stronger condition) to avoid divisions */
38520 +       return (clust->len + CLUSTER_MAGIC_SIZE + max_crypto_overhead(inode) < clust->count); /* @clust->len is the compressed length, @clust->count the length before compression */
38521 +}
38522 +
38523 +/* guess if the cluster was compressed: a compression plugin other than "none" is set and the cluster is shorter than its expected (scaled, if encrypted) uncompressed size */
38524 +static int
38525 +need_decompression(reiser4_cluster_t * clust, struct inode * inode,
38526 +                  int encrypted /* is cluster encrypted */)
38527 +{
38528 +       assert("edward-142", clust != 0);
38529 +       assert("edward-143", inode != NULL);
38530 +
38531 +       return (inode_compression_plugin(inode) != compression_plugin_by_id(NONE_COMPRESSION_ID)) &&
38532 +               (clust->len < (encrypted ? inode_scaled_offset(inode, fsize_to_count(clust, inode)) : fsize_to_count(clust, inode)));
38533 +}
38534 +
38535 +reiser4_internal void set_compression_magic(__u8 * magic) /* write the compression magic at @magic; @magic must have room for CLUSTER_MAGIC_SIZE bytes */
38536 +{
38537 +       /* FIXME-EDWARD: If crypto_plugin != NULL, this should be private!
38538 +          Use 4 bytes of decrypted keyid. PARANOID? */
38539 +       assert("edward-279", magic != NULL);
38540 +       xmemset(magic, 0, CLUSTER_MAGIC_SIZE); /* the magic is currently all zero bytes */
38541 +}
38542 +
38543 +/*
38544 +  Common cluster deflate manager.
38545 +
38546 +  . accept a flow as a single page or cluster of pages assembled into a buffer
38547 +    of cluster handle @clust
38548 +  . maybe allocate buffer @bf to store temporary results
38549 +  . maybe compress accepted flow and attach compression magic if result of
38550 +    compression is acceptable
38551 +  . maybe align and encrypt the flow.
38552 +  . stores the result in the buffer of cluster handle
38553 +                                     _ _ _ _ _ _ _ _
38554 +                                   |               |
38555 +                                    |  disk cluster |
38556 +                                    |_ _ _ _ _ _ _ _|
38557 +                                            ^
38558 +                                            |
38559 +          _______________            _______|_______       _ _ _ _ _ _ _ _
38560 +         |               | <---1--- |               |     |               |
38561 +         |     @bf       | ----2--> |     @clust    |<----|  page cluster |
38562 +         |_______________| ----3--> |_______________|     |_ _ _ _ _ _ _ _|
38563 +                 ^                          ^
38564 +              4 |      ______________      | 5
38565 +                 |     |              |     |
38566 +                 +---- |     page     | ----+
38567 +                      |______________|
38568 +
38569 +
38570 +  " --n-> " means one of the following operations on a pair of pointers (src, dst)
38571 +
38572 +  1 - compression or encryption
38573 +  2 - encryption
38574 +  3 - alternation
38575 +  4 - compression
38576 +  5 - compression or encryption or copy
38577 +
38578 +  where
38579 +  . compression is plugin->compress(),
38580 +  . encryption is plugin->encrypt(),
38581 +  . alternation is alternate_buffers() (if the final result is contained in temporary buffer @bf,
38582 +    we should move it to the cluster handle @clust)
38583 +  . copy is memcpy()
38584 +
38585 +
38586 +  FIXME-EDWARD: Currently only symmetric crypto algorithms in ECB mode are
38587 +  supported
38588 +*/
38589 +
38590 +reiser4_internal int
38591 +deflate_cluster(tfm_info_t ctx, /* data for compression plugin, which can be allocated per flush position */
38592 +               reiser4_cluster_t *clust, /* contains data to process */
38593 +               struct inode *inode) /* returns 0 on success or -ENOMEM if a temporary buffer cannot be allocated; the result length is left in @clust->len */
38594 +{
38595 +       int result = 0;
38596 +       __u8 * bf = NULL;
38597 +       __u8 * src = NULL;
38598 +       __u8 * dst = NULL;
38599 +       size_t bfsize = clust->count;
38600 +       struct page * pg = NULL;
38601 +
38602 +       assert("edward-401", inode != NULL);
38603 +       assert("edward-495", clust != NULL);
38604 +       assert("edward-496", clust->count != 0);
38605 +       assert("edward-497", clust->len == 0);
38606 +       assert("edward-498", clust->buf && clust->bsize);
38607 +
38608 +       if (try_compress(clust, inode)) {
38609 +               /* try to compress, discard bad results */
38610 +               __u32 dst_len;
38611 +               compression_plugin * cplug = inode_compression_plugin(inode);
38612 +
38613 +               assert("edward-602", cplug != NULL);
38614 +
38615 +               if (try_encrypt(clust, inode) || clust->nr_pages != 1) {
38616 +                       /* [12], [42], [13], tmp buffer is required */
38617 +                       bfsize += compress_overhead(inode, clust->count);
38618 +                       bf = reiser4_kmalloc(bfsize, GFP_KERNEL);
38619 +                       if (!bf)
38620 +                               return -ENOMEM; /* no page has been locked or mapped yet, so a plain return is enough */
38621 +                       dst = bf;
38622 +               }
38623 +               else
38624 +                       /* [5] */
38625 +                       dst = clust->buf;
38626 +               if (clust->nr_pages == 1) {
38627 +                       /* [42], [5] */
38628 +                       assert("edward-619", clust->pages != NULL);
38629 +                       assert("edward-620", PageDirty(*clust->pages));
38630 +
38631 +                       pg = *clust->pages;
38632 +                       lock_page(pg);
38633 +                       assert("edward-621", PageDirty(pg));
38634 +                       src = kmap(pg);
38635 +               }
38636 +               else
38637 +                       /* [12], [13] */
38638 +                       src = clust->buf;
38639 +
38640 +               dst_len = bfsize;
38641 +
38642 +               cplug->compress(ctx, src, clust->count, dst/* res */, &dst_len);
38643 +               assert("edward-763", !in_interrupt());
38644 +
38645 +               clust->len = dst_len;
38646 +
38647 +               assert("edward-603", clust->len <= (bf ? bfsize : clust->bsize));
38648 +
38649 +               /* estimate compression quality to accept or discard
38650 +                  the results of our efforts */
38651 +               if (save_compressed(clust, inode)) {
38652 +                       /* Accepted */
38653 +                       set_compression_magic(dst + clust->len);
38654 +                       clust->len += CLUSTER_MAGIC_SIZE;
38655 +               }
38656 +               else
38657 +                       /* discard: from here on clust->len == clust->count means "compression refused" */
38658 +                       clust->len = clust->count;
38659 +       }
38660 +
38661 +       if (try_encrypt(clust, inode)) {
38662 +               /* align and encrypt */
38663 +               int oh; /* ohhh, the crypto alignment overhead */
38664 +               int i, icb, ocb;
38665 +               __u32 * expkey;
38666 +               crypto_plugin * cplug = inode_crypto_plugin(inode);
38667 +
38668 +               assert("edward-716", inode_get_crypto(inode) != NULL);
38669 +
38670 +               icb = crypto_blocksize(inode);
38671 +               ocb = inode_scaled_offset(inode, icb);
38672 +
38673 +               assert("edward-605", icb != 0);
38674 +
38675 +               /* precise crypto-overhead */
38676 +               oh = crypto_overhead(0, clust, inode, WRITE_OP);
38677 +
38678 +               if (dst) {
38679 +                       /* compression is specified */
38680 +                       assert("edward-622", src != NULL);
38681 +                       assert("edward-623", bf != NULL && clust->len != 0);
38682 +                       assert("edward-624", clust->len <= clust->count);
38683 +
38684 +                       if (clust->len != clust->count)
38685 +                               /* saved */
38686 +                               src = dst;
38687 +                       else
38688 +                               /* refused */
38689 +                               ;
38690 +                       if (pg) {
38691 +                                /* release flushed page */
38692 +                               assert("edward-625", PageLocked(pg));
38693 +
38694 +                               kunmap(pg);
38695 +                               uncapture_page(pg);
38696 +                               unlock_page(pg);
38697 +                               page_cache_release(pg);
38698 +                               reiser4_kfree(clust->pages);
38699 +                               pg = NULL; /* so the exit path does not release the page a second time */
38700 +                       }
38701 +               }
38702 +               else {
38703 +                       /* [13], [5], compression wasn't specified */
38704 +
38705 +                       assert("edward-626", !clust->len);
38706 +
38707 +                       if (clust->nr_pages != 1) {
38708 +                               /* [13], tmp buffer required */
38709 +                               assert("edward-627", !bf);
38710 +
38711 +                               bfsize += oh;
38712 +                               bf = reiser4_kmalloc(bfsize, GFP_KERNEL);
38713 +                               if (!bf) {
38714 +                                       result = -ENOMEM;
38715 +                                       goto exit;
38716 +                               }
38717 +                               alternate_buffers(clust, &bf, &bfsize); /* the new buffer becomes clust->buf, the old cluster data becomes the source */
38718 +                               src = bf;
38719 +                       }
38720 +                       else {
38721 +                               /* [5] */
38722 +                               pg = *clust->pages;
38723 +                               lock_page(pg);
38724 +                               assert("edward-628", PageDirty(pg));
38725 +                               src = kmap(pg);
38726 +                       }
38727 +                       clust->len = clust->count;
38728 +               }
38729 +
38730 +               dst = clust->buf;
38731 +
38732 +               if (oh) {
38733 +                       /* align the source */
38734 +                       clust->len += cplug->align_cluster(src + clust->len, clust->len, icb);
38735 +
38736 +                       assert("edward-402", clust->len <= (pg ? PAGE_CACHE_SIZE : bfsize));
38737 +
38738 +                       *(src + clust->len - 1) = oh; /* the overhead is stored in the last byte; crypto_overhead(READ_OP) reads it back from there */
38739 +               }
38740 +#if REISER4_DEBUG
38741 +               if (clust->len % icb)
38742 +                       impossible("edward-403", "bad alignment");
38743 +#endif
38744 +
38745 +               expkey = cryptcompress_inode_data(inode)->expkey;
38746 +
38747 +               assert("edward-404", expkey != NULL);
38748 +
38749 +               for (i=0; i < clust->len/icb; i++) /* one crypto block at a time: icb bytes in, ocb bytes out */
38750 +                       cplug->encrypt(expkey, clust->buf + i*ocb /* dst */, src + i*icb);
38751 +       }
38752 +
38753 +       else if (dst && clust->len != clust->count) {
38754 +               /* [13], [5], saved compression, no encryption */
38755 +               if (bf) {
38756 +                       /* [13] */
38757 +                       assert("edward-635", bf == dst);
38758 +                       assert("edward-636", !clust->pages);
38759 +                       alternate_buffers(clust, &bf, &bfsize); /* the compressed result becomes clust->buf; the old buffer is freed at exit */
38760 +               }
38761 +       }
38762 +       else {
38763 +               /* not specified or discarded compression, no encryption,
38764 +                  [13], [5], [] */
38765 +
38766 +               if (delay_flush_pgcluster(clust, inode)) {
38767 +                       if (!pg) {
38768 +                               assert("edward-629", !src);
38769 +                               assert("edward-631", !clust->len);
38770 +                               /* -not specified, [5] */
38771 +                               pg = *clust->pages;
38772 +                               lock_page(pg);
38773 +                               src = kmap(pg);
38774 +                               clust->len = clust->count;
38775 +                       }
38776 +                       else {
38777 +                               /* -discarded, [13] */
38778 +                               assert("edward-630", src != NULL);
38779 +                               assert("edward-632", clust->len == clust->count);
38780 +                       }
38781 +                       xmemcpy(clust->buf, src, clust->count);
38782 +               }
38783 +               if (!clust->len)
38784 +                       /* not specified, [] */
38785 +                       clust->len = clust->count;
38786 +       }
38787 + exit: /* common cleanup: free the temporary buffer and release the page if one is still held */
38788 +       if (bf)
38789 +               reiser4_kfree(bf);
38790 +       if (pg) {
38791 +               assert("edward-621", PageLocked(pg)); /* NOTE(review): the label "edward-621" is also used by an assertion earlier in this function */
38792 +
38793 +               kunmap(pg);
38794 +               uncapture_page(pg);
38795 +               unlock_page(pg);
38796 +               page_cache_release(pg);
38797 +               reiser4_kfree(clust->pages);
38798 +       }
38799 +       return result;
38800 +}
38801 +
38802 +/* Common inflate cluster manager. Is used in readpage() or readpages() methods of
38803 +   cryptcompress object plugins.
38804 +   . maybe allocate temporary buffer (@bf)
38805 +   . maybe decrypt disk cluster (assembled in united flow of cluster handle) and
38806 +     cut crypto-alignment overhead (if any)
38807 +   . maybe check for compression magic and decompress
38808 +
38809 +   The final result is stored in the buffer of the cluster handle (@clust)
38810 +   (which contained assembled disk cluster at the beginning of this procedure)
38811 +   and is supposed to be sliced into page cluster by appropriate fillers, but if
38812 +   cluster size is equal to PAGE_SIZE we fill the single page (@pg) right here:
38813 +
38814 +                                      _ _ _ _ _ _ _ _
38815 +                                     |               |
38816 +                                     |  disk cluster |
38817 +                                     |_ _ _ _ _ _ _ _|
38818 +                                             |
38819 +                                             |
38820 +          ________________            _______V_______       _ _ _ _ _ _ _ _
38821 +         |                | <---1--- |               |     |               |
38822 +         |      @bf       | ----2--> |     @clust    |---->|  page cluster |
38823 +         |________________| ----3--> |_______________|     |_ _ _ _ _ _ _ _|
38824 +                 |                           |
38825 +              4 |      _______________      | 5
38826 +                 |     |               |     |
38827 +                 +---> |      @pg      | <---+
38828 +                      |_______________|
38829 +
38830 +
38831 +  " --n-> " means one of the following functions on a pair of pointers (src, dst):
38832 +
38833 +  1, 5 - decryption or decompression
38834 +  2, 4 - decompression
38835 +  3    - alternation
38836 +
38837 +  Where:
38838 +
38839 +  decryption is plugin->decrypt(),
38840 +  decompression is plugin->decompress,
38841 +  alternation is alternate_buffers()
38842 +*/
38843 +reiser4_internal int
38844 +inflate_cluster(reiser4_cluster_t *clust, /* cluster handle, contains assembled
38845 +                                            disk cluster to process */
38846 +               struct inode *inode) /* returns 0, -ENOMEM or -EIO (bad compression magic) */
38847 +{
38848 +       int result = 0;
38849 +       __u8 * dst = NULL;
38850 +       __u8 * bf = NULL;  /* buffer to handle temporary results */
38851 +       size_t bfsize = 0; /* size of the buffer above */
38852 +       struct page * pg = NULL; /* pointer to a single page if cluster size
38853 +                                   is equal to page size */
38854 +       if (clust->stat == FAKE_CLUSTER)
38855 +               /* nothing to inflate */
38856 +               return 0;
38857 +
38858 +       assert("edward-407", clust->buf != NULL);
38859 +       assert("edward-408", clust->len != 0);
38860 +
38861 +       if (inode_get_crypto(inode) != NULL) {
38862 +               /* decrypt */
38863 +               int i;
38864 +               int oh = 0;
38865 +               int icb, ocb;
38866 +               __u32 * expkey;
38867 +               crypto_plugin * cplug = inode_crypto_plugin(inode);
38868 +
38869 +               assert("edward-617", cplug != 0);
38870 +
38871 +               if (clust->nr_pages == 1)
38872 +                       pg = *clust->pages;
38873 +               oh = crypto_overhead(0, clust, inode, READ_OP);
38874 +
38875 +               /* input/output crypto blocksizes */
38876 +               icb = crypto_blocksize(inode);
38877 +               ocb = inode_scaled_offset(inode, icb);
38878 +               /* the decrypt loop below walks clust->len/ocb whole blocks: no remainder allowed */
38879 +               assert("edward-608", clust->len % ocb == 0);
38880 +
38881 +               if (pg && !need_decompression(clust, inode,
38882 +                                             1 /* estimate for encrypted cluster */)) {
38883 +                       /* [5] */
38884 +                       assert("edward-609", clust->nr_pages == 1);
38885 +                       assert("edward-610", inode_cluster_size(inode) == PAGE_CACHE_SIZE);
38886 +                       lock_page(pg);
38887 +                       if (PageUptodate(pg)) {
38888 +                               /* races with other read/write: page is already filled, drop our lock */
38889 +                               unlock_page(pg);
38890 +                               goto exit;
38891 +                       }
38892 +                       dst = kmap(pg);
38893 +               }
38894 +               else { /* [12] or [13], tmp buffer is needed, estimate its size */
38895 +                       bfsize = fsize_to_count(clust, inode);
38896 +                       bfsize += crypto_overhead(bfsize, clust, inode, WRITE_OP);
38897 +                       bf = reiser4_kmalloc(bfsize, GFP_KERNEL);
38898 +                       if (!bf) {
38899 +                               result = -ENOMEM;
38900 +                               goto exit;
38901 +                       }
38902 +                       dst = bf;
38903 +               }
38904 +
38905 +               /* decrypt cluster with the simplest mode
38906 +                * FIXME-EDWARD: call here stream mode plugin */
38907 +
38908 +               expkey = cryptcompress_inode_data(inode)->expkey;
38909 +
38910 +               assert("edward-141", expkey != NULL);
38911 +
38912 +               for (i=0; i < clust->len/ocb; i++)
38913 +                       cplug->decrypt(expkey, dst + i*icb /* dst */, clust->buf + i*ocb /* src */);
38914 +
38915 +                /* cut the alignment overhead */
38916 +               clust->len -= crypto_overhead(0, clust, inode, READ_OP);
38917 +       }
38918 +       if (need_decompression(clust, inode, 0 /* estimate for decrypted cluster */)) {
38919 +               unsigned dst_len = inode_cluster_size(inode);
38920 +               compression_plugin * cplug = inode_compression_plugin(inode);
38921 +               tfm_info_t ctx = NULL;
38922 +               __u8 * src = bf;
38923 +               __u8 magic[CLUSTER_MAGIC_SIZE];
38924 +
38925 +               if (clust->nr_pages == 1)
38926 +                       pg = *clust->pages;
38927 +
38928 +               if (pg) { /* [5] or [14] */
38929 +                       lock_page(pg);
38930 +                       if (PageUptodate(pg)) {
38931 +                               /* races with other read/write: page is already filled, drop our lock */
38932 +                               unlock_page(pg);
38933 +                               goto exit;
38934 +                       }
38935 +                       dst = kmap(pg);
38936 +                       if (!bf)
38937 +                               src = clust->buf;
38938 +               }
38939 +               else {
38940 +                       /* [12] or [13] */
38941 +                       if (!bf) {
38942 +                               /* [13], tmp buffer is needed, estimate its size */
38943 +                               bfsize = fsize_to_count(clust, inode);
38944 +                               bf = reiser4_kmalloc(bfsize, GFP_KERNEL);
38945 +                               if (!bf) {
38946 +                                       result = -ENOMEM;
38947 +                                       goto exit;
38948 +                               }
38949 +                               alternate_buffers(clust, &bf, &bfsize);
38950 +                       }
38951 +                       dst = clust->buf;
38952 +                       src = bf;
38953 +               }
38954 +
38955 +               /* Check compression magic for possible IO errors.
38956 +
38957 +                  End-of-cluster format created before encryption:
38958 +
38959 +                  data
38960 +                  compression_magic  (4)   Indicates presence of compression
38961 +                                           infrastructure, should be private.
38962 +                                           Can be absent.
38963 +                  crypto_overhead          Created by ->align() method of crypto-plugin,
38964 +                                           Can be absent.
38965 +
38966 +                  Crypto overhead format:
38967 +
38968 +                  data
38969 +                  tail_size           (1)   size of aligning tail,
38970 +                                            1 <= tail_size <= blksize
38971 +               */
38972 +               set_compression_magic(magic);
38973 +
38974 +               if (memcmp(src + (clust->len - (size_t)CLUSTER_MAGIC_SIZE),
38975 +                          magic, (size_t)CLUSTER_MAGIC_SIZE)) {
38976 +                       printk("edward-156: wrong compression magic %d (should be %d)\n",
38977 +                              *((int *)(src + (clust->len - (size_t)CLUSTER_MAGIC_SIZE))), *((int *)magic));
38978 +                       result = -EIO;
38979 +                       if (pg) { /* page was locked and mapped above: undo both, leave it !uptodate */
38980 +                               kunmap(pg);
38981 +                               unlock_page(pg);
38982 +                       }
38983 +                       goto exit;
38984 +               }
38985 +               clust->len -= (size_t)CLUSTER_MAGIC_SIZE;
38986 +               /* decompress cluster */
38987 +               cplug->decompress(ctx, src, clust->len, dst, &dst_len);
38988 +
38989 +               /* check length */
38990 +               assert("edward-157", dst_len == fsize_to_count(clust, inode));
38991 +
38992 +               clust->len = dst_len;
38993 +       }
38994 +       if (clust->nr_pages == 1) { /* success only: finish filling the single page */
38995 +               assert("edward-618", clust->len <= PAGE_CACHE_SIZE);
38996 +               if (!pg) {
38997 +                       /* no encryption, no compression */
38998 +                       pg = *clust->pages;
38999 +                       lock_page(pg);
39000 +                       if (PageUptodate(pg)) {
39001 +                               /* races with other read/write */
39002 +                               unlock_page(pg);
39003 +                               goto exit;
39004 +                       }
39005 +                       dst = kmap(pg);
39006 +                       xmemcpy(dst, clust->buf, clust->len);
39007 +               }
39008 +
39009 +               assert("edward-611", PageLocked(pg));
39010 +               assert("edward-637", !PageUptodate(pg));
39011 +               assert("edward-638", dst != NULL);
39012 +
39013 +               xmemset(dst + clust->len, 0, (size_t)PAGE_CACHE_SIZE - clust->len);
39014 +               kunmap(pg);
39015 +               SetPageUptodate(pg);
39016 +               unlock_page(pg);
39017 +       }
39018 + exit: /* error and lost-race paths land here, past the page fill-in above */
39019 +       if (bf)
39020 +               reiser4_kfree(bf);
39021 +       return result;
39022 +}
39023 +
39024 +/* plugin->read() :
39025 + * generic_file_read()
39026 + * All key offsets don't make sense in traditional unix semantics unless they
39027 + * represent the beginning of clusters, so the only thing we can do is start
39028 + * right from mapping to the address space (this is precisely what filemap
39029 + * generic method does) */
39030 +
39031 +/* plugin->readpage(): read @page via the CTAIL item plugin; @vp is a struct file or NULL */
39032 +reiser4_internal int
39033 +readpage_cryptcompress(void *vp, struct page *page)
39034 +{
39035 +       reiser4_cluster_t clust;
39036 +       struct file * file;
39037 +       item_plugin * iplug;
39038 +       int result;
39039 +
39040 +       assert("edward-88", PageLocked(page));
39041 +       assert("edward-89", page->mapping && page->mapping->host);
39042 +
39043 +       file = vp;
39044 +       if (file)
39045 +               assert("edward-113", page->mapping == file->f_dentry->d_inode->i_mapping);
39046 +
39047 +       if (PageUptodate(page)) {
39048 +               printk("readpage_cryptcompress: page became already uptodate\n");
39049 +               unlock_page(page);
39050 +               return 0;
39051 +       }
39052 +       reiser4_cluster_init(&clust);
39053 +       iplug = item_plugin_by_id(CTAIL_ID);
39054 +       if (!iplug->s.file.readpage) {
39055 +               unlock_page(page); /* the page came in locked; do not return it locked on error */
39056 +               return -EINVAL;
39057 +       }
39058 +       result = iplug->s.file.readpage(&clust, page);
39059 +
39060 +       assert("edward-64", ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
39061 +       /* if page has jnode - that jnode is mapped
39062 +          assert("edward-65", ergo(result == 0 && PagePrivate(page),
39063 +          jnode_mapped(jprivate(page))));
39064 +       */
39065 +       return result;
39066 +}
39067 +
39068 +/* plugin->readpages(): multi-page read entry point of the cryptcompress file plugin */
39069 +reiser4_internal void
39070 +readpages_cryptcompress(struct file *file, struct address_space *mapping,
39071 +                       struct list_head *pages)
39072 +{
39073 +       item_plugin *ctail = item_plugin_by_id(CTAIL_ID);
39074 +
39075 +       /* nothing is done here: the request is forwarded unchanged to the
39076 +          readpages method of the CTAIL item plugin */
39077 +       ctail->s.file.readpages(file, mapping, pages);
39078 +}
39079 +/* mark every page of the cluster dirty, up to date and accessed, then drop one page reference */
39080 +static void
39081 +set_cluster_pages_dirty(reiser4_cluster_t * clust)
39082 +{
39083 +       int i;
39084 +       struct page * pg;
39085 +
39086 +       for (i=0; i < clust->nr_pages; i++) {
39087 +
39088 +               pg = clust->pages[i];
39089 +
39090 +               lock_page(pg);
39091 +               /* all three state changes are made under the page lock */
39092 +               set_page_dirty_internal(pg, 0);
39093 +               SetPageUptodate(pg);
39094 +               mark_page_accessed(pg);
39095 +
39096 +               unlock_page(pg);
39097 +               /* NOTE(review): presumably pairs with the reference taken when the pages were grabbed */
39098 +               page_cache_release(pg);
39099 +       }
39100 +}
39101 +
39102 +/* Capture the jnode of every cluster page (found via jprivate()) for write: blocks of one
39103 +   cluster contain dependent modifications and should commit at the same time. Returns 0, or
39104 +   the first try_capture() error after uncapturing the nodes captured before the failure */
39105 +static int
39106 +try_capture_cluster(reiser4_cluster_t * clust)
39107 +{
39108 +       int i;
39109 +       int result = 0;
39110 +
39111 +       for (i=0; i < clust->nr_pages; i++) {
39112 +               jnode * node;
39113 +               struct page *pg;
39114 +
39115 +               pg = clust->pages[i];
39116 +               node = jprivate(pg);
39117 +
39118 +               assert("edward-220", node != NULL);
39119 +
39120 +               LOCK_JNODE(node);
39121 +               /* try_capture() is called with the jnode lock held */
39122 +               result = try_capture(node, ZNODE_WRITE_LOCK, 0/* not non-blocking */, 0 /* no can_coc */);
39123 +               if (result) {
39124 +                       UNLOCK_JNODE(node);
39125 +                       jput(node); /* NOTE(review): only the failing node is put here - confirm who puts the rest */
39126 +                       break;
39127 +               }
39128 +               UNLOCK_JNODE(node);
39129 +       }
39130 +       if(result)
39131 +               /* roll back: uncapture nodes [0, i) that were captured before the failure */
39132 +               while(i) {
39133 +                       i--;
39134 +                       uncapture_jnode(jprivate(clust->pages[i]));
39135 +               }
39136 +       return result;
39137 +}
39138 +/* make the jnode of every cluster page dirty (under the jnode lock), then put that jnode */
39139 +static void
39140 +make_cluster_jnodes_dirty(reiser4_cluster_t * clust)
39141 +{
39142 +       int i;
39143 +       jnode * node;
39144 +
39145 +       for (i=0; i < clust->nr_pages; i++) {
39146 +               node = jprivate(clust->pages[i]);
39147 +
39148 +               assert("edward-221", node != NULL);
39149 +
39150 +               LOCK_JNODE(node);
39151 +               jnode_make_dirty_locked(node);
39152 +               UNLOCK_JNODE(node);
39153 +               /* NOTE(review): presumably drops the reference obtained in grab_cache_cluster() - confirm */
39154 +               jput(node);
39155 +       }
39156 +}
39157 +
39158 +/* collect unlocked cluster pages and jnodes */
39159 +static int
39160 +grab_cache_cluster(struct inode * inode, reiser4_cluster_t * clust)
39161 +{
39162 +       int i;
39163 +       int result = 0;
39164 +       jnode * node;
39165 +
39166 +       assert("edward-182", clust != NULL);
39167 +       assert("edward-183", clust->pages != NULL);
39168 +       assert("edward-437", clust->nr_pages != 0);
39169 +       assert("edward-184", 0 < clust->nr_pages <= inode_cluster_pages(inode));
39170 +
39171 +       for (i = 0; i < clust->nr_pages; i++) {
39172 +               clust->pages[i] = grab_cache_page(inode->i_mapping, clust_to_pg(clust->index, inode) + i);
39173 +               if (!(clust->pages[i])) {
39174 +                       result = RETERR(-ENOMEM);
39175 +                       break;
39176 +               }
39177 +               node = jnode_of_page(clust->pages[i]);
39178 +               unlock_page(clust->pages[i]);
39179 +               if (IS_ERR(node)) {
39180 +                       page_cache_release(clust->pages[i]);
39181 +                       result = PTR_ERR(node);
39182 +                       break;
39183 +               }
39184 +               LOCK_JNODE(node);
39185 +               JF_SET(node, JNODE_CLUSTER_PAGE);
39186 +               UNLOCK_JNODE(node);
39187 +       }
39188 +       if (result) {
39189 +               while(i) {
39190 +                       i--;
39191 +                       page_cache_release(clust->pages[i]);
39192 +                       assert("edward-222", jprivate(clust->pages[i]) != NULL);
39193 +                       jput(jprivate(clust->pages[i]));
39194 +               }
39195 +       }
39196 +       return result;
39197 +}
39198 +
39199 +/* collect unlocked cluster pages (no jnodes); returns 0, or -ENOMEM with all grabbed pages released */
39200 +reiser4_internal int
39201 +grab_cluster_pages(struct inode * inode, reiser4_cluster_t * clust)
39202 +{
39203 +       int i;
39204 +       int result = 0;
39205 +
39206 +       assert("edward-787", clust != NULL);
39207 +       assert("edward-788", clust->pages != NULL);
39208 +       assert("edward-789", clust->nr_pages != 0);
39209 +       assert("edward-790", 0 < clust->nr_pages && clust->nr_pages <= inode_cluster_pages(inode));
39210 +
39211 +       for (i = 0; i < clust->nr_pages; i++) {
39212 +               clust->pages[i] = grab_cache_page(inode->i_mapping, clust_to_pg(clust->index, inode) + i);
39213 +               if (!(clust->pages[i])) {
39214 +                       result = RETERR(-ENOMEM);
39215 +                       break;
39216 +               }
39217 +               unlock_page(clust->pages[i]); /* keep the reference, drop only the lock */
39218 +       }
39219 +       if (result) {
39220 +               while(i) { /* roll back pages [0, i) */
39221 +                       i--;
39222 +                       page_cache_release(clust->pages[i]);
39223 +               }
39224 +       }
39225 +       return result;
39226 +}
39227 +/* set JNODE_NEW on the jnode of the first cluster page; @inode is not used */
39228 +UNUSED_ARG static void
39229 +set_cluster_unlinked(reiser4_cluster_t * clust, struct inode * inode)
39230 +{
39231 +       jnode * node;
39232 +
39233 +       node = jprivate(clust->pages[0]);
39234 +
39235 +       assert("edward-640", node);
39236 +       /* the flag is changed under the jnode lock */
39237 +       LOCK_JNODE(node);
39238 +       JF_SET(node, JNODE_NEW);
39239 +       UNLOCK_JNODE(node);
39240 +}
39241 +/* put the jnode of every cluster page; the pages themselves are not released */
39242 +static void
39243 +put_cluster_jnodes(reiser4_cluster_t * clust)
39244 +{
39245 +       int i;
39246 +
39247 +       assert("edward-223", clust != NULL);
39248 +
39249 +       for (i=0; i < clust->nr_pages; i++) {
39250 +               /* every page slot must be filled and must have a jnode attached */
39251 +               assert("edward-208", clust->pages[i] != NULL);
39252 +               assert("edward-224", jprivate(clust->pages[i]) != NULL);
39253 +
39254 +               jput(jprivate(clust->pages[i]));
39255 +       }
39256 +}
39257 +
39258 +/* put cluster pages with indexes in [@from, clust->nr_pages); jnodes are not touched */
39259 +reiser4_internal void
39260 +release_cluster_pages(reiser4_cluster_t * clust, int from)
39261 +{
39262 +       int i;
39263 +
39264 +       assert("edward-447", clust != NULL);
39265 +       assert("edward-448", from < clust->nr_pages);
39266 +
39267 +       for (i = from; i < clust->nr_pages; i++) {
39268 +               /* every slot in the released range must be filled */
39269 +               assert("edward-449", clust->pages[i] != NULL);
39270 +
39271 +               page_cache_release(clust->pages[i]);
39272 +       }
39273 +}
39274 +/* for every cluster page: release the page and put its jnode */
39275 +static void
39276 +release_cluster(reiser4_cluster_t * clust)
39277 +{
39278 +       int i;
39279 +
39280 +       assert("edward-445", clust != NULL);
39281 +
39282 +       for (i=0; i < clust->nr_pages; i++) {
39283 +               /* NOTE(review): label "edward-447" below is also used in release_cluster_pages() */
39284 +               assert("edward-446", clust->pages[i] != NULL);
39285 +               assert("edward-447", jprivate(clust->pages[i]) != NULL);
39286 +               /* NOTE(review): jprivate() is read after our page reference is dropped - confirm the page stays pinned */
39287 +               page_cache_release(clust->pages[i]);
39288 +               jput(jprivate(clust->pages[i]));
39289 +       }
39290 +}
39291 +
39292 +/* debugging purposes: consistency check of a cluster handle, compiled only under REISER4_DEBUG */
39293 +#if REISER4_DEBUG
39294 +reiser4_internal int
39295 +cluster_invariant(reiser4_cluster_t * clust, struct inode * inode)
39296 +{
39297 +       assert("edward-279", clust != NULL);
39298 +       /* non-zero iff all four conditions hold (ergo(a, b) is presumably "a implies b") */
39299 +       return (clust->pages != NULL && /* page array is allocated */
39300 +               clust->off < inode_cluster_size(inode) && /* window starts inside the cluster */
39301 +               ergo(clust->delta != 0, clust->stat == HOLE_CLUSTER) && /* delta is used by holes only */
39302 +               clust->off + clust->count + clust->delta <= inode_cluster_size(inode)); /* window fits */
39303 +}
39304 +#endif
39305 +
39306 +/* guess next cluster status: HOLE_CLUSTER only after a hole with zero delta, else DATA_CLUSTER */
39307 +static inline reiser4_cluster_status
39308 +next_cluster_stat(reiser4_cluster_t * clust)
39309 +{
39310 +       return (clust->stat == HOLE_CLUSTER && clust->delta == 0 /* no non-zero data */ ? HOLE_CLUSTER : DATA_CLUSTER);
39311 +}
39312 +
39313 +/* guess next cluster params: rewrite stat/off/index/count/delta of @clust for the next step */
39314 +static void
39315 +update_cluster(struct inode * inode, reiser4_cluster_t * clust, loff_t file_off, loff_t to_file)
39316 +{
39317 +       assert ("edward-185", clust != NULL);
39318 +       assert ("edward-438", clust->pages != NULL);
39319 +       assert ("edward-281", cluster_invariant(clust, inode));
39320 +
39321 +       switch (clust->stat) {
39322 +       case DATA_CLUSTER: /* increment: go to the start of the next cluster */
39323 +               clust->stat = DATA_CLUSTER;
39324 +               clust->off = 0;
39325 +               clust->index++;
39326 +               clust->count = min_count(inode_cluster_size(inode), to_file);
39327 +               break;
39328 +       case HOLE_CLUSTER:
39329 +               switch(next_cluster_stat(clust)) {
39330 +               case HOLE_CLUSTER:
39331 +                       /* skip */
39332 +                       clust->stat = HOLE_CLUSTER;
39333 +                       clust->off = 0;
39334 +                       clust->index = off_to_clust(file_off, inode);
39335 +                       clust->count = off_to_cloff(file_off, inode);
39336 +                       clust->delta = min_count(inode_cluster_size(inode) - clust->count, to_file);
39337 +                       break;
39338 +               case DATA_CLUSTER:
39339 +                       /* keep immovability, off+count+delta=inv */
39340 +                       clust->stat = DATA_CLUSTER;
39341 +                       clust->off = clust->off + clust->count;
39342 +                       clust->count = clust->delta;
39343 +                       clust->delta = 0;
39344 +                       break;
39345 +               default:
39346 +                       impossible ("edward-282", "wrong next cluster status");
39347 +               }
39348 +               break; /* without this a handled hole fell through into impossible() below */
39349 +       default:
39350 +               impossible ("edward-283", "wrong current cluster status");
39351 +       }
39352 +}
39353 +
39354 +static int
39355 +__reserve4cluster(struct inode * inode, reiser4_cluster_t * clust)
39356 +{
39357 +       int result = 0;
39358 +       int reserved = 0;
39359 +       jnode * j;
39360 +
39361 +       assert("edward-439", inode != NULL);
39362 +       assert("edward-440", clust != NULL);
39363 +       assert("edward-441", clust->pages != NULL);
39364 +       assert("edward-442", jprivate(clust->pages[0]) != NULL);
39365 +
39366 +       j = jprivate(clust->pages[0]);
39367 +
39368 +       LOCK_JNODE(j);
39369 +       if (JF_ISSET(j, JNODE_CREATED)) {
39370 +               /* jnode mapped <=> space reserved */
39371 +               UNLOCK_JNODE(j);
39372 +               return 0;
39373 +       }
39374 +       reserved = estimate_insert_cluster(inode, 0/* prepped */);
39375 +       result = reiser4_grab_space_force(reserved, 0);
39376 +       if (result)
39377 +               return result;
39378 +       JF_SET(j, JNODE_CREATED);
39379 +
39380 +       grabbed2cluster_reserved(reserved);
39381 +       all_grabbed2free();
39382 +
39383 +#if REISER4_DEBUG
39384 +       {
39385 +               reiser4_context * ctx = get_current_context();
39386 +               assert("edward-777", ctx->grabbed_blocks == 0);
39387 +       }
39388 +#endif
39389 +       UNLOCK_JNODE(j);
39390 +       return 0;
39391 +}
39392 +/* reserve4cluster(): wrapper macro; both branches expand identically and @msg is dropped */
39393 +#if REISER4_TRACE
39394 +#define reserve4cluster(inode, clust, msg)    __reserve4cluster(inode, clust)
39395 +#else
39396 +#define reserve4cluster(inode, clust, msg)    __reserve4cluster(inode, clust)
39397 +#endif
39398 +/* undo __reserve4cluster(): return the cluster reservation to free space and clear JNODE_CREATED */
39399 +static void
39400 +free_reserved4cluster(struct inode * inode, reiser4_cluster_t * clust)
39401 +{
39402 +       jnode * j;
39403 +
39404 +       j = jprivate(clust->pages[0]);
39405 +
39406 +       LOCK_JNODE(j);
39407 +       /* only a cluster page whose reservation is still recorded may be freed */
39408 +       assert("edward-443", jnode_is_cluster_page(j));
39409 +       assert("edward-444", JF_ISSET(j, JNODE_CREATED));
39410 +       /* the same estimate is used as when the space was reserved */
39411 +       cluster_reserved2free(estimate_insert_cluster(inode, 0));
39412 +       JF_CLR(j, JNODE_CREATED);
39413 +       UNLOCK_JNODE(j);
39414 +}
39415 +/* grab space for a stat-data update and, if @do_update, set i_size/times and write the stat data */
39416 +static int
39417 +update_inode_cryptcompress(struct inode *inode,
39418 +                             loff_t new_size,
39419 +                             int update_i_size, int update_times, /* both are not used in the body */
39420 +                             int do_update)
39421 +{
39422 +       int result = 0;
39423 +       int old_grabbed;
39424 +       reiser4_context *ctx = get_current_context();
39425 +       reiser4_super_info_data * sbinfo = get_super_private(ctx->super);
39426 +       /* remember the current grab counter so only the space grabbed here is freed at the end */
39427 +       old_grabbed = ctx->grabbed_blocks;
39428 +
39429 +       grab_space_enable();
39430 +
39431 +       result = reiser4_grab_space(/* one for stat data update */
39432 +               estimate_update_common(inode),
39433 +               0/* flags */);
39434 +       if (result)
39435 +               return result;
39436 +       if (do_update) {
39437 +               INODE_SET_FIELD(inode, i_size, new_size); /* unconditional: @update_i_size is ignored */
39438 +               inode->i_ctime = inode->i_mtime = CURRENT_TIME;
39439 +               result = reiser4_update_sd(inode);
39440 +       }
39441 +       grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - old_grabbed);
39442 +       return result;
39443 +}
39444 +
39445 +/* stick pages into united flow (clust->buf), then release the ones; returns 0 or -ENOMEM */
39446 +reiser4_internal int
39447 +flush_cluster_pages(reiser4_cluster_t * clust, struct inode * inode)
39448 +{
39449 +       int i;
39450 +
39451 +       assert("edward-236", inode != NULL);
39452 +       assert("edward-237", clust != NULL);
39453 +       assert("edward-238", clust->off == 0);
39454 +       assert("edward-239", clust->count == 0);
39455 +       assert("edward-240", clust->delta == 0);
39456 +       assert("edward-241", schedulable());
39457 +       assert("edward-718", crc_inode_ok(inode));
39458 +       /* the handle arrives with an empty window (asserted above); count and nr_pages are set here */
39459 +       clust->count = fsize_to_count(clust, inode);
39460 +       set_nrpages_by_frame(clust);
39461 +
39462 +       cluster_reserved2grabbed(estimate_insert_cluster(inode, 0));
39463 +
39464 +       /* estimate max size of the cluster after compression and encryption
39465 +          including all appended infrastructure, and allocate a buffer */
39466 +       clust->bsize = clust->count + max_crypto_overhead(inode);
39467 +       clust->bsize = inode_scaled_offset(inode, clust->bsize);
39468 +       /* clamp to the scaled cluster size first, compression overhead is added on top of that */
39469 +       if (clust->bsize > inode_scaled_cluster_size(inode))
39470 +               clust->bsize = inode_scaled_cluster_size(inode);
39471 +       if (try_compress(clust, inode))
39472 +               clust->bsize += compress_overhead(inode, clust->count);
39473 +
39474 +       clust->buf = reiser4_kmalloc(clust->bsize, GFP_KERNEL);
39475 +       if (!clust->buf)
39476 +               return -ENOMEM;
39477 +       /* delayed case: only look the single page up and return, nothing is copied into clust->buf here */
39478 +       if (delay_flush_pgcluster(clust, inode)) {
39479 +               /* delay flushing */
39480 +               assert("edward-612", clust->nr_pages == 1);
39481 +               /* clust->pages is replaced by a freshly allocated one-slot array */
39482 +               clust->pages = reiser4_kmalloc(sizeof(*clust->pages), GFP_KERNEL);
39483 +               if (!clust->pages) {
39484 +                       reiser4_kfree(clust->buf); /* NOTE(review): clust->buf is not reset after the free */
39485 +                       return -ENOMEM;
39486 +               }
39487 +               *clust->pages = find_get_page(inode->i_mapping, clust_to_pg(clust->index, inode));
39488 +               /* a missing or clean page is caught by assertions only */
39489 +               assert("edward-613", *clust->pages != NULL);
39490 +               assert("edward-614", PageDirty(*clust->pages));
39491 +               assert("edward-720", crc_inode_ok(inode));
39492 +
39493 +               return 0;
39494 +       }
39495 +
39496 +       /* flush more than one page after its assembling into united flow */
39497 +       for (i=0; i < clust->nr_pages; i++){
39498 +               struct page * page;
39499 +               char * data;
39500 +
39501 +               page = find_get_page(inode->i_mapping, clust_to_pg(clust->index, inode) + i);
39502 +
39503 +               assert("edward-242", page != NULL);
39504 +               assert("edward-243", PageDirty(page));
39505 +               assert("edward-634", clust->count <= clust->bsize);
39506 +               /* FIXME_EDWARD: Make sure that jnodes are from the same dirty list */
39507 +               /* under the page lock: copy page i into its slot of clust->buf, then uncapture the page */
39508 +               lock_page(page);
39509 +               data = kmap(page);
39510 +               xmemcpy(clust->buf + pg_to_off(i), data, off_to_pgcount(clust->count, i));
39511 +               kunmap(page);
39512 +               uncapture_page(page);
39513 +               unlock_page(page);
39514 +               page_cache_release(page); /* pairs with find_get_page() above */
39515 +               assert("edward-721", crc_inode_ok(inode));
39516 +       }
39517 +       return 0;
39518 +}
39519 +/* fill @hint (seal, offset, level, mode) for the cluster with index @index of a cryptcompress file */
39520 +static void
39521 +set_hint_cluster(struct inode * inode, hint_t * hint, unsigned long index, znode_lock_mode mode)
39522 +{
39523 +       reiser4_key key;
39524 +       assert("edward-722", crc_inode_ok(inode));
39525 +       assert("edward-723", inode_file_plugin(inode) == file_plugin_by_id(CRC_FILE_PLUGIN_ID));
39526 +       /* build the key for file offset clust_to_off(index, inode) */
39527 +       inode_file_plugin(inode)->key_by_inode(inode, clust_to_off(index, inode), &key);
39528 +       /* the seal is taken on the coord already stored in the hint; offset comes from the key */
39529 +       seal_init(&hint->seal, &hint->coord.base_coord, &key);
39530 +       hint->offset = get_key_offset(&key);
39531 +       hint->level = znode_get_level(hint->coord.base_coord.node);
39532 +       hint->mode = mode;
39533 +       //set_hint(hint, &key, ZNODE_WRITE_LOCK);
39534 +}
39535 +/* advance @clust to the next step, refresh its hint, update the inode and balance dirty pages */
39536 +static int
39537 +balance_dirty_page_cluster(reiser4_cluster_t * clust, loff_t off, loff_t to_file)
39538 +{
39539 +       int result;
39540 +       loff_t new_size;
39541 +       struct inode * inode;
39542 +
39543 +       assert("edward-724", clust->file != NULL);
39544 +
39545 +       inode = clust->file->f_dentry->d_inode;
39546 +
39547 +       assert("edward-725", crc_inode_ok(inode));
39548 +       /* end offset of the current window; taken before update_cluster() rewrites index/off/count */
39549 +       new_size = clust_to_off(clust->index, inode) + clust->off + clust->count;
39550 +       /* set hint for next cluster */
39551 +       update_cluster(inode, clust, off, to_file);
39552 +       set_hint_cluster(inode, clust->hint, clust->index, ZNODE_WRITE_LOCK);
39553 +       /* the long-term znode lock is dropped before the inode update and the balancing below */
39554 +       longterm_unlock_znode(clust->hint->coord.lh);
39555 +
39556 +       result = update_inode_cryptcompress(inode, new_size, (new_size > inode->i_size ? 1 : 0), 1, 1/* update stat data */);
39557 +       if (result)
39558 +               return result;
39559 +       assert("edward-726", clust->hint->coord.lh->owner == NULL);
39560 +       atomic_inc(&inode->i_count); /* NOTE(review): no matching decrement in this function - confirm who drops it */
39561 +       balance_dirty_pages_ratelimited(inode->i_mapping);
39562 +
39563 +       return 0;
39564 +}
39565 +
39566 +/* Zero the hole frame [clust->off, clust->off + clust->count) in the cluster pages and, when no user data follows in this cluster (!clust->delta), dirty and try to capture the pages. Returns 0 or an error. */
39567 +static int
39568 +write_hole(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off, loff_t to_file)
39569 +{
39570 +       char * data;
39571 +       int result = 0;
39572 +       unsigned cl_off, cl_count = 0;
39573 +       unsigned to_pg, pg_off;
39574 +
39575 +       assert ("edward-190", clust != NULL);
39576 +       assert ("edward-191", inode != NULL);
39577 +       assert ("edward-727", crc_inode_ok(inode));
39578 +       assert ("edward-192", cluster_invariant(clust, inode));
39579 +       assert ("edward-201", clust->stat == HOLE_CLUSTER);
39580 +       /* in these two cases no page is touched: only the cluster handle is updated */
39581 +       if ((clust->off == 0 && clust->count == inode_cluster_size(inode)) /* fake cluster */ ||
39582 +           (clust->count == 0) /* nothing to write */) {
39583 +               update_cluster(inode, clust, file_off, to_file);
39584 +               return 0;
39585 +       }
39586 +       cl_count = clust->count; /* number of zeroes to write */
39587 +       cl_off = clust->off;
39588 +       pg_off = off_to_pgoff(clust->off);
39589 +       /* zero the frame page by page; only the first page starts at a non-zero in-page offset */
39590 +       while (cl_count) {
39591 +               struct page * page;
39592 +               page = clust->pages[off_to_pg(cl_off)];
39593 +
39594 +               assert ("edward-284", page != NULL);
39595 +               /* never go past the end of the page nor past the remaining count */
39596 +               to_pg = min_count(PAGE_CACHE_SIZE - pg_off, cl_count);
39597 +               lock_page(page);
39598 +               data = kmap_atomic(page, KM_USER0);
39599 +               xmemset(data + pg_off, 0, to_pg);
39600 +               kunmap_atomic(data, KM_USER0);
39601 +               unlock_page(page);
39602 +
39603 +               cl_off += to_pg;
39604 +               cl_count -= to_pg;
39605 +               pg_off = 0;
39606 +       }
39607 +       if (!clust->delta) {
39608 +               /* only zeroes, try to flush */
39609 +
39610 +               set_cluster_pages_dirty(clust);
39611 +               result = try_capture_cluster(clust);
39612 +               if (result)
39613 +                       return result;
39614 +               make_cluster_jnodes_dirty(clust);
39615 +               /* hint for updated cluster will be set here */
39616 +               result = balance_dirty_page_cluster(clust, file_off, to_file);
39617 +       }
39618 +       return result;
39619 +}
39620 +
39621 +/*
39622 +  The main disk search procedure for cryptcompress plugins, which
39623 +  . scans all items of disk cluster @clust->index (a flow over @clust->buf is built for it)
39624 +  . maybe reads each one via the item plugin's read method (if @read != 0)
39625 +  . maybe grabs space and makes its znode dirty and squeezable (if @write != 0)
39626 +  Returns 0 (also when the first item is not found), or an error; grabbed space is freed on every exit. */
39627 +reiser4_internal int
39628 +find_cluster(reiser4_cluster_t * clust,
39629 +            struct inode * inode,
39630 +            int read,
39631 +            int write)
39632 +{
39633 +       flow_t f;
39634 +       hint_t * hint;
39635 +       int result;
39636 +       unsigned long cl_idx;
39637 +       ra_info_t ra_info;
39638 +       file_plugin * fplug;
39639 +       item_plugin * iplug;
39640 +
39641 +       assert("edward-138", clust != NULL);
39642 +       assert("edward-728", clust->hint != NULL);
39643 +       assert("edward-225", read || write);
39644 +       assert("edward-226", schedulable());
39645 +       assert("edward-137", inode != NULL);
39646 +       assert("edward-729", crc_inode_ok(inode));
39647 +       assert("edward-461", ergo(read, clust->buf != NULL));
39648 +       assert("edward-462", ergo(!read, !cluster_is_uptodate(clust)));
39649 +       assert("edward-474", get_current_context()->grabbed_blocks == 0);
39650 +
39651 +       hint = clust->hint;
39652 +       cl_idx = clust->index;
39653 +       fplug = inode_file_plugin(inode);
39654 +
39655 +       /* build flow for the cluster */
39656 +       fplug->flow_by_inode(inode, clust->buf, 0 /* kernel space */,
39657 +                            inode_scaled_cluster_size(inode), clust_to_off(cl_idx, inode), READ_OP, &f);
39658 +       if (write) {
39659 +               result = reiser4_grab_space_force(estimate_disk_cluster(inode), 0);
39660 +               if (result)
39661 +                       goto out2;
39662 +       }
39663 +       ra_info.key_to_stop = f.key;
39664 +       set_key_offset(&ra_info.key_to_stop, get_key_offset(max_key()));
39665 +       /* look items up one by one until the whole flow is consumed or the search leaves the cluster */
39666 +       while (f.length) {
39667 +               result = find_cluster_item(hint, &f.key, 1 /* check key */, (write ? ZNODE_WRITE_LOCK : ZNODE_READ_LOCK), NULL, FIND_EXACT);
39668 +               switch (result) {
39669 +               case CBK_COORD_NOTFOUND:
39670 +                       if (inode_scaled_offset(inode, clust_to_off(cl_idx, inode)) == get_key_offset(&f.key)) {
39671 +                               /* first item not found */
39672 +                               if (read)
39673 +                                       /* hole cluster */
39674 +                                       clust->stat = FAKE_CLUSTER;
39675 +                               result = 0;
39676 +                               goto out2;
39677 +                       }
39678 +                       /* we are outside the cluster, stop search here */
39679 +                       assert("edward-146", f.length != inode_scaled_cluster_size(inode));
39680 +                       //crc_invalidate_extended_coord(&hint->coord);
39681 +                       goto ok;
39682 +               case CBK_COORD_FOUND:
39683 +                       assert("edward-148", hint->coord.base_coord.between == AT_UNIT);
39684 +                       assert("edward-460", hint->coord.base_coord.unit_pos == 0);
39685 +                       /* load the node (with read-ahead info) while the item is accessed; paired with zrelse() below or at "out" */
39686 +                       coord_clear_iplug(&hint->coord.base_coord);
39687 +                       result = zload_ra(hint->coord.base_coord.node, &ra_info);
39688 +                       if (unlikely(result))
39689 +                               goto out2;
39690 +                       iplug = item_plugin_by_coord(&hint->coord.base_coord);
39691 +                       assert("edward-147", item_plugin_by_coord(&hint->coord.base_coord) == item_plugin_by_id(CTAIL_ID));
39692 +                       if (read) {
39693 +                               result = iplug->s.file.read(NULL, &f, hint);
39694 +                               if(result)
39695 +                                       goto out;
39696 +                       }
39697 +                       if (write) {
39698 +                               znode_make_dirty(hint->coord.base_coord.node);
39699 +                               znode_set_squeezable(hint->coord.base_coord.node);
39700 +                               if (!read)
39701 +                                       move_flow_forward(&f, iplug->b.nr_units(&hint->coord.base_coord));
39702 +                       }
39703 +                       crc_validate_extended_coord(&hint->coord, get_key_offset(&f.key));
39704 +                       zrelse(hint->coord.base_coord.node);
39705 +                       break;
39706 +               default:
39707 +                       goto out2;
39708 +               }
39709 +       }
39710 + ok:
39711 +       /* at least one item was found  */
39712 +       /* NOTE-EDWARD: Callers should handle the case when disk cluster is incomplete (-EIO) */
39713 +       clust->len = inode_scaled_cluster_size(inode) - f.length;
39714 +       set_hint_cluster(inode, clust->hint, clust->index + 1, write ? ZNODE_WRITE_LOCK : ZNODE_READ_LOCK);
39715 +       all_grabbed2free();
39716 +       return 0;
39717 + out:
39718 +       zrelse(hint->coord.base_coord.node);
39719 + out2:
39720 +       all_grabbed2free();
39721 +       return result;
39722 +}
39723 +
39724 +static int
39725 +get_disk_cluster_locked(reiser4_cluster_t * clust, znode_lock_mode lock_mode)
39726 +{
39727 +       reiser4_key key;
39728 +       ra_info_t ra_info;
39729 +       struct inode * inode;
39730 +       /* Look up the tree position for cluster @clust->index with @lock_mode through @clust->hint; returns the result of find_cluster_item(). */
39731 +       assert("edward-730", schedulable());
39732 +       assert("edward-731", clust != NULL);
39733 +       assert("edward-732", clust->file != NULL);
39734 +
39735 +       inode = clust->file->f_dentry->d_inode;
39736 +       key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode), &key);
39737 +       ra_info.key_to_stop = key;
39738 +       set_key_offset(&ra_info.key_to_stop, get_key_offset(max_key()));
39739 +       /* NOTE(review): @ra_info is filled in above, but NULL is passed in the ra_info argument below - confirm this is intended */
39740 +       return find_cluster_item(clust->hint, &key, 0 /* don't check key */, lock_mode, NULL, FIND_MAX_NOT_MORE_THAN);
39741 +}
39742 +
39743 +/* Read before write: fill the cluster pages that are not up to date and will not be completely overwritten.
39744 +   We don't take an interest in how many bytes were written when an error occurs */
39745 +static int
39746 +read_some_cluster_pages(struct inode * inode, reiser4_cluster_t * clust)
39747 +{
39748 +       int i;
39749 +       int result = 0;
39750 +       unsigned to_read;
39751 +       item_plugin * iplug;
39752 +
39753 +       iplug = item_plugin_by_id(CTAIL_ID);
39754 +       /* NOTE(review): @iplug is assigned above but not used in the rest of this function */
39755 +       assert("edward-733", get_current_context()->grabbed_blocks == 0);
39756 +
39757 +       if (new_cluster(clust, inode)) {
39758 +               /* new cluster, nothing to read, but anyway find a position in the tree */
39759 +               assert("edward-734", schedulable());
39760 +               assert("edward-735", clust->hint->coord.lh->owner == NULL);
39761 +
39762 +               /* since we read before write, take a write lock */
39763 +               result = get_disk_cluster_locked(clust, ZNODE_WRITE_LOCK);
39764 +               if (cbk_errored(result))
39765 +                       return result;
39766 +               assert("edward-736", clust->hint->coord.base_coord.node == clust->hint->coord.lh->node);
39767 +               return 0;
39768 +       }
39769 +       /* bytes we wanna read starting from the beginning of cluster
39770 +          to keep first @off ones */
39771 +       to_read = clust->off + clust->count + clust->delta;
39772 +
39773 +       assert("edward-298", to_read <= inode_cluster_size(inode));
39774 +
39775 +       for (i = 0; i < clust->nr_pages; i++) {
39776 +               struct page * pg = clust->pages[i];
39777 +
39778 +               if (clust->off <= pg_to_off(i) && pg_to_off(i) <= to_read - 1)
39779 +                       /* page will be completely overwritten */
39780 +                       continue;
39781 +               lock_page(pg);
39782 +               if (PageUptodate(pg)) {
39783 +                       unlock_page(pg);
39784 +                       continue;
39785 +               }
39786 +               unlock_page(pg);
39787 +               /* the page lock is dropped while the disk cluster is read and retaken for do_readpage_ctail() */
39788 +               if (!cluster_is_uptodate(clust)) {
39789 +                       /* read cluster and mark its znodes dirty */
39790 +                       result = ctail_read_cluster(clust, inode, 1 /* write */);
39791 +                       if (result)
39792 +                               goto out;
39793 +               }
39794 +               lock_page(pg);
39795 +               result =  do_readpage_ctail(clust, pg);
39796 +               unlock_page(pg);
39797 +               if (result) {
39798 +                       impossible("edward-219", "do_readpage_ctail returned crap");
39799 +                       goto out;
39800 +               }
39801 +       }
39802 +       if (!cluster_is_uptodate(clust)) {
39803 +               /* disk cluster unclaimed, but we need to make its znodes dirty
39804 +                  to make flush rewrite its content */
39805 +               result = find_cluster(clust, inode, 0 /* do not read */, 1 /*write */);
39806 +               if (!cbk_errored(result))
39807 +                       result = 0;
39808 +       }
39809 + out:
39810 +       release_cluster_buf(clust);
39811 +       return result;
39812 +}
39813 +
39814 +static int
39815 +crc_make_unprepped_cluster (reiser4_cluster_t * clust, struct inode * inode)
39816 +{
39817 +       assert("edward-737", clust != NULL);
39818 +       assert("edward-738", inode != NULL);
39819 +       assert("edward-739", crc_inode_ok(inode));
39820 +       /* thin wrapper: after the sanity checks, delegate to ctail_make_unprepped_cluster() and return its result */
39821 +       return ctail_make_unprepped_cluster(clust, inode);
39822 +}
39823 +
39824 +/* Prepare a page cluster before write. Called by write, writepage, truncate, etc..
39825 +   . grab cluster pages (their number is *@nr_pages, or is guessed from the frame if @nr_pages is NULL),
39826 +   . reserve space and maybe read pages from disk,
39827 +   . maybe write hole, or make a new/fake cluster unprepped.
39828 +   Returns 0 on success; a failure after the pages were grabbed puts the jnodes (and frees the reservation once made) */
39829 +static int
39830 +prepare_cluster(struct inode *inode,
39831 +               loff_t file_off /* write position in the file */,
39832 +               loff_t to_file, /* bytes of users data to write to the file */
39833 +               int * nr_pages, /* advised number of pages */
39834 +               reiser4_cluster_t *clust,
39835 +               const char * msg)
39836 +
39837 +{
39838 +       char *data;
39839 +       int result = 0;
39840 +       unsigned o_c_d;
39841 +
39842 +       assert("edward-177", inode != NULL);
39843 +       assert("edward-741", crc_inode_ok(inode));
39844 +       assert("edward-280", cluster_invariant(clust, inode));
39845 +       /* o_c_d: length of the frame counted from clust->off (count plus delta) */
39846 +       o_c_d = clust->count + clust->delta;
39847 +
39848 +       if (nr_pages != NULL) {
39849 +               assert("edward-422", *nr_pages <= inode_cluster_pages(inode));
39850 +               clust->nr_pages = *nr_pages;
39851 +       }
39852 +       else {  /* wasn't advised, guess by frame */
39853 +               assert("edward-740", clust->pages != NULL);
39854 +#if REISER4_DEBUG
39855 +               xmemset(clust->pages, 0, sizeof(clust->pages) << inode_cluster_shift(inode));
39856 +#endif
39857 +               set_nrpages_by_frame(clust);
39858 +       }
39859 +       if(!clust->nr_pages)
39860 +               /* do nothing */
39861 +               return 0;
39862 +       /* collect unlocked pages and jnodes */
39863 +       result = grab_cache_cluster(inode, clust);
39864 +       if (result)
39865 +               return result;
39866 +       if (clust->off == 0 && inode->i_size <= clust_to_off(clust->index, inode) + o_c_d) {
39867 +               /* we don't need to read cluster from disk, just
39868 +                  align the current chunk of data up to nr_pages */
39869 +               unsigned off = off_to_pgcount(o_c_d, clust->nr_pages - 1);
39870 +               struct page * pg = clust->pages[clust->nr_pages - 1];
39871 +               crypto_plugin * cplug = inode_crypto_plugin(inode);
39872 +
39873 +               assert("edward-285", pg != NULL);
39874 +               /* fill the tail of the last page: by the crypto plugin's align_cluster() if present, with zeroes otherwise */
39875 +               lock_page(pg);
39876 +               data = kmap_atomic(pg, KM_USER0);
39877 +               if (inode_get_crypto(inode) && cplug->align_cluster)
39878 +                       cplug->align_cluster(data + off, off, PAGE_CACHE_SIZE);
39879 +               else
39880 +                       xmemset(data + off, 0, PAGE_CACHE_SIZE - off);
39881 +               kunmap_atomic(data, KM_USER0);
39882 +               unlock_page(pg);
39883 +       }
39884 +       result = reserve4cluster(inode, clust, msg);
39885 +       if (result)
39886 +               goto exit1;
39887 +       result = read_some_cluster_pages(inode, clust);
39888 +       if (result)
39889 +               goto exit2;
39890 +
39891 +       assert("edward-742", znode_is_write_locked(clust->hint->coord.base_coord.node));
39892 +       /* finish the preparation according to the cluster status */
39893 +       switch (clust->stat) {
39894 +       case HOLE_CLUSTER:
39895 +               result = write_hole(inode, clust, file_off, to_file);
39896 +               break;
39897 +       case DATA_CLUSTER:
39898 +               if (!new_cluster(clust, inode))
39899 +                       break;
39900 +       case FAKE_CLUSTER:
39901 +               /* page cluster is unprepped (a new DATA_CLUSTER deliberately falls through to here) */
39902 +#ifdef HANDLE_VIA_FLUSH_SCAN
39903 +               set_cluster_unlinked(clust, inode);
39904 +#else
39905 +               /* handling via flush squalloc */
39906 +               result = crc_make_unprepped_cluster(clust, inode);
39907 +               assert("edward-743", crc_inode_ok(inode));
39908 +               assert("edward-744", znode_is_write_locked(clust->hint->coord.lh->node));
39909 +               assert("edward-745", znode_is_dirty(clust->hint->coord.lh->node));
39910 +#endif
39911 +               break;
39912 +       default:
39913 +               impossible("edward-746", "bad cluster status");
39914 +       }
39915 +       if (!result)
39916 +               return 0;
39917 + exit2:
39918 +       free_reserved4cluster(inode, clust);
39919 + exit1:
39920 +       put_cluster_jnodes(clust);
39921 +       return result;
39922 +}
39923 +
39924 +/* Set cluster handle params (index, in-cluster offset, count) for the file range [@o1, @o2): the frame starts at @o1, its count is limited by the end of the cluster containing @o1, and delta is reset to 0 */
39925 +static void
39926 +clust_by_offs(reiser4_cluster_t * clust, struct inode * inode, loff_t o1, loff_t o2)
39927 +{
39928 +       assert("edward-295", clust != NULL);
39929 +       assert("edward-296", inode != NULL);
39930 +       assert("edward-297", o1 <= o2);
39931 +
39932 +       clust->index = off_to_clust(o1, inode);
39933 +       clust->off = off_to_cloff(o1, inode);
39934 +       clust->count = min_count(inode_cluster_size(inode) - clust->off, o2 - o1);
39935 +       clust->delta = 0;
39936 +}
39937 +
39938 +static void
39939 +set_cluster_params(struct inode * inode, reiser4_cluster_t * clust, flow_t * f, loff_t file_off)
39940 +{
39941 +       assert("edward-197", clust != NULL);
39942 +       assert("edward-286", clust->pages != NULL);
39943 +       assert("edward-198", inode != NULL);
39944 +       assert("edward-747", reiser4_inode_data(inode)->cluster_shift <= MAX_CLUSTER_SHIFT);
39945 +       /* Initialize @clust for writing flow @f at @file_off: clear the pages array, then set the frame and the status (HOLE_CLUSTER when writing past i_size, DATA_CLUSTER otherwise) */
39946 +       xmemset(clust->pages, 0, sizeof(clust->pages) << inode_cluster_shift(inode));
39947 +       /* a write that starts beyond the current end of file leaves a gap in front of the data */
39948 +       if (file_off > inode->i_size) {
39949 +               /* Uhmm, hole in crypto-file... */
39950 +               loff_t hole_size;
39951 +               hole_size = file_off - inode->i_size;
39952 +               /* NOTE(review): loff_t values are printed with %llu below - confirm the format specifiers match the argument types */
39953 +               printk("edward-176, Warning: Hole of size %llu in "
39954 +                      "cryptocompressed file (inode %llu, offset %llu) \n",
39955 +                      hole_size, get_inode_oid(inode), file_off);
39956 +               /* the hole frame runs from the old end of file towards the write position (clust_by_offs() limits it to one cluster) */
39957 +               clust_by_offs(clust, inode, inode->i_size, file_off);
39958 +               clust->stat = HOLE_CLUSTER;
39959 +               if (clust->off + hole_size < inode_cluster_size(inode))
39960 +                       /* besides there is also user's data to write to this cluster */
39961 +                       clust->delta = min_count(inode_cluster_size(inode) - (clust->off + clust->count), f->length);
39962 +               return;
39963 +       }
39964 +       clust_by_offs(clust, inode, file_off, file_off + f->length);
39965 +       clust->stat = DATA_CLUSTER;
39966 +}
39967 +
39968 +/* Main write procedure for cryptcompress objects,
39969 +   this slices user's data (@count bytes of @buf, written at @pos) into clusters and copies them to page cache.
39970 +   If @buf != NULL, returns number of bytes in successfully written clusters (or the error if that number is 0),
39971 +   otherwise returns error */
39972 +/* FIXME_EDWARD replace flow by something lightweight */
39973 +
39974 +static loff_t
39975 +write_cryptcompress_flow(struct file * file , struct inode * inode, const char *buf, size_t count, loff_t pos)
39976 +{
39977 +       int i;
39978 +       flow_t f;
39979 +       hint_t hint;
39980 +       lock_handle lh;
39981 +       int result = 0;
39982 +       size_t to_write = 0;
39983 +       loff_t file_off;
39984 +       reiser4_cluster_t clust;
39985 +       struct page ** pages;
39986 +
39987 +       assert("edward-161", schedulable());
39988 +       assert("edward-748", crc_inode_ok(inode));
39989 +       assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
39990 +       assert("edward-749", reiser4_inode_data(inode)->cluster_shift <= MAX_CLUSTER_SHIFT);
39991 +
39992 +       init_lh(&lh);
39993 +       result = load_file_hint(file, &hint, &lh);
39994 +       if (result)
39995 +               return result;
39996 +       //coord_init_invalid(&hint.coord.base_coord, 0);
39997 +
39998 +       pages = reiser4_kmalloc(sizeof(*pages) << inode_cluster_shift(inode), GFP_KERNEL);
39999 +       if (!pages)
40000 +               return -ENOMEM;
40001 +       result = flow_by_inode_cryptcompress(inode, (char *)buf, 1 /* user space */, count, pos, WRITE_OP, &f);
40002 +       if (result)
40003 +               goto out;
40004 +       to_write = f.length;
40005 +
40006 +        /* current write position in file */
40007 +       file_off = pos;
40008 +       reiser4_cluster_init(&clust);
40009 +       clust.file = file;
40010 +       clust.hint = &hint;
40011 +       clust.pages = pages;
40012 +
40013 +       set_cluster_params(inode, &clust, &f, file_off);
40014 +
40015 +       if (next_cluster_stat(&clust) == HOLE_CLUSTER) {
40016 +               result = prepare_cluster(inode, file_off, f.length, NULL, &clust, "write cryptcompress hole");
40017 +               if (result)
40018 +                       goto out;
40019 +       }
40020 +       do {
40021 +               char *src;
40022 +               unsigned page_off, page_count, left;
40023 +
40024 +               assert("edward-750", schedulable());
40025 +
40026 +               result = prepare_cluster(inode, file_off, f.length, NULL, &clust, "write cryptcompress flow");  /* jp+ */
40027 +               if (result)
40028 +                       goto out;
40029 +               assert("edward-751", crc_inode_ok(inode));
40030 +               assert("edward-204", clust.stat == DATA_CLUSTER);
40031 +               assert("edward-752", znode_is_write_locked(hint.coord.base_coord.node));
40032 +
40033 +               /* set write position in page */
40034 +               page_off = off_to_pgoff(clust.off);
40035 +
40036 +                /* copy user's data to cluster pages; @left is what remains of the frame, @src advances by the bytes really copied */
40037 +               for (i = off_to_pg(clust.off), src = f.data, left = clust.count; i < count_to_nrpages(clust.off + clust.count); i++, src += page_count, left -= page_count) {
40038 +                       page_count = min_count(PAGE_CACHE_SIZE - page_off, left);
40039 +
40040 +                       assert("edward-287", pages[i] != NULL);
40041 +
40042 +                       lock_page(pages[i]);
40043 +                       result = __copy_from_user((char *)kmap(pages[i]) + page_off, src, page_count);
40044 +                       kunmap(pages[i]);
40045 +                       if (unlikely(result)) {
40046 +                               unlock_page(pages[i]);
40047 +                               result = -EFAULT;
40048 +                               release_cluster(&clust);                            /* jp- */
40049 +                               goto err1;
40050 +                       }
40051 +                       unlock_page(pages[i]);
40052 +                       page_off = 0;
40053 +               }
40054 +               assert("edward-753", crc_inode_ok(inode));
40055 +
40056 +               set_cluster_pages_dirty(&clust);                                    /* p- */
40057 +
40058 +               result = try_capture_cluster(&clust);
40059 +               if (result)
40060 +                       goto err2;
40061 +               assert("edward-754", znode_is_dirty(hint.coord.base_coord.node));
40062 +               make_cluster_jnodes_dirty(&clust);                                  /* j- */
40063 +               move_flow_forward(&f, clust.count);
40064 +
40065 +               /* . update cluster
40066 +                  . set hint for new offset
40067 +                  . unlock znode
40068 +                  . update inode
40069 +                  . balance dirty pages
40070 +               */
40071 +               result = balance_dirty_page_cluster(&clust, 0, f.length);
40072 +               if(result)
40073 +                       goto err1;
40074 +               assert("edward-755", hint.coord.lh->owner == NULL);
40075 +               continue;
40076 +       err2:
40077 +               put_cluster_jnodes(&clust);                                         /* j- */
40078 +       err1:
40079 +               free_reserved4cluster(inode, &clust);
40080 +               break;
40081 +       } while (f.length);
40082 +       done_lh(&lh);
40083 + out: /* NOTE(review): the error paths that jump here skip done_lh(&lh) above - confirm no long-term lock can still be held */
40084 +       if (result == -EEXIST)
40085 +               printk("write returns EEXIST!\n");
40086 +
40087 +       reiser4_kfree(pages);
40088 +       save_file_hint(file, &hint);
40089 +       if (buf) {
40090 +               /* if nothing were written - there must be an error */
40091 +               assert("edward-195", ergo((to_write == f.length), result < 0));
40092 +               return (to_write - f.length) ? (to_write - f.length) : result;
40093 +       }
40094 +       return result;
40095 +}
40096 +
40097 +static ssize_t
40098 +write_crc_file(struct file * file, /* file to write to */
40099 +          struct inode *inode, /* inode */
40100 +          const char *buf, /* address of user-space buffer */
40101 +          size_t count, /* number of bytes to write */
40102 +          loff_t * off /* position to write which */)
40103 +{
40104 +       /* Check the request with generic_write_checks(), write via write_cryptcompress_flow() and advance *@off. Returns the number of bytes written, 0 for an empty write, or a negative error. */
40105 +       int result;
40106 +       loff_t pos;
40107 +       ssize_t written;
40108 +
40109 +       assert("edward-196", crc_inode_ok(inode));
40110 +       /* generic_write_checks() gets pointers to both the position and the count */
40111 +       result = generic_write_checks(file, off, &count, 0);
40112 +       if (unlikely(result != 0))
40113 +               return result;
40114 +
40115 +       if (unlikely(count == 0))
40116 +               return 0;
40117 +
40118 +        /* FIXME-EDWARD: other UNIX features */
40119 +
40120 +       pos = *off;
40121 +       written = write_cryptcompress_flow(file, inode, (char *)buf, count, pos);
40122 +       if (written < 0) {
40123 +               if (written == -EEXIST)
40124 +                       printk("write_crc_file returns EEXIST!\n");
40125 +               return written;
40126 +       }
40127 +
40128 +        /* update position in a file */
40129 +       *off = pos + written;
40130 +       /* return number of written bytes */
40131 +       return written;
40132 +}
40133 +
40134 +/* plugin->u.file.write: runs write_crc_file() for the file's inode with the inode's i_sem held and returns its result */
40135 +reiser4_internal ssize_t
40136 +write_cryptcompress(struct file * file, /* file to write to */
40137 +                   const char *buf, /* address of user-space buffer */
40138 +                   size_t count, /* number of bytes to write */
40139 +                   loff_t * off /* position to write which */)
40140 +{
40141 +       ssize_t result;
40142 +       struct inode *inode;
40143 +
40144 +       inode = file->f_dentry->d_inode;
40145 +
40146 +       down(&inode->i_sem);
40147 +
40148 +       result = write_crc_file(file, inode, buf, count, off);
40149 +
40150 +       up(&inode->i_sem);
40151 +       return result;
40152 +}
40153 +
40154 +/* Helper function for cryptcompress_truncate: stores in *@size the offset of the append key of the object's last item (0 if no item is found). Returns 0 or an error; *@size is not set on error. */
40155 +static int
40156 +find_object_size(struct inode *inode, loff_t * size)
40157 +{
40158 +       int result;
40159 +       reiser4_key key;
40160 +       hint_t hint;
40161 +       coord_t *coord;
40162 +       lock_handle lh;
40163 +       item_plugin *iplug;
40164 +       file_plugin *fplug = inode_file_plugin(inode);
40165 +
40166 +       assert("edward-95", crc_inode_ok(inode));
40167 +       /* key of this object with the maximal possible offset */
40168 +       fplug->key_by_inode(inode, get_key_offset(max_key()), &key);
40169 +
40170 +       hint_init_zero(&hint, &lh);
40171 +       /* find the last item of this object */
40172 +       result = find_cluster_item(&hint, &key, 0, ZNODE_READ_LOCK, 0/* ra_info */, FIND_MAX_NOT_MORE_THAN);
40173 +       if (result == CBK_COORD_NOTFOUND) {
40174 +               /* object doesn't have any item */
40175 +               done_lh(&lh);
40176 +               *size = 0;
40177 +               return 0;
40178 +       }
40179 +       if (result != CBK_COORD_FOUND) {
40180 +               /* error occurred */
40181 +               done_lh(&lh);
40182 +               return result;
40183 +       }
40184 +       coord = &hint.coord.base_coord;
40185 +
40186 +       /* there is at least one item */
40187 +       coord_clear_iplug(coord);
40188 +       result = zload(coord->node);
40189 +       if (unlikely(result)) {
40190 +               done_lh(&lh);
40191 +               return result;
40192 +       }
40193 +       iplug = item_plugin_by_coord(coord);
40194 +       assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID));
40195 +       assert("edward-659", cluster_shift_by_coord(coord) == inode_cluster_shift(inode));
40196 +       /* @key is overwritten with the item's append key; its offset is reported as the object size */
40197 +       iplug->s.file.append_key(coord, &key);
40198 +
40199 +       *size = get_key_offset(&key);
40200 +
40201 +       zrelse(coord->node);
40202 +       done_lh(&lh);
40203 +
40204 +       return 0;
40205 +}
40206 +
40207 +UNUSED_ARG static int
40208 +cut_items_cryptcompress(struct inode *inode, loff_t new_size, int update_sd)
40209 +{
40210 +       reiser4_key from_key, to_key;
40211 +       reiser4_key smallest_removed;
40212 +       int result = 0;
40213 +       /* Cut the object's items from offset off_to_clust_to_off(@new_size) (presumably the start of the cluster containing @new_size - confirm) up to the maximal offset. Returns 0 or an error. */
40214 +       assert("edward-293", inode_file_plugin(inode)->key_by_inode == key_by_inode_cryptcompress);
40215 +       key_by_inode_cryptcompress(inode, off_to_clust_to_off(new_size, inode), &from_key);
40216 +       to_key = from_key;
40217 +       set_key_offset(&to_key, get_key_offset(max_key()));
40218 +       /* each pass reserves space, cuts, and updates the inode using the offset of the smallest removed key */
40219 +       while (1) {
40220 +               result = reserve_cut_iteration(tree_by_inode(inode));
40221 +               if (result)
40222 +                       break;
40223 +
40224 +               result = cut_tree_object(current_tree, &from_key, &to_key,
40225 +                                        &smallest_removed, inode, 0);
40226 +               if (result == -E_REPEAT) {
40227 +                       /* -E_REPEAT is a signal to interrupt a long file truncation process */
40228 +                       /* FIXME(Zam) cut_tree does not support that signaling.*/
40229 +                       result = update_inode_cryptcompress
40230 +                               (inode, get_key_offset(&smallest_removed), 1, 1, update_sd);
40231 +                       if (result)
40232 +                               break;
40233 +
40234 +                       all_grabbed2free();
40235 +                       reiser4_release_reserved(inode->i_sb);
40236 +
40237 +                       txn_restart_current();
40238 +                       continue;
40239 +               }
40240 +               if (result)
40241 +                       break;
40242 +               result = update_inode_cryptcompress
40243 +                       (inode, get_key_offset(&smallest_removed), 1, 1, update_sd);
40244 +               break;
40245 +       }
40246 +       /* reached on success and on error alike: give back what is still grabbed or reserved */
40247 +       all_grabbed2free();
40248 +       reiser4_release_reserved(inode->i_sb);
40249 +       return result;
40250 +}
40251 +
40252 +/* The following two procedures are called when truncate decided
40253 +   to deal with real items. This one runs the write path at position @new_size with no file, no buffer and a zero byte count. */
40254 +static int
40255 +cryptcompress_append_hole(struct inode * inode, loff_t new_size)
40256 +{
40257 +       return write_cryptcompress_flow(0, inode, 0, 0, new_size); /* NOTE(review): file is 0 here, while balance_dirty_page_cluster() dereferences clust->file - confirm this path is safe */
40258 +}
40259 +
40260 +reiser4_internal void
40261 +truncate_cluster(struct inode * inode, pgoff_t start, long count)
40262 +{
40263 +       truncate_inode_pages_range(inode->i_mapping, start, count); /* first the pages of the inode's mapping for (@start, @count) */
40264 +       truncate_jnodes_range(inode, start, count); /* then the jnodes, with the same (@start, @count) arguments */
40265 +}
40266 +
40267 +static int
40268 +shorten_cryptcompress(struct inode * inode, loff_t new_size, int update_sd,
40269 +                     loff_t asize)
40270 +{
40271 +       int result;
40272 +       pgoff_t start, count;
40273 +       struct page ** pages;
40274 +       loff_t old_size;
40275 +       char * kaddr;
40276 +       unsigned pgoff;
40277 +       reiser4_cluster_t clust;
40278 +       crypto_plugin * cplug;
40279 +       /* shrink file to @new_size: cut items, then trim, align and capture the partial last cluster; 0 or error */
40280 +       assert("edward-290", inode->i_size > new_size);
40281 +       assert("edward-756", crc_inode_ok(inode));
40282 +
40283 +       pgoff = 0;
40284 +       start = count = 0;
40285 +       old_size = inode->i_size;
40286 +       cplug = inode_crypto_plugin(inode);
40287 +
40288 +       result = cut_file_items(inode, new_size, update_sd, asize, 0);
40289 +
40290 +       if(result)
40291 +               return result;
40292 +
40293 +       assert("edward-660", ergo(!new_size,
40294 +                                 (reiser4_inode_data(inode)->anonymous_eflushed == 0 &&
40295 +                                  reiser4_inode_data(inode)->captured_eflushed == 0)));
40296 +
40297 +       if (!off_to_cloff(new_size, inode))
40298 +               /* truncated to cluster boundary (1) */
40299 +               return 0;
40300 +       /* there is a cluster which should be modified and flushed */
40301 +       pages = reiser4_kmalloc(sizeof(*pages) << inode_cluster_shift(inode), GFP_KERNEL);
40302 +       if (!pages)
40303 +               return -ENOMEM;
40304 +
40305 +       reiser4_cluster_init(&clust);
40306 +       clust.pages = pages;
40307 +       /* set frame */
40308 +       clust_by_offs(&clust, inode, new_size, old_size);
40309 +
40310 +       /* read the whole cluster */
40311 +       result = prepare_cluster(inode, 0, 0, NULL, &clust, "shorten cryptcompress");
40312 +       if (result) {
40313 +               reiser4_kfree(pages);
40314 +               return result;
40315 +       }
40316 +                                                                           /* jp+ */
40317 +       /* truncate last cluster pages and jnodes */
40318 +       assert("edward-294", clust.stat == DATA_CLUSTER);
40319 +       assert("edward-661", clust.off > 0);
40320 +
40321 +       pgoff = off_to_pgoff(clust.off);
40322 +
40323 +        /* reduced idx of the first page to release */
40324 +       start = off_to_pg(clust.off - 1) + 1;
40325 +       /* number of pages to release and truncate */
40326 +       count = clust.nr_pages - start;
40327 +
40328 +       /* release last pages which won't participate in flush */
40329 +       release_cluster_pages(&clust, start);
40330 +       /* truncate the pages above, also don't forget about jnodes */
40331 +       truncate_cluster(inode, clust_to_pg(clust.index, inode) + start, count);
40332 +       /* update number of cluster pages */
40333 +       clust.nr_pages = start;
40334 +
40335 +       /* align last non-truncated page */
40336 +       lock_page(pages[clust.nr_pages - 1]);
40337 +       kaddr = kmap_atomic(pages[clust.nr_pages - 1], KM_USER0);
40338 +
40339 +       if (inode_get_crypto(inode) && cplug->align_cluster)
40340 +               cplug->align_cluster(kaddr + pgoff, pgoff, PAGE_CACHE_SIZE);
40341 +       else
40342 +               xmemset(kaddr + pgoff, 0, PAGE_CACHE_SIZE - pgoff);
40343 +       kunmap_atomic(kaddr, KM_USER0);         /* every kmap_atomic() needs its kunmap_atomic() */
40344 +       unlock_page(pages[clust.nr_pages - 1]);
40345 +       set_cluster_pages_dirty(&clust);                  /* p- */
40346 +       result = try_capture_cluster(&clust);
40347 +       if (result) {
40348 +               put_cluster_jnodes(&clust);
40349 +               goto exit;
40350 +       }
40351 +       make_cluster_jnodes_dirty(&clust);                /* j- */
40352 +
40353 +       /* FIXME-EDWARD: Update this using balance dirty cluster pages */
40354 +       assert("edward-757", 0 /* don't free reserved4cluster when success */);
40355 +       result = update_inode_cryptcompress(inode, new_size, 1, 1, update_sd);
40356 +       if (result)     /* throttle writeback only after a successful update */
40357 +               goto exit;
40358 +       balance_dirty_pages_ratelimited(inode->i_mapping);
40359 + exit:
40360 +       free_reserved4cluster(inode, &clust);
40361 +       reiser4_kfree(pages);
40362 +       return result;
40363 +}
40364 +
40365 +/* Change the size of @inode to @new_size. This is called in setattr_cryptcompress when it is used to truncate,
40366 +   in delete_cryptcompress and in pre_delete_cryptcompress. Returns 0 or the error code of a failed callee. */
40367 +
40368 +static int
40369 +cryptcompress_truncate(struct inode *inode, /* file to resize; its i_size is the old size */
40370 +                      loff_t new_size, /* new size */
40371 +                      int update_sd) /* non-zero: update stat data as well */
40372 +{
40373 +       int result;
40374 +       loff_t old_size = inode->i_size;
40375 +       loff_t asize; /* actual size, as reported by find_object_size() */
40376 +
40377 +       result = find_object_size(inode, &asize);
40378 +
40379 +       if (result)
40380 +               return result;
40381 +       if (!asize ||
40382 +           /* no items */
40383 +           off_to_clust(asize, inode) < off_to_clust(new_size, inode)
40384 +           /* truncating up to fake cluster boundary */) {
40385 +               /* do not touch items */
40386 +               assert("edward-662", !off_to_cloff(new_size, inode));
40387 +
40388 +               INODE_SET_FIELD(inode, i_size, asize);
40389 +               truncate_cluster(inode, size_to_next_pg(new_size),
40390 +                                size_to_pg(old_size) - size_to_next_pg(new_size) + 1);
40391 +               assert("edward-663", ergo(!new_size,
40392 +                                         reiser4_inode_data(inode)->anonymous_eflushed == 0 &&
40393 +                                         reiser4_inode_data(inode)->captured_eflushed == 0));
40394 +
40395 +               if (update_sd) {
40396 +                       result = setattr_reserve_common(tree_by_inode(inode));
40397 +                       if (!result)
40398 +                               result = update_inode_cryptcompress(inode, new_size, 1, 1, 1);
40399 +                       all_grabbed2free();
40400 +               }
40401 +               return result;
40402 +       }
40403 +       result = (old_size < new_size ? cryptcompress_append_hole(inode, new_size) : /* growing: append */
40404 +                 shorten_cryptcompress(inode, new_size, update_sd, asize));         /* otherwise: cut */
40405 +       return result;
40406 +}
40407 +
40408 +/* plugin->u.file.truncate: intentionally does nothing, setattr_cryptcompress() removes the items itself */
40409 +reiser4_internal int
40410 +truncate_cryptcompress(struct inode *inode, loff_t new_size)
40411 +{
40412 +       return 0;
40413 +}
40414 +
40415 +#if 0
40416 +static int
40417 +cryptcompress_writepage(struct page * page, reiser4_cluster_t * clust)
40418 +{
40419 +       int result = 0;
40420 +       int nrpages;
40421 +       struct inode * inode;
40422 +
40423 +       assert("edward-423", page->mapping && page->mapping->host);
40424 +
40425 +       inode = page->mapping->host;
40426 +       reiser4_cluster_init(&clust);
40427 +
40428 +        /* read all cluster pages if necessary */
40429 +       clust.pages = reiser4_kmalloc(sizeof(*clust.pages) << inode_cluster_shift(inode), GFP_KERNEL);
40430 +       if (!pages)
40431 +               return -ENOMEM;
40432 +       clust.index = pg_to_clust(page->index, inode);
40433 +       clust.off = pg_to_off_to_cloff(page->index, inode);
40434 +       clust.count = PAGE_CACHE_SIZE;
40435 +       nrpages = count_to_nrpages(fsize_to_count(&clust, inode));
40436 +
40437 +       result = prepare_cluster(page->mapping->host, 0, 0, &nrpages, &clust, "cryptcompress_writepage");  /* jp+ */
40438 +       if(result)
40439 +               goto exit;
40440 +
40441 +       set_cluster_pages_dirty(&clust);                                  /* p- */
40442 +       result = try_capture_cluster(&clust);
40443 +       if (result) {
40444 +               free_reserved4cluster(inode, &clust);
40445 +               put_cluster_jnodes(&clust);                                     /* j- */
40446 +               goto exit;
40447 +       }
40448 +       lock_page(page);
40449 +       make_cluster_jnodes_dirty(&clust);
40450 +       put_cluster_jnodes(&clust);                                             /* j- */
40451 + exit:
40452 +       reiser4_kfree(clust.pages);
40453 +       return result;
40454 +}
40455 +
40456 +/* make sure for each page the whole cluster was captured */
40457 +static int
40458 +writepages_cryptcompress(struct address_space * mapping)
40459 +{
40460 +       struct list_head *mpages;
40461 +       int result;
40462 +       int nr;
40463 +       int nrpages;
40464 +       int captured = 0, clean = 0, writeback = 0;
40465 +       reiser4_cluster_t * clust;
40466 +
40467 +       reiser4_cluster_init(clust);
40468 +       result = 0;
40469 +       nr = 0;
40470 +
40471 +       spin_lock (&mapping->page_lock);
40472 +
40473 +       mpages = get_moved_pages(mapping);
40474 +       while ((result == 0 || result == 1) && !list_empty (mpages) && nr < CAPTURE_APAGE_BURST) {
40475 +               struct page *pg = list_to_page(mpages);
40476 +
40477 +               assert("edward-481", PageDirty(pg));
40478 +
40479 +               if (!clust->nr_pages || !page_of_cluster(pg, &clust, inode)) {
40480 +                       /* update cluster handle */
40481 +                       clust.index = pg_to_clust(pg->index, inode);
40482 +                       clust.off = pg_to_off_to_cloff(pg->index, inode);
40483 +                       clust.count = PAGE_CACHE_SIZE;
40484 +                       /* advice number of pages */
40485 +                       nrpages = count_to_nrpages(fsize_to_count(&clust, inode));
40486 +
40487 +                       result = prepare_cluster(mapping->host, 0, 0, &nrpages, &clust,
40488 +               }
40489 +               result = capture_anonymous_page(pg, 0);
40490 +               if (result == 1) {
40491 +                       ++ nr;
40492 +                       result = 0;
40493 +               }
40494 +       }
40495 +       spin_unlock(&mapping->page_lock);
40496 +
40497 +       if (result) {
40498 +               warning("vs-1454", "Cannot capture anon pages: %i (%d %d %d)\n", result, captured, clean, writeback);
40499 +               return result;
40500 +       }
40501 +
40502 +
40503 +       if (nr >= CAPTURE_APAGE_BURST)
40504 +               redirty_inode(mapping->host);
40505 +
40506 +       if (result == 0)
40507 +               result = capture_anonymous_jnodes(mapping->host);
40508 +
40509 +       if (result != 0)
40510 +               warning("nikita-3328", "Cannot capture anon pages: %i\n", result);
40511 +       return result;
40512 +}
40513 +
40514 +#endif
40515 +
40516 +/* plugin->u.file.capture
40517 +   FIXME: capture method of file plugin is called by reiser4_writepages. It has to capture all
40518 +   anonymous pages and jnodes of the mapping. See capture_unix_file, for example.
40519 +   Currently a stub: the whole body is compiled out with #if 0, nothing is captured and 0 is returned. */
40520 +reiser4_internal int
40521 +capture_cryptcompress(struct inode *inode, const struct writeback_control *wbc, long *captured)
40522 +{
40523 +
40524 +#if 0
40525 +       int result;
40526 +       struct inode *inode;
40527 +
40528 +       assert("edward-424", PageLocked(page));
40529 +       assert("edward-425", PageUptodate(page));
40530 +       assert("edward-426", page->mapping && page->mapping->host);
40531 +
40532 +       inode = page->mapping->host;
40533 +       assert("edward-427", pg_to_off(page->index) < inode->i_size);
40534 +
40535 +       unlock_page(page);
40536 +       if (pg_to_off(page->index) >= inode->i_size) {
40537 +               /* race with truncate? */
40538 +               lock_page(page);
40539 +               page_cache_release(page);
40540 +               return RETERR(-EIO);
40541 +       }
40542 +       /* FIXME-EDWARD: Estimate insertion */
40543 +       result = cryptcompress_writepage(page);
40544 +       assert("edward-428", PageLocked(page));
40545 +       return result;
40546 +
40547 +       int result;
40548 +       reiser4_context ctx;
40549 +
40550 +       if (!inode_has_anonymous_pages(inode))
40551 +               return 0;
40552 +
40553 +       init_context(&ctx, inode->i_sb);
40554 +
40555 +       ctx.nobalance = 1;
40556 +       assert("edward-482", lock_stack_isclean(get_current_lock_stack()));
40557 +
40558 +       result = 0;
40559 +
40560 +       do {
40561 +               result = writepages_cryptcompress(inode->i_mapping);
40562 +               if (result != 0 || wbc->sync_mode != WB_SYNC_ALL)
40563 +                       break;
40564 +               result = txnmgr_force_commit_all(inode->i_sb, 0);
40565 +       } while (result == 0 && inode_has_anonymous_pages(inode));
40566 +
40567 +       reiser4_exit_context(&ctx);
40568 +       return result;
40569 +#endif
40570 +       return 0;
40571 +}
40572 +
40573 +static inline void
40574 +validate_crc_extended_coord(uf_coord_t *uf_coord, loff_t offset)
40575 +{
40576 +       assert("edward-418", uf_coord->valid == 0);
40577 +       assert("edward-419", item_plugin_by_coord(&uf_coord->base_coord)->s.file.init_coord_extension);
40578 +       /* let the plugin of the item at the base coord set up the coord extension for @offset */
40579 +       /* FIXME: the result of item_body_by_coord() is discarded; presumably called for a side effect - confirm */
40580 +       item_body_by_coord(&uf_coord->base_coord);
40581 +       item_plugin_by_coord(&uf_coord->base_coord)->s.file.init_coord_extension(uf_coord, offset);
40582 +}
40583 +
40584 +/* plugin->u.file.mmap:
40585 +   generic_file_mmap */
40586 +
40587 +/* plugin->u.file.release */
40588 +/* plugin->u.file.get_block: handled only when block size equals the cluster size, -EINVAL otherwise */
40589 +/* This function is used for ->bmap() VFS method in reiser4 address_space_operations */
40590 +reiser4_internal int
40591 +get_block_cryptcompress(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create UNUSED_ARG)
40592 +{
40593 +       if (current_blocksize != inode_cluster_size(inode))
40594 +               return RETERR(-EINVAL);
40595 +       else {
40596 +               int result;
40597 +               reiser4_key key;
40598 +               hint_t hint;
40599 +               lock_handle lh;
40600 +               item_plugin *iplug;
40601 +               /* look up the item at the byte offset of @block under a read lock */
40602 +               assert("edward-420", create == 0);
40603 +               key_by_inode_cryptcompress(inode, (loff_t)block * current_blocksize, &key);
40604 +               hint_init_zero(&hint, &lh);
40605 +               result = find_cluster_item(&hint, &key, 0, ZNODE_READ_LOCK, 0, FIND_EXACT);
40606 +               if (result != CBK_COORD_FOUND) {
40607 +                       done_lh(&lh);
40608 +                       return result; /* lookup result is handed back unchanged */
40609 +               }
40610 +               result = zload(hint.coord.base_coord.node);
40611 +               if (unlikely(result)) {
40612 +                       done_lh(&lh);
40613 +                       return result;
40614 +               }
40615 +               iplug = item_plugin_by_coord(&hint.coord.base_coord);
40616 +
40617 +               assert("edward-421", iplug == item_plugin_by_id(CTAIL_ID));
40618 +               /* NOTE(review): key offset above uses current_blocksize, this one PAGE_CACHE_SHIFT - confirm equal */
40619 +               if (!hint.coord.valid)
40620 +                       validate_crc_extended_coord(&hint.coord,
40621 +                                               (loff_t) block << PAGE_CACHE_SHIFT);
40622 +               if (iplug->s.file.get_block)
40623 +                       result = iplug->s.file.get_block(&hint.coord.base_coord, block, bh_result);
40624 +               else
40625 +                       result = RETERR(-EINVAL);
40626 +               /* release the loaded node and the lock handle */
40627 +               zrelse(hint.coord.base_coord.node);
40628 +               done_lh(&lh);
40629 +               return result;
40630 +       }
40631 +}
40632 +
40633 +/* plugin->u.file.delete */
40634 +/* Truncate a file with no links left to zero size (unless already empty), then delete the object itself */
40635 +reiser4_internal int
40636 +delete_cryptcompress(struct inode *inode)
40637 +{
40638 +       int result;
40639 +
40640 +       assert("edward-429", inode->i_nlink == 0);
40641 +
40642 +       if (inode->i_size) {
40643 +               result = cryptcompress_truncate(inode, 0, 0); /* new size 0, stat data is not updated */
40644 +               if (result) {
40645 +                       warning("edward-430", "cannot truncate cryptcompress file  %lli: %i",
40646 +                               get_inode_oid(inode), result);
40647 +                       return result;
40648 +               }
40649 +       }
40650 +       return delete_object(inode, 0);
40651 +}
40652 +
40653 +/* plugin->u.file.init_inode_data */
40654 +/* plugin->u.file.owns_item:
40655 +   owns_item_common */
40656 +/* plugin->u.file.pre_delete */
40657 +/* Truncate the file to zero size without updating stat data; returns what cryptcompress_truncate() returns */
40658 +reiser4_internal int
40659 +pre_delete_cryptcompress(struct inode *inode)
40660 +{
40661 +       return cryptcompress_truncate(inode, 0, 0);
40662 +}
40663 +
40664 +/* plugin->u.file.setattr method: a size change is done by cryptcompress_truncate() before inode_setattr() runs */
40665 +reiser4_internal int
40666 +setattr_cryptcompress(struct inode *inode,     /* Object to change attributes */
40667 +                     struct iattr *attr /* change description */ )
40668 +{
40669 +       int result;
40670 +
40671 +       if (attr->ia_valid & ATTR_SIZE) {
40672 +               /* EDWARD-FIXME-HANS: VS-FIXME-HANS:
40673 +                  Q: this case occurs when? truncate?
40674 +                  A: yes
40675 +
40676 +                  Q: If so, why isn't this code in truncate itself instead of here?
40677 +
40678 +                  A: because vfs calls fs's truncate after it has called truncate_inode_pages to get rid of pages
40679 +                  corresponding to part of file being truncated. In reiser4 it may cause existence of unallocated
40680 +                  extents which do not have jnodes. Flush code does not expect that. Solution of this problem is
40681 +                  straightforward. As vfs's truncate is implemented using setattr operation (common implementation of
40682 +                  which calls truncate_inode_pages and fs's truncate in case when size of file changes) - it seems
40683 +                  reasonable to have reiser4_setattr which will take care of removing pages, jnodes and extents
40684 +                  simultaneously in case of truncate.
40685 +               */
40686 +
40687 +               /* truncate does reservation itself and requires exclusive access obtained */
40688 +               if (inode->i_size != attr->ia_size) {
40689 +                       loff_t old_size;
40690 +
40691 +                       inode_check_scale(inode, inode->i_size, attr->ia_size);
40692 +
40693 +                       old_size = inode->i_size;
40694 +
40695 +                       result = cryptcompress_truncate(inode, attr->ia_size, 1/* update stat data */);
40696 +
40697 +                       if (!result) {
40698 +                               /* items are removed already. inode_setattr will call vmtruncate to invalidate truncated
40699 +                                  pages and truncate_cryptcompress which will do nothing. FIXME: is this necessary? */
40700 +                               INODE_SET_FIELD(inode, i_size, old_size);
40701 +                               result = inode_setattr(inode, attr);
40702 +                       }
40703 +               } else
40704 +                       result = 0; /* NOTE(review): other attributes in @attr are not applied on this path - confirm */
40705 +       } else {
40706 +               /* FIXME: Edward, please consider calling setattr_common() here */
40707 +               result = setattr_reserve_common(tree_by_inode(inode));
40708 +               if (!result) {
40709 +                       result = inode_setattr(inode, attr);
40710 +                       if (!result)
40711 +                               /* "capture" inode */
40712 +                               result = reiser4_mark_inode_dirty(inode);
40713 +                       all_grabbed2free();
40714 +               }
40715 +       }
40716 +       return result;
40717 +}
40718 +
40719 +/*
40720 +  Local variables:
40721 +  c-indentation-style: "K&R"
40722 +  mode-name: "LC"
40723 +  c-basic-offset: 8
40724 +  tab-width: 8
40725 +  fill-column: 120
40726 +  scroll-step: 1
40727 +  End:
40728 +*/
40729 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/cryptcompress.h linux-2.6.8-rc3-a/fs/reiser4/plugin/cryptcompress.h
40730 --- linux-2.6.8-rc3/fs/reiser4/plugin/cryptcompress.h   1970-01-01 03:00:00.000000000 +0300
40731 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/cryptcompress.h 2004-08-05 21:20:53.249625422 +0400
40732 @@ -0,0 +1,170 @@
40733 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
40734 +/* See http://www.namesys.com/cryptcompress_design.html */
40735 +
40736 +#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
40737 +#define __FS_REISER4_CRYPTCOMPRESS_H__
40738 +
40739 +
40740 +#include <linux/pagemap.h>
40741 +#include <linux/crypto.h>
40742 +
40743 +#define MIN_CLUSTER_SIZE PAGE_CACHE_SIZE
40744 +#define MAX_CLUSTER_SHIFT 4
40745 +#define DEFAULT_CLUSTER_SHIFT 0
40746 +#define MIN_SIZE_FOR_COMPRESSION 64
40747 +#define MIN_CRYPTO_BLOCKSIZE 8
40748 +#define CLUSTER_MAGIC_SIZE (MIN_CRYPTO_BLOCKSIZE >> 1)
40749 +
40750 +/* cluster status */
40751 +typedef enum {
40752 +       DATA_CLUSTER = 0,
40753 +       HOLE_CLUSTER = 1, /* indicates hole for write ops */
40754 +       FAKE_CLUSTER = 2  /* indicates absence of disk cluster for read ops */
40755 +} reiser4_cluster_status;
40756 +
40757 +/* reiser4 transforms; the values index cryptcompress_info.tfm[] */
40758 +typedef enum {
40759 +       CRYPTO_TFM,
40760 +       DIGEST_TFM,
40761 +       COMPRESS_TFM,
40762 +       LAST_TFM /* number of transforms, not a transform itself */
40763 +} reiser4_tfm;
40764 +
40765 +/* Write modes for item conversion in flush squeeze phase */
40766 +typedef enum {
40767 +       CRC_FIRST_ITEM = 1,
40768 +       CRC_APPEND_ITEM = 2,
40769 +       CRC_OVERWRITE_ITEM = 3,
40770 +       CRC_CUT_ITEM = 4
40771 +} crc_write_mode_t;
40772 +
40773 +/* reiser4 cluster manager transforms page cluster into disk cluster (and back) via
40774 +   input/output stream of crypto/compression algorithms using copy on clustering.
40775 +   COC means that page cluster will be assembled into united stream before compression,
40776 +   and output stream of decompression algorithm will be split into pages.
40777 +   This manager consists mostly of operations on the following object which represents
40778 +   one cluster:
40779 +*/
40780 +typedef struct reiser4_cluster{
40781 +       __u8 * buf;      /* pointer to input/output stream of crypto/compression algorithm */
40782 +       size_t bsize;    /* size of the buffer allocated for the stream */
40783 +       size_t len;      /* actual length of the stream above */
40784 +       int nr_pages;    /* number of attached pages */
40785 +       struct page ** pages; /* attached pages */
40786 +       struct file * file; /* NOTE(review): undocumented; presumably the file being read/written - confirm */
40787 +       hint_t * hint;   /* NOTE(review): undocumented; presumably the search hint used for this cluster - confirm */
40788 +       reiser4_cluster_status stat; /* cluster status: DATA_CLUSTER, HOLE_CLUSTER or FAKE_CLUSTER */
40789 +       /* sliding frame of cluster size in loff_t-space to translate main file 'offsets'
40790 +          like read/write position, size, new size (for truncate), etc.. into number
40791 +          of pages, cluster status, etc..*/
40792 +       unsigned long index; /* cluster index, coord of the frame */
40793 +       unsigned off;    /* offset we want to read/write/truncate from */
40794 +       unsigned count;  /* bytes to read/write/truncate */
40795 +       unsigned delta;  /* bytes of user's data to append to the hole */
40796 +} reiser4_cluster_t;
40797 +
40798 +/* security attributes supposed to be stored on disk
40799 +   are loaded by stat-data methods (see plugin/item/static_stat.c) */
40800 +typedef struct crypto_stat {
40801 +       __u8 * keyid;  /* pointer to a fingerprint */
40802 +       __u16 keysize; /* key size, bits */
40803 +} crypto_stat_t;
40804 +
40805 +/* cryptcompress specific part of reiser4_inode */
40806 +typedef struct cryptcompress_info {
40807 +       struct crypto_tfm *tfm[LAST_TFM];
40808 +       __u32 * expkey;
40809 +} cryptcompress_info_t;
40810 +
40811 +cryptcompress_info_t *cryptcompress_inode_data(const struct inode * inode);
40812 +int equal_to_rdk(znode *, const reiser4_key *);
40813 +int equal_to_ldk(znode *, const reiser4_key *);
40814 +int goto_right_neighbor(coord_t *, lock_handle *);
40815 +int load_file_hint(struct file *, hint_t *, lock_handle *);
40816 +void save_file_hint(struct file *, const hint_t *);
40817 +
40818 +/* declarations of functions implementing methods of cryptcompress object plugin */
40819 +void init_inode_data_cryptcompress(struct inode *inode, reiser4_object_create_data *crd, int create);
40820 +int create_cryptcompress(struct inode *, struct inode *, reiser4_object_create_data *);
40821 +int open_cryptcompress(struct inode * inode, struct file * file);
40822 +int truncate_cryptcompress(struct inode *, loff_t size);
40823 +int readpage_cryptcompress(void *, struct page *);
40824 +int capture_cryptcompress(struct inode *inode, const struct writeback_control *wbc, long *);
40825 +ssize_t write_cryptcompress(struct file *, const char *buf, size_t size, loff_t *off);
40826 +int release_cryptcompress(struct inode *inode, struct file *);
40827 +int mmap_cryptcompress(struct file *, struct vm_area_struct *vma);
40828 +int get_block_cryptcompress(struct inode *, sector_t block, struct buffer_head *bh_result, int create);
40829 +int flow_by_inode_cryptcompress(struct inode *, char *buf, int user, loff_t, loff_t, rw_op, flow_t *);
40830 +int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
40831 +int delete_cryptcompress(struct inode *);
40832 +int owns_item_cryptcompress(const struct inode *, const coord_t *);
40833 +int setattr_cryptcompress(struct inode *, struct iattr *);
40834 +void readpages_cryptcompress(struct file *, struct address_space *, struct list_head *pages);
40835 +void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *, int create);
40836 +int pre_delete_cryptcompress(struct inode *);
40837 +void hint_init_zero(hint_t *, lock_handle *);
40838 +void destroy_inode_cryptcompress(struct inode * inode);
40839 +int crc_inode_ok(struct inode * inode);
40840 +
40841 +static inline struct crypto_tfm *
40842 +inode_get_tfm (struct inode * inode, reiser4_tfm tfm)
40843 +{
40844 +       return cryptcompress_inode_data(inode)->tfm[tfm]; /* may be NULL, the "none" algorithms store NULL here */
40845 +}
40846 +
40847 +static inline struct crypto_tfm *
40848 +inode_get_crypto (struct inode * inode)
40849 +{
40850 +       return (inode_get_tfm(inode, CRYPTO_TFM)); /* crypto transform of the inode */
40851 +}
40852 +
40853 +static inline struct crypto_tfm *
40854 +inode_get_digest (struct inode * inode)
40855 +{
40856 +       return (inode_get_tfm(inode, DIGEST_TFM)); /* digest transform of the inode */
40857 +}
40858 +
40859 +static inline unsigned int /* block size of the crypto transform of @inode; the transform must be set */
40860 +crypto_blocksize(struct inode * inode)
40861 +{
40862 +       assert("edward-758", inode_get_crypto(inode) != NULL);
40863 +       return crypto_tfm_alg_blocksize(inode_get_crypto(inode));
40864 +}
40865 +
40866 +#define REGISTER_NONE_ALG(ALG, TFM)                                  \
40867 +static int alloc_none_ ## ALG (struct inode * inode)                 \
40868 +{                                                                    \
40869 +        cryptcompress_info_t * info;                                 \
40870 +        assert("edward-760", inode != NULL);                         \
40871 +                                                                    \
40872 +       info = cryptcompress_inode_data(inode);                      \
40873 +                                                                     \
40874 +                                                                     \
40875 +       cryptcompress_inode_data(inode)->tfm[TFM ## _TFM] = NULL;    \
40876 +       return 0;                                                    \
40877 +                                                                     \
40878 +}                                                                    \
40879 +static void free_none_ ## ALG (struct inode * inode)                 \
40880 +{                                                                    \
40881 +        cryptcompress_info_t * info;                                 \
40882 +        assert("edward-761", inode != NULL);                         \
40883 +                                                                    \
40884 +       info = cryptcompress_inode_data(inode);                      \
40885 +                                                                    \
40886 +       assert("edward-762", info != NULL);                          \
40887 +                                                                    \
40888 +       info->tfm[TFM ## _TFM] = NULL;                               \
40889 +}
40890 +
40891 +#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */
40892 +
40893 +/* Make Linus happy.
40894 +   Local variables:
40895 +   c-indentation-style: "K&R"
40896 +   mode-name: "LC"
40897 +   c-basic-offset: 8
40898 +   tab-width: 8
40899 +   fill-column: 120
40900 +   scroll-step: 1
40901 +   End:
40902 +*/
40903 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/digest.c linux-2.6.8-rc3-a/fs/reiser4/plugin/digest.c
40904 --- linux-2.6.8-rc3/fs/reiser4/plugin/digest.c  1970-01-01 03:00:00.000000000 +0300
40905 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/digest.c        2004-08-05 21:20:53.165643136 +0400
40906 @@ -0,0 +1,32 @@
40907 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
40908 +
40909 +/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */
40910 +
40911 +#include "../debug.h"
40912 +#include "plugin_header.h"
40913 +#include "plugin.h"
40914 +#include "cryptcompress.h"
40915 +
40916 +#include <linux/types.h>
40917 +
40918 +#define NONE_DIGEST_SIZE 0
40919 +
40920 +REGISTER_NONE_ALG(digest, DIGEST)
40921 +
40922 +/* digest plugins, indexed by digest plugin id; only the trivial "none" digest is defined here */
40923 +digest_plugin digest_plugins[LAST_DIGEST_ID] = {
40924 +       [NONE_DIGEST_ID] = {
40925 +               .h = {
40926 +                       .type_id = REISER4_DIGEST_PLUGIN_TYPE,
40927 +                       .id = NONE_DIGEST_ID,
40928 +                       .pops = NULL,
40929 +                       .label = "none",
40930 +                       .desc = "trivial digest",
40931 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
40932 +               },
40933 +               .dsize = NONE_DIGEST_SIZE,
40934 +               .alloc = alloc_none_digest, /* generated by REGISTER_NONE_ALG(digest, DIGEST) above */
40935 +               .free = free_none_digest,
40936 +       }
40937 +};
40938 +
40939 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/dir/dir.c linux-2.6.8-rc3-a/fs/reiser4/plugin/dir/dir.c
40940 --- linux-2.6.8-rc3/fs/reiser4/plugin/dir/dir.c 1970-01-01 03:00:00.000000000 +0300
40941 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/dir/dir.c       2004-08-05 21:20:52.905697965 +0400
40942 @@ -0,0 +1,1919 @@
40943 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
40944 + * reiser4/README */
40945 +
40946 +/* Methods of directory plugin. */
40947 +
40948 +#include "../../forward.h"
40949 +#include "../../debug.h"
40950 +#include "../../spin_macros.h"
40951 +#include "../plugin_header.h"
40952 +#include "../../key.h"
40953 +#include "../../kassign.h"
40954 +#include "../../coord.h"
40955 +#include "../../type_safe_list.h"
40956 +#include "../plugin.h"
40957 +#include "dir.h"
40958 +#include "../item/item.h"
40959 +#include "../security/perm.h"
40960 +#include "../../jnode.h"
40961 +#include "../../znode.h"
40962 +#include "../../tap.h"
40963 +#include "../../vfs_ops.h"
40964 +#include "../../inode.h"
40965 +#include "../../super.h"
40966 +#include "../../safe_link.h"
40967 +#include "../object.h"
40968 +
40969 +#include "hashed_dir.h"
40970 +#include "pseudo_dir.h"
40971 +
40972 +#include <linux/types.h>       /* for __u??  */
40973 +#include <linux/fs.h>          /* for struct file  */
40974 +#include <linux/quotaops.h>
40975 +#include <linux/dcache.h>      /* for struct dentry */
40976 +
40977 +/* helper function. Standards require that many file-system operations, on success, update ctime and mtime of the
40978 +   parent directory: stamp @dir with the current time and return the result of reiser4_mark_inode_dirty(). */
40979 +reiser4_internal int
40980 +reiser4_update_dir(struct inode *dir)
40981 +{
40982 +       assert("nikita-2525", dir != NULL);
40983 +
40984 +       dir->i_ctime = dir->i_mtime = CURRENT_TIME;
40985 +       return reiser4_mark_inode_dirty(dir);
40986 +}
40987 +
40988 +/* estimate disk space necessary to add a link from @parent to @object: the sum of the per-step estimates below. */
40989 +static reiser4_block_nr common_estimate_link(
40990 +       struct inode *parent /* parent directory */,
40991 +       struct inode *object /* object to which new link is being created */)
40992 +{
40993 +       reiser4_block_nr res = 0;
40994 +       file_plugin *fplug;
40995 +       dir_plugin *dplug;
40996 +
40997 +       assert("vpf-317", object != NULL);
40998 +       assert("vpf-318", parent != NULL );
40999 +
41000 +       fplug = inode_file_plugin(object);
41001 +       dplug = inode_dir_plugin(parent);
41002 +
41003 +       /* reiser4_add_nlink(object) */
41004 +       res += fplug->estimate.update(object);
41005 +       /* add_entry(parent) */
41006 +       res += dplug->estimate.add_entry(parent);
41007 +       /* reiser4_del_nlink(object) */
41008 +       res += fplug->estimate.update(object);
41009 +       /* update_dir(parent) */
41010 +       res += inode_file_plugin(parent)->estimate.update(parent);
41011 +       /* safe-link */
41012 +       res += estimate_one_item_removal(tree_by_inode(object));
41013 +
41014 +       return res;
41015 +}
41016 +
41017 +/* add link from @parent directory to @existing object.
41018 +
41019 +       . check for race with creation, ADG restriction and permissions
41020 +       . estimate and grab disk space
41021 +       . remove safe-link if "existing" is currently not linked
41022 +       . add link to "existing"
41023 +       . add entry to "parent"
41024 +       . if last step fails, remove link from "existing"
41025 +       . on success, take inode reference and update times of "parent"
41026 +
41027 +*/
41028 +static int
41029 +link_common(struct inode *parent /* parent directory */ ,
41030 +           struct dentry *existing     /* dentry of object to which
41031 +                                        * new link is being
41032 +                                        * created */ ,
41033 +           struct dentry *newname /* new name */ )
41034 +{
41035 +       int result;
41036 +       struct inode *object;
41037 +       dir_plugin *parent_dplug;
41038 +       reiser4_dir_entry_desc entry;
41039 +       reiser4_object_create_data data;
41040 +       reiser4_block_nr reserve;
41041 +
41042 +       assert("nikita-1431", existing != NULL);
41043 +       assert("nikita-1432", parent != NULL);
41044 +       assert("nikita-1433", newname != NULL);
41045 +
41046 +       object = existing->d_inode;
41047 +       assert("nikita-1434", object != NULL);
41048 +
41049 +       /* check for race with create_object() */
41050 +       if (inode_get_flag(object, REISER4_IMMUTABLE))
41051 +               return RETERR(-E_REPEAT);
41052 +
41053 +       /* links to directories are not allowed if file-system
41054 +          logical name-space should be ADG */
41055 +       if (S_ISDIR(object->i_mode) && reiser4_is_set(parent->i_sb, REISER4_ADG))
41056 +               return RETERR(-EISDIR);
41057 +
41058 +       /* check permissions */
41059 +       result = perm_chk(parent, link, existing, parent, newname);
41060 +       if (result != 0)
41061 +               return result;
41062 +
41063 +       parent_dplug = inode_dir_plugin(parent);
41064 +
41065 +       xmemset(&entry, 0, sizeof entry);
41066 +       entry.obj = object;
41067 +
41068 +       data.mode = object->i_mode;
41069 +       data.id = inode_file_plugin(object)->h.id;
41070 +
41071 +       reserve = common_estimate_link(parent, existing->d_inode);
41072 +       if ((__s64)reserve < 0)
41073 +           return reserve;
41074 +
41075 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
41076 +           return RETERR(-ENOSPC);
41077 +
41078 +       /*
41079 +        * Subtle race handling: sys_link() doesn't take i_sem on @parent. It
41080 +        * means that link(2) can race against unlink(2) or rename(2), and
41081 +        * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
41082 +        *
41083 +        * For such inode we have to undo special processing done in
41084 +        * reiser4_unlink() viz. creation of safe-link.
41085 +        */
41086 +       if (unlikely(inode_file_plugin(object)->not_linked(object))) {
41087 +               result = safe_link_del(object, SAFE_UNLINK);
41088 +               if (result != 0)
41089 +                       return result;
41090 +       }
41091 +
41092 +       result = reiser4_add_nlink(object, parent, 1);
41093 +       if (result == 0) {
41094 +               /* add entry to the parent */
41095 +               result = parent_dplug->add_entry(parent, newname, &data, &entry);
41096 +               if (result != 0) {
41097 +                       /* failure to add entry to the parent, remove
41098 +                          link from "existing" */
41099 +                       reiser4_del_nlink(object, parent, 1);
41100 +                       /* now, if this fails, we have a file with too
41101 +                          big nlink---space leak, much better than
41102 +                          directory entry pointing to nowhere */
41103 +                       /* may be it should be recorded somewhere, but
41104 +                          if addition of link to parent and update of
41105 +                          object's stat data both failed, chances are
41106 +                          that something is going really wrong */
41107 +               }
41108 +       }
41109 +       if (result == 0) {
41110 +               atomic_inc(&object->i_count);
41111 +               /* Upon successful completion, link() shall mark for update
41112 +                  the st_ctime field of the file. Also, the st_ctime and
41113 +                  st_mtime fields of the directory that contains the new
41114 +                  entry shall be marked for update. --SUS
41115 +               */
41116 +               result = reiser4_update_dir(parent);
41117 +       }
41118 +       return result;
41119 +}
41120 +
41121 +/* Sum of block estimates for every step of removing the link between
41122 + * @parent and @object. */
41123 +static reiser4_block_nr common_estimate_unlink (
41124 +       struct inode *parent /* parent directory */,
41125 +       struct inode *object /* object whose link is being removed */)
41126 +{
41127 +       reiser4_block_nr total;
41128 +       file_plugin *obj_fplug;
41129 +       dir_plugin *par_dplug;
41130 +
41131 +       assert("vpf-317", object != NULL);
41132 +       assert("vpf-318", parent != NULL );
41133 +
41134 +       obj_fplug = inode_file_plugin(object);
41135 +       par_dplug = inode_dir_plugin(parent);
41136 +
41137 +       /* directory entry removal: rem_entry(parent) */
41138 +       total = par_dplug->estimate.rem_entry(parent);
41139 +       /* stat-data update for reiser4_del_nlink(object) */
41140 +       total += obj_fplug->estimate.update(object);
41141 +       /* reiser4_update_dir(parent) */
41142 +       total += inode_file_plugin(parent)->estimate.update(parent);
41143 +       /* file plugin's own unlink estimate */
41144 +       total += obj_fplug->estimate.unlink(object, parent);
41145 +       /* insertion of one item, for the safe-link */
41146 +       total += estimate_one_insert_item(tree_by_inode(object));
41147 +
41148 +       return total;
41149 +}
41150 +
41151 +/* check that @victim can be unlinked from @parent and grab space for it. */
41152 +static int
41153 +unlink_check_and_grab(struct inode *parent, struct dentry *victim)
41154 +{
41155 +       file_plugin     *fplug;
41156 +       struct inode    *child;
41157 +       int              result;
41158 +       reiser4_block_nr reserve;
41159 +
41160 +       child = victim->d_inode;
41161 +       fplug = inode_file_plugin(child);
41162 +
41163 +       /* check for race with create_object() */
41164 +       if (inode_get_flag(child, REISER4_IMMUTABLE))
41165 +               return RETERR(-E_REPEAT);
41166 +       /* object being deleted should have stat data */
41167 +       assert("vs-949", !inode_get_flag(child, REISER4_NO_SD));
41168 +
41169 +       /* check permissions */
41170 +       result = perm_chk(parent, unlink, parent, victim);
41171 +       if (result != 0)
41172 +               return result;
41173 +
41174 +       /* ask object plugin */
41175 +       if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
41176 +               return RETERR(-ENOTEMPTY);
41177 +
41178 +       /* keep the estimate at full width (no int truncation), as link_common() does */
41179 +       reserve = common_estimate_unlink(parent, child);
41180 +       if ((__s64)reserve < 0)
41181 +               return (int)reserve;
41182 +       return reiser4_grab_reserved(child->i_sb, reserve, BA_CAN_COMMIT);
41183 +}
41184 +
41185 +/* remove link from @parent directory to @victim object.
41186 +
41187 +       . check permissions and grab space (unlink_check_and_grab())
41188 +       . detach @victim from @parent (->detach() of file plugin)
41189 +       . remove entry from @parent
41190 +       . decrement nlink on @victim, update times of @parent
41191 +       . if object is not linked anymore, add safe-link for it
41192 +*/
41193 +static int
41194 +unlink_common(struct inode *parent /* parent object */ ,
41195 +             struct dentry *victim /* name being removed from @parent */)
41196 +{
41197 +       int           result;
41198 +       struct inode *object;
41199 +       file_plugin  *fplug;
41200 +
41201 +       object = victim->d_inode;
41202 +       fplug  = inode_file_plugin(object);
41203 +       assert("nikita-2882", fplug->detach != NULL);
41204 +
41205 +       result = unlink_check_and_grab(parent, victim);
41206 +       if (result != 0)
41207 +               return result;
41208 +
41209 +       result = fplug->detach(object, parent);
41210 +       if (result == 0) {
41211 +               dir_plugin            *parent_dplug;
41212 +               reiser4_dir_entry_desc entry;
41213 +
41214 +               parent_dplug = inode_dir_plugin(parent);
41215 +               xmemset(&entry, 0, sizeof entry);
41216 +
41217 +               /* first, delete directory entry */
41218 +               result = parent_dplug->rem_entry(parent, victim, &entry);
41219 +               if (result == 0) {
41220 +                       /*
41221 +                        * if name was removed successfully, we _have_ to
41222 +                        * return 0 from this function, because upper level
41223 +                        * caller (vfs_{rmdir,unlink}) expect this. Hence
41224 +                        * return values of the calls below are ignored.
41225 +                        */
41226 +                       /* now that directory entry is removed, update
41227 +                        * stat-data */
41228 +                       reiser4_del_nlink(object, parent, 1);
41229 +                       /* Upon successful completion, unlink() shall mark for
41230 +                          update the st_ctime and st_mtime fields of the
41231 +                          parent directory. Also, if the file's link count is
41232 +                          not 0, the st_ctime field of the file shall be
41233 +                          marked for update. --SUS */
41234 +                       reiser4_update_dir(parent);
41235 +                       /* add safe-link for this file */
41236 +                       if (fplug->not_linked(object))
41237 +                               safe_link_add(object, SAFE_UNLINK);
41238 +               }
41239 +       }
41240 +
41241 +       if (unlikely(result != 0)) {
41242 +               if (result != -ENOMEM)
41243 +                       warning("nikita-3398", "Cannot unlink %llu (%i)",
41244 +                               get_inode_oid(object), result);
41245 +               /* if operation failed commit pending inode modifications to
41246 +                * the stat-data */
41247 +               reiser4_update_sd(object);
41248 +               reiser4_update_sd(parent);
41249 +       }
41250 +
41251 +       reiser4_release_reserved(object->i_sb);
41252 +
41253 +       /* @object's i_ctime was updated by ->rem_link() method. */
41254 +
41255 +       return result;
41256 +}
41256 +
41257 +/* Estimate the maximum number of nodes allocated or changed in order to:
41258 +   - create the child object
41259 +   - update the SD of parent
41260 +   - insert an entry into the parent (and remove it again on failure)
41261 +*/
41262 +static reiser4_block_nr common_estimate_create_child(
41263 +       struct inode *parent, /* parent object */
41264 +       struct inode *object /* object */)
41265 +{
41266 +       reiser4_block_nr total;
41267 +
41268 +       assert("vpf-309", parent != NULL);
41269 +       assert("vpf-307", object != NULL);
41270 +
41271 +       /* object creation, then stat data of parent directory */
41272 +       total = inode_file_plugin(object)->estimate.create(object);
41273 +       total += inode_file_plugin(parent)->estimate.update(parent);
41274 +       /* adding entry, and removing it to undo in the case of failure */
41275 +       total += inode_dir_plugin(parent)->estimate.add_entry(parent);
41276 +       total += inode_dir_plugin(parent)->estimate.rem_entry(parent);
41277 +       return total;
41278 +}
41279 +
41280 +/* Create child in directory.
41281 +
41282 +   . get object's plugin
41283 +   . get fresh inode
41284 +   . initialize inode
41285 +   . add object's stat-data
41286 +   . initialize object's directory
41287 +   . add entry to the parent
41288 +   . update nlink of the object and times of the parent
41289 +
41290 +*/
41291 +/* ->create_child method of directory plugin. New inode is stored in @retobj
41292 +   as soon as it is allocated, so it is set on later failures too. */
41293 +static int
41294 +create_child_common(reiser4_object_create_data * data  /* parameters
41295 +                                                        * of new
41296 +                                                        * object */,
41297 +                   struct inode ** retobj)
41298 +{
41299 +       int result;
41300 +
41301 +       struct dentry *dentry;  /* new name */
41302 +       struct inode *parent;   /* parent object */
41303 +
41304 +       dir_plugin *par_dir;    /* directory plugin on the parent */
41305 +       dir_plugin *obj_dir;    /* directory plugin on the new object */
41306 +       file_plugin *obj_plug;  /* object plugin on the new object */
41307 +       struct inode *object;   /* new object */
41308 +       reiser4_block_nr reserve;
41309 +
41310 +       reiser4_dir_entry_desc entry;   /* new directory entry */
41311 +
41312 +       assert("nikita-1420", data != NULL);
41313 +       parent = data->parent;
41314 +       dentry = data->dentry;
41315 +
41316 +       assert("nikita-1418", parent != NULL);
41317 +       assert("nikita-1419", dentry != NULL);
41318 +       par_dir = inode_dir_plugin(parent);
41319 +       /* check permissions */
41320 +       result = perm_chk(parent, create, parent, dentry, data);
41321 +       if (result != 0)
41322 +               return result;
41323 +
41324 +       /* check, that name is acceptable for parent */
41325 +       if (par_dir->is_name_acceptable &&
41326 +           !par_dir->is_name_acceptable(parent,
41327 +                                        dentry->d_name.name,
41328 +                                        (int) dentry->d_name.len))
41329 +               return RETERR(-ENAMETOOLONG);
41330 +
41331 +       result = 0;
41332 +       obj_plug = file_plugin_by_id((int) data->id);
41333 +       if (obj_plug == NULL) {
41334 +               warning("nikita-430", "Cannot find plugin %i", data->id);
41335 +               return RETERR(-ENOENT);
41336 +       }
41337 +       object = new_inode(parent->i_sb);
41338 +       if (object == NULL)
41339 +               return RETERR(-ENOMEM);
41340 +       /* we'll update i_nlink below */
41341 +       object->i_nlink = 0;
41342 +       /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0,
41343 +        * to simplify error handling: if some error occurs before i_ino is
41344 +        * initialized with oid, i_ino should already be set to some
41345 +        * distinguished value. */
41346 +       object->i_ino = 0;
41347 +
41348 +       /* So that on error iput will be called. */
41349 +       *retobj = object;
41350 +
41351 +       if (DQUOT_ALLOC_INODE(object)) {
41352 +               DQUOT_DROP(object);
41353 +               object->i_flags |= S_NOQUOTA;
41354 +               return RETERR(-EDQUOT);
41355 +       }
41356 +
41357 +       xmemset(&entry, 0, sizeof entry);
41358 +       entry.obj = object;
41359 +
41360 +       plugin_set_file(&reiser4_inode_data(object)->pset, obj_plug);
41361 +       result = obj_plug->set_plug_in_inode(object, parent, data);
41362 +       if (result) {
41363 +               warning("nikita-431", "Cannot install plugin %i on %llx",
41364 +                       data->id, get_inode_oid(object));
41365 +               DQUOT_FREE_INODE(object);
41366 +               object->i_flags |= S_NOQUOTA;
41367 +               return result;
41368 +       }
41369 +
41370 +       /* reget plugin after installation */
41371 +       obj_plug = inode_file_plugin(object);
41372 +
41373 +       if (obj_plug->create == NULL) {
41374 +               DQUOT_FREE_INODE(object);
41375 +               object->i_flags |= S_NOQUOTA;
41376 +               return RETERR(-EPERM);
41377 +       }
41378 +
41379 +       /* if any of hash, tail, sd or permission plugins for newly created
41380 +          object are not set yet set them here inheriting them from parent
41381 +          directory
41382 +       */
41383 +       assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
41384 +       result = obj_plug->adjust_to_parent(object,
41385 +                                           parent,
41386 +                                           object->i_sb->s_root->d_inode);
41387 +       if (result != 0) {
41388 +               warning("nikita-432", "Cannot inherit from %llx to %llx",
41389 +                       get_inode_oid(parent), get_inode_oid(object));
41390 +               DQUOT_FREE_INODE(object);
41391 +               object->i_flags |= S_NOQUOTA;
41392 +               return result;
41393 +       }
41394 +
41395 +       /* call file plugin's method to initialize plugin specific part of
41396 +        * inode */
41397 +       if (obj_plug->init_inode_data)
41398 +               obj_plug->init_inode_data(object, data, 1/*create*/);
41399 +
41400 +       /* obtain directory plugin (if any) for new object. */
41401 +       obj_dir = inode_dir_plugin(object);
41402 +       if (obj_dir != NULL && obj_dir->init == NULL) {
41403 +               DQUOT_FREE_INODE(object);
41404 +               object->i_flags |= S_NOQUOTA;
41405 +               return RETERR(-EPERM);
41406 +       }
41407 +
41408 +       reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
41409 +
41410 +       reserve = common_estimate_create_child(parent, object);
41411 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
41412 +               DQUOT_FREE_INODE(object);
41413 +               object->i_flags |= S_NOQUOTA;
41414 +               return RETERR(-ENOSPC);
41415 +       }
41416 +
41417 +       /* mark inode `immutable'. We disable changes to the file being
41418 +          created until valid directory entry for it is inserted. Otherwise,
41419 +          if file were expanded and insertion of directory entry fails, we
41420 +          have to remove file, but we only allotted enough space in
41421 +          transaction to remove _empty_ file. 3.x code used to remove stat
41422 +          data in different transaction thus possibly leaking disk space on
41423 +          crash. This all only matters if it's possible to access file
41424 +          without name, for example, by inode number
41425 +       */
41426 +       inode_set_flag(object, REISER4_IMMUTABLE);
41427 +
41428 +       /* create empty object, this includes allocation of new objectid. For
41429 +          directories this implies creation of dot and dotdot  */
41430 +       assert("nikita-2265", inode_get_flag(object, REISER4_NO_SD));
41431 +
41432 +       /* mark inode as `loaded'. From this point onward
41433 +          reiser4_delete_inode() will try to remove its stat-data. */
41434 +       inode_set_flag(object, REISER4_LOADED);
41435 +
41436 +       result = obj_plug->create(object, parent, data);
41437 +       if (result != 0) {
41438 +               inode_clr_flag(object, REISER4_IMMUTABLE);
41439 +               if (result != -ENAMETOOLONG && result != -ENOMEM)
41440 +                       warning("nikita-2219",
41441 +                               "Failed to create sd for %llu",
41442 +                               get_inode_oid(object));
41443 +               DQUOT_FREE_INODE(object);
41444 +               object->i_flags |= S_NOQUOTA;
41445 +               return result;
41446 +       }
41447 +
41448 +       if (obj_dir != NULL)
41449 +               result = obj_dir->init(object, parent, data);
41450 +       if (result == 0) {
41451 +               assert("nikita-434", !inode_get_flag(object, REISER4_NO_SD));
41452 +               /* insert inode into VFS hash table */
41453 +               insert_inode_hash(object);
41454 +               /* create entry */
41455 +               result = par_dir->add_entry(parent, dentry, data, &entry);
41456 +               if (result == 0) {
41457 +                       result = reiser4_add_nlink(object, parent, 0);
41458 +                       /* If O_CREAT is set and the file did not previously
41459 +                          exist, upon successful completion, open() shall
41460 +                          mark for update the st_atime, st_ctime, and
41461 +                          st_mtime fields of the file and the st_ctime and
41462 +                          st_mtime fields of the parent directory. --SUS
41463 +                       */
41464 +                       /* @object times are already updated by
41465 +                          reiser4_add_nlink() */
41466 +                       if (result == 0)
41467 +                               reiser4_update_dir(parent);
41468 +                       if (result != 0)
41469 +                               /* cleanup failure to add nlink */
41470 +                               par_dir->rem_entry(parent, dentry, &entry);
41471 +               }
41472 +               if (result != 0)
41473 +                       /* cleanup failure to add entry */
41474 +                       obj_plug->detach(object, parent);
41475 +       } else if (result != -ENOMEM)
41476 +               warning("nikita-2219", "Failed to initialize dir for %llu: %i",
41477 +                       get_inode_oid(object), result);
41478 +
41479 +       /*
41480 +        * update stat-data, committing all pending modifications to the inode
41481 +        * fields.
41482 +        */
41483 +       reiser4_update_sd(object);
41484 +       if (result != 0) {
41485 +               DQUOT_FREE_INODE(object);
41486 +               object->i_flags |= S_NOQUOTA;
41487 +               /* if everything was ok (result == 0), parent stat-data is
41488 +                * already updated above (reiser4_update_dir()) */
41489 +               reiser4_update_sd(parent);
41490 +               /* failure to create entry, remove object */
41491 +               obj_plug->delete(object);
41492 +       }
41493 +
41494 +       /* file has name now (or creation failed), clear immutable flag */
41495 +       inode_clr_flag(object, REISER4_IMMUTABLE);
41496 +
41497 +       /* on error, iput() will call ->delete_inode(). We should keep track
41498 +          of the existence of stat-data for this inode and avoid attempt to
41499 +          remove it in reiser4_delete_inode(). This is accomplished through
41500 +          REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
41501 +       */
41502 +       return result;
41503 +}
41503 +
41504 +/* ->is_name_acceptable() method of directory plugin: non-zero iff @len does
41505 +   not exceed reiser4_max_filename_len(@inode). Audited by: green(2002.06.15) */
41506 +reiser4_internal int
41507 +is_name_acceptable(const struct inode *inode /* directory to check */ ,
41508 +                  const char *name UNUSED_ARG /* name to check */ ,
41509 +                  int len /* @name's length */ )
41510 +{
41511 +       assert("nikita-733", inode != NULL);
41512 +       assert("nikita-734", name != NULL);
41513 +       assert("nikita-735", len > 0);
41514 +
41515 +       return reiser4_max_filename_len(inode) >= len;
41516 +}
41517 +
41518 +/* return 1 iff @coord points to a directory entry item that the file plugin
41519 + * of @inode reports as owned by @inode; return 0 otherwise. */
41520 +static int
41521 +is_valid_dir_coord(struct inode * inode, coord_t * coord)
41522 +{
41523 +       if (item_type_by_coord(coord) != DIR_ENTRY_ITEM_TYPE)
41524 +               return 0;
41525 +       return inode_file_plugin(inode)->owns_item(inode, coord) != 0;
41526 +}
41527 +
41528 +/* returns 0 if directory is empty (only dot and dotdot), -ENOTEMPTY otherwise */
41529 +reiser4_internal int
41530 +is_dir_empty(const struct inode *dir)
41531 +{
41532 +       assert("nikita-1976", dir != NULL);
41533 +
41534 +       /* rely on directory i_size being kept equal to the number of entries */
41535 +       if (dir->i_size > 2)
41536 +               return RETERR(-ENOTEMPTY);
41537 +       return 0;
41538 +}
41538 +
41539 +/* order two logical positions of one directory: by entry key, then by ->pos */
41540 +reiser4_internal cmp_t dir_pos_cmp(const dir_pos * p1, const dir_pos * p2)
41541 +{
41542 +       cmp_t order;
41543 +       int delta;
41544 +
41545 +       assert("nikita-2534", p1 != NULL);
41546 +       assert("nikita-2535", p2 != NULL);
41547 +
41548 +       order = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
41549 +       if (order != EQUAL_TO)
41550 +               return order;
41551 +       delta = p1->pos - p2->pos;
41552 +       if (delta < 0)
41553 +               return LESS_THAN;
41554 +       return delta ? GREATER_THAN : EQUAL_TO;
41555 +}
41556 +
41557 +/* true, if file descriptor @f was created by NFS server on demand to serve
41558 + * one file system operation. This means that there may be "detached state"
41559 + * for underlying inode. Reads ->stateless of the fsdata of @f's dentry. */
41560 +static inline int
41561 +file_is_stateless(struct file *f)
41562 +{
41563 +       return reiser4_get_dentry_fsdata(f->f_dentry)->stateless;
41564 +}
41565 +
41566 +#define CID_SHIFT (20)
41567 +#define CID_MASK  (0xfffffull)
41568 +
41569 +/* derive ->fpos from the user-supplied cookie. Usually this is dir->f_pos as
41570 + * is; for a stateless directory operation (readdir-over-nfs) the client id
41571 + * kept in the high bits of the cookie is masked off with CID_MASK. */
41572 +static loff_t
41573 +get_dir_fpos(struct file * dir)
41574 +{
41575 +       loff_t fpos = dir->f_pos;
41576 +       if (file_is_stateless(dir))
41577 +               fpos &= CID_MASK;
41578 +       return fpos;
41579 +}
41580 +
41581 +/* update @readdir_spot after an entry was added/removed at @mod_point. See
41582 + * comment before readdir_common() for why "adjustment" is necessary. */
41583 +static void
41584 +adjust_dir_pos(struct file   * dir,
41585 +              readdir_pos   * readdir_spot,
41586 +              const dir_pos * mod_point,
41587 +              int             adj)
41588 +{
41589 +       dir_pos *pos;
41590 +
41591 +       /*
41592 +        * new directory entry was added (adj == +1) or removed (adj == -1) at
41593 +        * the @mod_point. Directory file descriptor @dir is doing readdir and
41594 +        * is currently positioned at @readdir_spot. Latter has to be updated
41595 +        * to maintain stable readdir.
41596 +        */
41597 +
41598 +       ON_TRACE(TRACE_DIR, "adjust: %s/%i",
41599 +                dir ? (char *)dir->f_dentry->d_name.name : "(anon)", adj);
41600 +       ON_TRACE(TRACE_DIR, "\nf_pos: %llu, spot.fpos: %llu entry_no: %llu\n",
41601 +                dir ? dir->f_pos : 0, readdir_spot->fpos,
41602 +                readdir_spot->entry_no);
41603 +
41604 +       reiser4_stat_inc(dir.readdir.adjust_pos);
41605 +
41606 +       /* directory is positioned to the beginning. */
41607 +       if (readdir_spot->entry_no == 0)
41608 +               return;
41609 +
41610 +       pos = &readdir_spot->position;
41611 +       switch (dir_pos_cmp(mod_point, pos)) {
41612 +       case LESS_THAN:
41613 +               /* @mod_point is _before_ @readdir_spot, that is, entry was
41614 +                * added/removed on the left (in key order) of current
41615 +                * position. */
41616 +               /* logical number of directory entry readdir is "looking" at
41617 +                * changes */
41618 +               readdir_spot->entry_no += adj;
41619 +               assert("nikita-2577",
41620 +                      ergo(dir != NULL, get_dir_fpos(dir) + adj >= 0));
41621 +               if (de_id_cmp(&pos->dir_entry_key,
41622 +                             &mod_point->dir_entry_key) == EQUAL_TO) {
41623 +                       assert("nikita-2575", mod_point->pos < pos->pos);
41624 +                       /*
41625 +                        * if entry added/removed has the same key as current
41626 +                        * for readdir, update counter of duplicate keys in
41627 +                        * @readdir_spot.
41628 +                        */
41629 +                       pos->pos += adj;
41630 +               }
41631 +               reiser4_stat_inc(dir.readdir.adjust_lt);
41632 +               break;
41633 +       case GREATER_THAN:
41634 +               /* directory is modified after @pos: nothing to do. */
41635 +               reiser4_stat_inc(dir.readdir.adjust_gt);
41636 +               break;
41637 +       case EQUAL_TO:
41638 +               /* cannot insert an entry readdir is looking at, because it
41639 +                  already exists. */
41640 +               assert("nikita-2576", adj < 0);
41641 +               /* directory entry to which @pos points to is being
41642 +                  removed.
41643 +
41644 +                  NOTE-NIKITA: Right thing to do is to update @pos to point
41645 +                  to the next entry. This is complex (we are under spin-lock
41646 +                  for one thing). Just rewind it to the beginning. Next
41647 +                  readdir will have to scan the beginning of
41648 +                  directory. Proper solution is to use semaphore in
41649 +                  spin lock's stead and use rewind_right() here.
41650 +
41651 +                  NOTE-NIKITA: now, semaphore is used, so...
41652 +               */
41653 +               xmemset(readdir_spot, 0, sizeof *readdir_spot);
41654 +               reiser4_stat_inc(dir.readdir.adjust_eq);
41655 +       }
41656 +}
41657 +
41658 +/* scan all file-descriptors for directory @dir and adjust their readdir
41659 +   positions for entry @de added/removed (@adj) at duplicate offset @offset. */
41660 +reiser4_internal void
41661 +adjust_dir_file(struct inode *dir, const struct dentry * de, int offset, int adj)
41662 +{
41663 +       reiser4_file_fsdata *scan;
41664 +       dir_pos mod_point;
41665 +
41666 +       assert("nikita-2536", dir != NULL);
41667 +       assert("nikita-2538", de  != NULL);
41668 +       assert("nikita-2539", adj != 0);
41669 +
41670 +       build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
41671 +       mod_point.pos = offset;
41672 +
41673 +       spin_lock_inode(dir);
41674 +
41675 +       /*
41676 +        * new entry was added/removed in directory @dir. Scan all file
41677 +        * descriptors for @dir that are currently involved into @readdir and
41678 +        * update them. The list is walked between spin_lock_inode() and
41679 +        * spin_unlock_inode() on @dir.
41680 +        */
41681 +       for_all_type_safe_list(readdir, get_readdir_list(dir), scan)
41682 +               adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
41683 +
41684 +       spin_unlock_inode(dir);
41685 +}
41686 +
41687 +/*
41688 + * position @tap to start/continue readdir of @dir from position @pos: look
41689 + * up the readdir key built by the directory plugin, then rewind right by
41690 + * pos->position.pos. Lookup that does not find the key yields -EIO.
41691 + */
41692 +static int
41693 +dir_go_to(struct file *dir, readdir_pos * pos, tap_t * tap)
41694 +{
41695 +       struct inode *host;
41696 +       reiser4_key start_key;
41697 +       int ret;
41698 +
41699 +       assert("nikita-2554", pos != NULL);
41700 +
41701 +       host = dir->f_dentry->d_inode;
41702 +       ret = inode_dir_plugin(host)->build_readdir_key(dir, &start_key);
41703 +       if (ret != 0)
41704 +               return ret;
41705 +
41706 +       /* exact-match lookup confined to the leaf level */
41707 +       ret = object_lookup(host, &start_key,
41708 +                           tap->coord, tap->lh, tap->mode,
41709 +                           FIND_EXACT,
41710 +                           LEAF_LEVEL, LEAF_LEVEL,
41711 +                           0, &tap->ra_info);
41712 +       if (ret != CBK_COORD_FOUND) {
41713 +               /* key was not found: invalidate the tap and release lock */
41714 +               tap->coord->node = NULL;
41715 +               done_lh(tap->lh);
41716 +               return RETERR(-EIO);
41717 +       }
41718 +
41719 +       /* move to the right entry within the run of identical keys */
41720 +       return rewind_right(tap, (int) pos->position.pos);
41721 +}
41722 +
41723 +/*
41724 + * handling of non-unique keys: calculate at what ordinal position within
41725 + * sequence of directory items with identical keys @pos is. Works on a copy of
41726 + * @tap, counting units to the left with the same key in pos->position.pos. */
41727 +static int
41728 +set_pos(struct inode * inode, readdir_pos * pos, tap_t * tap)
41729 +{
41730 +       int          result;
41731 +       coord_t      coord;
41732 +       lock_handle  lh;
41733 +       tap_t        scan;
41734 +       de_id       *did;
41735 +       reiser4_key  de_key;
41736 +
41737 +       coord_init_zero(&coord);
41738 +       init_lh(&lh);
41739 +       tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
41740 +       tap_copy(&scan, tap);
41741 +       tap_load(&scan); /* NOTE(review): return value is not checked */
41742 +       pos->position.pos = 0;
41743 +
41744 +       did = &pos->position.dir_entry_key;
41745 +
41746 +       if (is_valid_dir_coord(inode, scan.coord)) {
41747 +
41748 +               build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
41749 +
41750 +               while (1) {
41751 +
41752 +                       /* non-zero result of go_prev_unit() ends the scan
41753 +                          and becomes the return value */
41754 +                       result = go_prev_unit(&scan);
41755 +                       if (result != 0)
41756 +                               break;
41757 +
41758 +                       if (!is_valid_dir_coord(inode, scan.coord)) {
41759 +                               result = -EINVAL;
41760 +                               break;
41761 +                       }
41762 +
41763 +                       /* get key of directory entry */
41764 +                       unit_key_by_coord(scan.coord, &de_key);
41765 +                       if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
41766 +                               /* duplicate-sequence is over */
41767 +                               break;
41768 +                       }
41769 +                       pos->position.pos ++;
41770 +               }
41771 +       } else
41772 +               result = RETERR(-ENOENT);
41773 +       tap_relse(&scan);
41774 +       tap_done(&scan);
41775 +       return result;
41776 +}
41774 +}
41775 +
41776 +
41777 +/* "rewind" directory to the file position of @dir (get_dir_fpos()), i.e.,
41778 + * set @pos and @tap correspondingly. Returns 0 on success, -EINVAL for a
41779 + * negative position, -ENOENT when destination >= i_size, or a lookup error. */
41780 +static int
41781 +dir_rewind(struct file *dir, readdir_pos * pos, tap_t * tap)
41782 +{
41783 +       __u64 destination;
41784 +       __s64 shift;
41785 +       int result;
41786 +       struct inode *inode;
41787 +       loff_t dirpos;
41788 +
41789 +       assert("nikita-2553", dir != NULL);
41790 +       assert("nikita-2548", pos != NULL);
41791 +       assert("nikita-2551", tap->coord != NULL);
41792 +       assert("nikita-2552", tap->lh != NULL);
41793 +       /* @shift: signed distance from remembered to requested position */
41794 +       dirpos = get_dir_fpos(dir);
41795 +       shift = dirpos - pos->fpos;
41796 +       /* this is logical directory entry within @dir which we are rewinding
41797 +        * to */
41798 +       destination = pos->entry_no + shift;
41799 +
41800 +       inode = dir->f_dentry->d_inode;
41801 +       if (dirpos < 0)
41802 +               return RETERR(-EINVAL);
41803 +       else if (destination == 0ll || dirpos == 0) {
41804 +               /* rewind to the beginning of directory */
41805 +               xmemset(pos, 0, sizeof *pos);
41806 +               reiser4_stat_inc(dir.readdir.reset);
41807 +               return dir_go_to(dir, pos, tap);
41808 +       } else if (destination >= inode->i_size)
41809 +               return RETERR(-ENOENT);
41810 +
41811 +       if (shift < 0) {
41812 +               /* I am afraid of negative numbers */
41813 +               shift = -shift;
41814 +               /* rewinding to the left */
41815 +               reiser4_stat_inc(dir.readdir.rewind_left);
41816 +               if (shift <= (int) pos->position.pos) {
41817 +                       /* destination is within sequence of entries with
41818 +                          duplicate keys. NOTE(review): @pos is used as is */
41819 +                       reiser4_stat_inc(dir.readdir.left_non_uniq);
41820 +                       result = dir_go_to(dir, pos, tap);
41821 +               } else {
41822 +                       shift -= pos->position.pos;
41823 +                       while (1) {
41824 +                               /* repetitions: deadlock is possible when
41825 +                                  going to the left. */
41826 +                               result = dir_go_to(dir, pos, tap);
41827 +                               if (result == 0) {
41828 +                                       result = rewind_left(tap, shift);
41829 +                                       if (result == -E_DEADLOCK) {
41830 +                                               tap_done(tap);
41831 +                                               reiser4_stat_inc(dir.readdir.left_restart);
41832 +                                               continue;
41833 +                                       }
41834 +                               }
41835 +                               break;
41836 +                       }
41837 +               }
41838 +       } else {
41839 +               /* rewinding to the right */
41840 +               reiser4_stat_inc(dir.readdir.rewind_right);
41841 +               result = dir_go_to(dir, pos, tap);
41842 +               if (result == 0)
41843 +                       result = rewind_right(tap, shift);
41844 +       }
41845 +       if (result == 0) {
41846 +               result = set_pos(inode, pos, tap);
41847 +               if (result == 0) {
41848 +                       /* set_pos() filled pos->position; record the rest */
41849 +                       pos->entry_no = destination;
41850 +                       pos->fpos = dirpos;
41851 +               }
41852 +       }
41853 +       return result;
41854 +}
41855 +
41856 +/*
41857 + * Function that is called by common_readdir() on each directory entry while
41858 + * doing readdir. ->filldir callback may block, so we had to release long term
41859 + * lock while calling it; it is therefore given a private copy of the name.
41860 + * To avoid repeating tree traversal, seal is used. If seal is broken, we
41861 + * return -E_REPEAT. Node is unlocked in this case.
41862 + *
41863 + * Whether node is unlocked in case of any other error is undefined. It is
41864 + * guaranteed to be still locked if success (0) is returned.
41865 + *
41866 + * When ->filldir() wants no more, feed_entry() returns 1, and node is unlocked.
41867 + */
41868 +static int
41869 +feed_entry(struct file *f,
41870 +          readdir_pos * pos, tap_t *tap, filldir_t filldir, void *dirent)
41871 +{
41872 +       item_plugin *iplug;
41873 +       char *name;
41874 +       reiser4_key sd_key;
41875 +       int result;
41876 +       char buf[DE_NAME_BUF_LEN];
41877 +       char name_buf[32];
41878 +       char *local_name;
41879 +       unsigned file_type;
41880 +       seal_t seal;
41881 +       coord_t *coord;
41882 +       reiser4_key entry_key;
41883 +
41884 +       coord = tap->coord;
41885 +       iplug = item_plugin_by_coord(coord);
41886 +
41887 +       /* pointer to name within the node */
41888 +       name = iplug->s.dir.extract_name(coord, buf);
41889 +       assert("nikita-1371", name != NULL);
41890 +
41891 +       /* key of object the entry points to */
41892 +       if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
41893 +               return RETERR(-EIO);
41894 +
41895 +       /* we must release longterm znode lock before calling filldir to avoid
41896 +          deadlock which may happen if filldir causes page fault. So, copy
41897 +          name to intermediate buffer: on-stack one if it fits, else kmalloc */
41898 +       if (strlen(name) + 1 > sizeof(name_buf)) {
41899 +               local_name = kmalloc(strlen(name) + 1, GFP_KERNEL);
41900 +               if (local_name == NULL)
41901 +                       return RETERR(-ENOMEM);
41902 +       } else
41903 +               local_name = name_buf;
41904 +
41905 +       strcpy(local_name, name);
41906 +       file_type = iplug->s.dir.extract_file_type(coord);
41907 +
41908 +       unit_key_by_coord(coord, &entry_key);
41909 +       seal_init(&seal, coord, &entry_key);
41910 +
41911 +       longterm_unlock_znode(tap->lh);
41912 +       /* from here on @name must not be used: only @local_name is ours */
41913 +       ON_TRACE(TRACE_DIR | TRACE_VFS_OPS, "readdir: %s, %llu, %llu, %llu\n",
41914 +                local_name, pos->fpos, pos->entry_no, get_key_objectid(&sd_key));
41915 +
41916 +       /*
41917 +        * send information about directory entry to the ->filldir() filler
41918 +        * supplied to us by caller (VFS).
41919 +        *
41920 +        * ->filldir is entitled to do weird things. For example, ->filldir
41921 +        * supplied by knfsd re-enters file system. Make sure no locks are
41922 +        * held.
41923 +        */
41924 +       assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
41925 +
41926 +       result = filldir(dirent, local_name, (int) strlen(local_name),
41927 +                        /* offset of this entry */
41928 +                        f->f_pos,
41929 +                        /* inode number of object bounden by this entry */
41930 +                        oid_to_uino(get_key_objectid(&sd_key)),
41931 +                        file_type);
41932 +       if (local_name != name_buf)
41933 +               kfree(local_name);
41934 +       if (result < 0)
41935 +               /* ->filldir() is satisfied. (no space in buffer, IOW) */
41936 +               result = 1;
41937 +       else
41938 +               result = seal_validate(&seal, coord, &entry_key, LEAF_LEVEL,
41939 +                                      tap->lh, FIND_EXACT,
41940 +                                      tap->mode, ZNODE_LOCK_HIPRI);
41941 +       return result;
41942 +}
41943 +
41944 +/* advance readdir position @pos by one entry, using key of unit at @coord */
41945 +static void
41946 +move_entry(readdir_pos * pos, coord_t * coord)
41947 +{
41948 +       de_id *saved_id = &pos->position.dir_entry_key;
41949 +       reiser4_key unit_key;
41950 +
41951 +       /* one more logical entry has been passed */
41952 +       pos->entry_no += 1;
41953 +
41954 +       unit_key_by_coord(coord, &unit_key);
41955 +       if (de_id_key_cmp(saved_id, &unit_key) != EQUAL_TO) {
41956 +               /* key changed: ordinal restarts, new key id is remembered */
41957 +               pos->position.pos = 0;
41958 +               build_de_id_by_key(&unit_key, saved_id);
41959 +       } else {
41960 +               /* we are within sequence of directory entries with
41961 +                * duplicate keys. */
41962 +               pos->position.pos += 1;
41963 +       }
41964 +
41965 +       pos->fpos += 1;
41966 +}
41967 +
41968 +/*
41969 + *     STATELESS READDIR
41970 + *
41971 + * readdir support in reiser4 relies on ability to update readdir_pos embedded
41972 + * into reiser4_file_fsdata on each directory modification (name insertion and
41973 + * removal), see readdir_common() function below. This obviously doesn't work
41974 + * when reiser4 is accessed over NFS, because NFS doesn't keep any state
41975 + * across client READDIR requests for the same directory.
41976 + *
41977 + * To address this we maintain a "pool" of detached reiser4_file_fsdata
41978 + * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
41979 + * find detached reiser4_file_fsdata corresponding to previous readdir
41980 + * request. In other words, additional state is maintained on the
41981 + * server. (This is somewhat contrary to the design goals of NFS protocol.)
41982 + *
41983 + * To efficiently detect when our ->readdir() method is called by NFS server,
41984 + * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
41985 + * file_is_stateless() function).
41986 + *
41987 + * To find out d_cursor in the pool, we encode client id (cid) in the highest
41988 + * bits of NFS readdir cookie: when first readdir request comes to the given
41989 + * directory from the given client, cookie is set to 0. This situation is
41990 + * detected, global cid_counter is incremented, and stored in highest bits of
41991 + * all direntry offsets returned to the client, including last one. As the
41992 + * only valid readdir cookie is one obtained as direntry->offset, we are
41993 + * guaranteed that next readdir request (continuing current one) will have
41994 + * current cid in the highest bits of starting readdir cookie. All d_cursors
41995 + * are hashed into per-super-block hash table by (oid, cid) key.
41996 + *
41997 + * In addition d_cursors are placed into per-super-block radix tree where they
41998 + * are keyed by oid alone. This is necessary to efficiently remove them during
41999 + * rmdir.
42000 + *
42001 + * Finally, currently unused d_cursors are linked into a special list. This
42002 + * list is used by d_cursor_shrink() to reclaim d_cursors on memory pressure.
42003 + *
42004 + */
42005 +
42006 +TYPE_SAFE_LIST_DECLARE(d_cursor);
42007 +TYPE_SAFE_LIST_DECLARE(a_cursor);
42008 +/* hash key of a d_cursor: (client id, directory oid) pair */
42009 +typedef struct {
42010 +       __u16 cid; /* client id, carried in high bits of readdir cookie */
42011 +       __u64 oid; /* object id of the directory being read */
42012 +} d_cursor_key;
42013 +/* detached readdir state ("d_cursor") kept for stateless (NFS) readdir */
42014 +struct dir_cursor {
42015 +       int                  ref;    /* users; 0 means "on the unused list" */
42016 +       reiser4_file_fsdata *fsdata; /* readdir state owned by this cursor */
42017 +       d_cursor_hash_link   hash;   /* linkage into (oid, cid) hash table */
42018 +       d_cursor_list_link   list;   /* circular list of cursors of one oid */
42019 +       d_cursor_key         key;    /* (cid, oid) this cursor is hashed by */
42020 +       d_cursor_info       *info;   /* per-super-block hash table and tree */
42021 +       a_cursor_list_link   alist;  /* linkage into cursor_cache list */
42022 +};
42023 +/* global d_cursor state; cursor_cache lists cursors whose ref dropped to 0 */
42024 +static kmem_cache_t *d_cursor_slab; /* dir_cursors are allocated from here */
42025 +static struct shrinker *d_cursor_shrinker; /* set in d_cursor_init() */
42026 +static unsigned long d_cursor_unused = 0; /* cursors on cursor_cache */
42027 +static spinlock_t d_lock = SPIN_LOCK_UNLOCKED; /* taken around updates */
42028 +static a_cursor_list_head cursor_cache = TYPE_SAFE_LIST_HEAD_INIT(cursor_cache);
42029 +
42030 +#define D_CURSOR_TABLE_SIZE (256)
42031 +/* bucket of @key: (oid + cid) modulo table size. @table is not used. */
42032 +static inline unsigned long
42033 +d_cursor_hash(d_cursor_hash_table *table, const d_cursor_key * key)
42034 +{
42035 +       assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
42036 +       return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
42037 +}
42038 +/* keys are equal iff both client id and directory oid match */
42039 +static inline int
42040 +d_cursor_eq(const d_cursor_key * k1, const d_cursor_key * k2)
42041 +{
42042 +       return k1->cid == k2->cid && k1->oid == k2->oid;
42043 +}
42044 +/* KMALLOC/KFREE exist only while the hash template below is expanded */
42045 +#define KMALLOC(size) kmalloc((size), GFP_KERNEL)
42046 +#define KFREE(ptr, size) kfree(ptr)
42047 +TYPE_SAFE_HASH_DEFINE(d_cursor,
42048 +                     dir_cursor,
42049 +                     d_cursor_key,
42050 +                     key,
42051 +                     hash,
42052 +                     d_cursor_hash,
42053 +                     d_cursor_eq);
42054 +#undef KFREE
42055 +#undef KMALLOC
42056 +/* d_cursor list links through ->list, a_cursor list through ->alist */
42057 +TYPE_SAFE_LIST_DEFINE(d_cursor, dir_cursor, list);
42058 +TYPE_SAFE_LIST_DEFINE(a_cursor, dir_cursor, alist);
42059 +
42060 +static void kill_cursor(dir_cursor *cursor);
42061 +
42062 +/*
42063 + * shrink d_cursors cache. Scan LRU list of unused cursors from its front,
42064 + * freeing up to @nr of them under d_lock. With @nr == 0 nothing is freed.
42065 + * @gfp_mask is not used. Return number of still freeable cursors.
42066 + */
42067 +int d_cursor_shrink(int nr, unsigned int gfp_mask)
42068 +{
42069 +       if (nr != 0) {
42070 +               dir_cursor *scan;
42071 +
42072 +               spin_lock(&d_lock);
42073 +               while (!a_cursor_list_empty(&cursor_cache)) {
42074 +                       /* cursors are appended at the back when released */
42075 +                       scan = a_cursor_list_front(&cursor_cache);
42076 +                       assert("nikita-3567", scan->ref == 0);
42077 +                       /* this also decrements d_cursor_unused */
42078 +                       kill_cursor(scan);
42079 +                       -- nr;
42080 +                       if (nr == 0)
42081 +                               break;
42082 +               }
42083 +               spin_unlock(&d_lock);
42084 +       }
42085 +       return d_cursor_unused;
42086 +}
42087 +
42088 +/*
42089 + * perform global initializations for the d_cursor sub-system: create slab
42090 + * and register shrinker. Returns 0 or -ENOMEM; on error nothing stays set up.
42091 + */
42092 +reiser4_internal int
42093 +d_cursor_init(void)
42094 +{
42095 +       d_cursor_slab = kmem_cache_create("d_cursor", sizeof (dir_cursor), 0,
42096 +                                         SLAB_HWCACHE_ALIGN, NULL, NULL);
42097 +       if (d_cursor_slab == NULL)
42098 +               return RETERR(-ENOMEM);
42099 +       /* actually, d_cursors are "priceless", because there is no way to
42100 +        * recover information stored in them. On the other hand, we don't
42101 +        * want to consume all kernel memory by them. As a compromise, assign
42102 +        * higher "seeks" value to d_cursor cache, so that it will be shrunk
42103 +        * only if system is really tight on memory. */
42104 +       d_cursor_shrinker = set_shrinker(DEFAULT_SEEKS << 3, d_cursor_shrink);
42105 +       if (d_cursor_shrinker == NULL) {
42106 +               /* don't leak the slab created above */
42107 +               kmem_cache_destroy(d_cursor_slab);
42108 +               d_cursor_slab = NULL;
42109 +               return RETERR(-ENOMEM);
42110 +       }
42111 +       return 0;
42112 +}
42113 +
42114 +/* Dual to d_cursor_init(): release global d_cursor resources. Shrinker is
42115 + * removed before the slab is destroyed. Each pointer is reset to NULL, so
42116 + * the function tolerates partial initialization and repeated calls. */
42117 +reiser4_internal void
42118 +d_cursor_done(void)
42119 +{
42120 +       if (d_cursor_shrinker != NULL) {
42121 +               remove_shrinker(d_cursor_shrinker);
42122 +               d_cursor_shrinker = NULL;
42123 +       }
42124 +       if (d_cursor_slab != NULL) {
42125 +               kmem_cache_destroy(d_cursor_slab);
42126 +               d_cursor_slab = NULL;
42127 +       }
42128 +}
42129 +
42130 +/* initialize per-super-block d_cursor resources of @s: the radix tree and
42131 + * the hash table with D_CURSOR_TABLE_SIZE buckets. Returns the result of
42132 + * d_cursor_hash_init(). */
42133 +reiser4_internal int
42134 +d_cursor_init_at(struct super_block *s)
42135 +{
42136 +       d_cursor_info *p;
42137 +
42138 +       p = &get_super_private(s)->d_info;
42139 +
42140 +       INIT_RADIX_TREE(&p->tree, GFP_KERNEL);
42141 +       return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE, NULL);
42142 +}
42143 +
42144 +/* Dual to d_cursor_init_at: release per-super-block d_cursor resources.
42145 + * Only the hash table of @s is torn down here; the radix tree is not
42146 + * touched by this function. */
42147 +reiser4_internal void
42148 +d_cursor_done_at(struct super_block *s)
42149 +{
42150 +       d_cursor_hash_done(&get_super_private(s)->d_info.table);
42151 +}
42152 +
42153 +/*
42154 + * return d_cursor data (hash table and radix tree) of @inode's super block.
42155 + */
42156 +static inline d_cursor_info * d_info(struct inode *inode)
42157 +{
42158 +       return &get_super_private(inode->i_sb)->d_info;
42159 +}
42160 +
42161 +/* lookup d_cursor in the per-super-block radix tree of @info. Callers pass
42162 + * the directory oid cast to unsigned long as @index and treat a NULL result
42163 + * as "no cursor for this oid". */
42164 +static inline dir_cursor *lookup(d_cursor_info *info, unsigned long index)
42165 +{
42166 +       return (dir_cursor *)radix_tree_lookup(&info->tree, index);
42167 +}
42168 +
42169 +/*
42170 + * Make @cursor reachable through radix tree slot @index. The tree holds one
42171 + * cursor per index; further cursors of that index hang off its circular list.
42172 + */
42173 +static void bind_cursor(dir_cursor *cursor, unsigned long index)
42174 +{
42175 +       dir_cursor *first = lookup(cursor->info, index);
42176 +
42177 +       if (first != NULL) {
42178 +               /* slot is occupied: chain ours right after its occupant */
42179 +               d_cursor_list_insert_after(first, cursor);
42180 +               return;
42181 +       }
42182 +       /* first cursor for this index: it forms a one-element list and is
42183 +        * stored into the tree */
42184 +       d_cursor_list_clean(cursor);
42185 +       radix_tree_insert(&cursor->info->tree, index, cursor);
42186 +}
42187 +
42188 +/* remove @cursor from indices (radix tree, per-oid list, unused list, hash
42189 + * table) and free it together with its fsdata. @cursor must be unreferenced
42190 + * (ref == 0). Callers in this file hold d_lock. */
42191 +static void
42192 +kill_cursor(dir_cursor *cursor)
42193 +{
42194 +       unsigned long index;
42195 +
42196 +       assert("nikita-3566", cursor->ref == 0);
42197 +       assert("nikita-3572", cursor->fsdata != NULL);
42198 +       /* take fsdata off the readdir list it may be on, then free it */
42199 +       index = (unsigned long)cursor->key.oid;
42200 +       readdir_list_remove_clean(cursor->fsdata);
42201 +       reiser4_free_fsdata(cursor->fsdata);
42202 +       cursor->fsdata = NULL;
42203 +
42204 +       if (d_cursor_list_is_clean(cursor))
42205 +               /* this is last cursor for a file. Kill radix-tree entry */
42206 +               radix_tree_delete(&cursor->info->tree, index);
42207 +       else {
42208 +               void **slot;
42209 +
42210 +               /*
42211 +                * there are other cursors for the same oid.
42212 +                */
42213 +
42214 +               /*
42215 +                * if radix tree point to the cursor being removed, re-target
42216 +                * radix tree slot to the next cursor in the (non-empty as was
42217 +                * checked above) element of the circular list of all cursors
42218 +                * for this oid.
42219 +                */
42220 +               slot = radix_tree_lookup_slot(&cursor->info->tree, index);
42221 +               assert("nikita-3571", *slot != NULL);
42222 +               if (*slot == cursor)
42223 +                       *slot = d_cursor_list_next(cursor);
42224 +               /* remove cursor from circular list */
42225 +               d_cursor_list_remove_clean(cursor);
42226 +       }
42227 +       /* remove cursor from the list of unused cursors */
42228 +       a_cursor_list_remove_clean(cursor);
42229 +       /* remove cursor from the hash table */
42230 +       d_cursor_hash_remove(&cursor->info->table, cursor);
42231 +       /* and free it */
42232 +       kmem_cache_free(d_cursor_slab, cursor);
42233 +       -- d_cursor_unused; /* it had ref == 0, i.e. was counted as unused */
42234 +}
42235 +
42236 +/* actions process_cursors() can perform on all cursors of the given file */
42237 +enum cursor_action {
42238 +       /* load all detached state: this is called when stat-data is loaded
42239 +        * from the disk to recover information about all pending readdirs */
42240 +       CURSOR_LOAD,
42241 +       /* detach all state from inode, leaving it in the cache. This is
42242 +        * called when inode is removed from the memory by memory pressure */
42243 +       CURSOR_DISPOSE,
42244 +       /* detach cursors from the inode, and free them. This is called when
42245 +        * inode is destroyed. */
42246 +       CURSOR_KILL
42247 +};
42248 +/* apply @act to every cursor bound to the oid of @inode; takes d_lock */
42249 +static void
42250 +process_cursors(struct inode *inode, enum cursor_action act)
42251 +{
42252 +       oid_t oid;
42253 +       dir_cursor *start;
42254 +       readdir_list_head *head;
42255 +       reiser4_context ctx;
42256 +       d_cursor_info *info;
42257 +
42258 +       /* this can be called by
42259 +        *
42260 +        * kswapd->...->prune_icache->..reiser4_destroy_inode
42261 +        *
42262 +        * without reiser4_context
42263 +        */
42264 +       init_context(&ctx, inode->i_sb);
42265 +       /* NOTE(review): @inode has already been dereferenced just above */
42266 +       assert("nikita-3558", inode != NULL);
42267 +
42268 +       info = d_info(inode);
42269 +       oid = get_inode_oid(inode);
42270 +       spin_lock_inode(inode);
42271 +       head = get_readdir_list(inode);
42272 +       spin_lock(&d_lock);
42273 +       /* find any cursor for this oid: reference to it is hanging of radix
42274 +        * tree */
42275 +       start = lookup(info, (unsigned long)oid);
42276 +       if (start != NULL) {
42277 +               dir_cursor *scan;
42278 +               reiser4_file_fsdata *fsdata;
42279 +
42280 +               /* process circular list of cursors for this oid */
42281 +               scan = start;
42282 +               do {
42283 +                       dir_cursor *next;
42284 +                       /* fetch successor first: @scan may be freed below */
42285 +                       next = d_cursor_list_next(scan);
42286 +                       fsdata = scan->fsdata;
42287 +                       assert("nikita-3557", fsdata != NULL);
42288 +                       if (scan->key.oid == oid) {
42289 +                               switch (act) {
42290 +                               case CURSOR_DISPOSE:
42291 +                                       readdir_list_remove_clean(fsdata);
42292 +                                       break;
42293 +                               case CURSOR_LOAD:
42294 +                                       readdir_list_push_front(head, fsdata);
42295 +                                       break;
42296 +                               case CURSOR_KILL:
42297 +                                       kill_cursor(scan);
42298 +                                       break;
42299 +                               }
42300 +                       }
42301 +                       if (scan == next)
42302 +                               /* last cursor was just killed */
42303 +                               break;
42304 +                       scan = next;
42305 +               } while (scan != start);
42306 +       }
42307 +       spin_unlock(&d_lock);
42308 +       /* check that we killed 'em all */
42309 +       assert("nikita-3568", ergo(act == CURSOR_KILL,
42310 +                                  readdir_list_empty(get_readdir_list(inode))));
42311 +       assert("nikita-3569", ergo(act == CURSOR_KILL,
42312 +                                  lookup(info, oid) == NULL));
42313 +       spin_unlock_inode(inode);
42314 +       reiser4_exit_context(&ctx);
42315 +}
42316 +
42317 +/* detach all cursors from @inode (CURSOR_DISPOSE): their fsdata leave the
42318 + * readdir list. Called when inode is removed from memory by memory pressure */
42319 +reiser4_internal void dispose_cursors(struct inode *inode)
42320 +{
42321 +       process_cursors(inode, CURSOR_DISPOSE);
42322 +}
42323 +
42324 +/* attach all detached cursors to @inode (CURSOR_LOAD): their fsdata are put
42325 + * on the inode's readdir list. This is done when inode is loaded into memory */
42326 +reiser4_internal void load_cursors(struct inode *inode)
42327 +{
42328 +       process_cursors(inode, CURSOR_LOAD);
42329 +}
42330 +
42331 +/* free all cursors of @inode (CURSOR_KILL). Called when inode is destroyed. */
42332 +reiser4_internal void kill_cursors(struct inode *inode)
42333 +{
42334 +       process_cursors(inode, CURSOR_KILL);
42335 +}
42336 +
42337 +/* global counter used to generate "client ids". These ids are encoded into
42338 + * high bits of fpos. */
42339 +static __u32 cid_counter = 0;
42340 +
42341 +/*
42342 + * Drop @f's reference on its detachable readdir state. When the last
42343 + * reference goes, the cursor is parked at the tail of the "unused" list.
42344 + * A file without fsdata, or whose fsdata has no cursor, is left untouched.
42345 + */
42346 +static void
42347 +clean_fsdata(struct file *f)
42348 +{
42349 +       reiser4_file_fsdata *priv;
42350 +       dir_cursor *cur;
42351 +
42352 +       assert("nikita-3570", file_is_stateless(f));
42353 +
42354 +       priv = (reiser4_file_fsdata *)f->private_data;
42355 +       if (priv == NULL)
42356 +               return;
42357 +       cur = priv->cursor;
42358 +       if (cur == NULL)
42359 +               return;
42360 +       spin_lock(&d_lock);
42361 +       if (-- cur->ref == 0) {
42362 +               a_cursor_list_push_back(&cursor_cache, cur);
42363 +               ++ d_cursor_unused;
42364 +       }
42365 +       spin_unlock(&d_lock);
42366 +       f->private_data = NULL;
42367 +}
42368 +
42369 +/* add detachable readdir state to the @f. @cursor is zeroed, set up for
42370 + * @inode with a fresh cid and published. Returns 0 or -errno; on error
42371 + * @cursor is linked nowhere and still belongs to the caller. */
42372 +static int
42373 +insert_cursor(dir_cursor *cursor, struct file *f, struct inode *inode)
42374 +{
42375 +       int                  result;
42376 +       reiser4_file_fsdata *fsdata;
42377 +
42378 +       xmemset(cursor, 0, sizeof *cursor);
42379 +       /* first call to readdir, or rewind: create new cursor anyway */
42380 +       fsdata = create_fsdata(NULL, GFP_KERNEL);
42381 +       if (fsdata != NULL) {
42382 +               result = radix_tree_preload(GFP_KERNEL);
42383 +               if (result == 0) {
42384 +                       d_cursor_info *info;
42385 +                       oid_t oid;
42386 +
42387 +                       info = d_info(inode);
42388 +                       oid  = get_inode_oid(inode);
42389 +                       /* cid occupies higher bits of f->f_pos. Mask keeps
42390 +                        * f_pos non-negative: else nfsd_readdir() is confused */
42391 +                       cursor->key.cid = (++ cid_counter) & 0x7ff;
42392 +                       cursor->key.oid = oid;
42393 +                       cursor->fsdata  = fsdata;
42394 +                       cursor->info    = info;
42395 +                       cursor->ref     = 1;
42396 +                       spin_lock_inode(inode);
42397 +                       /* install as @f's private_data, discarding old one */
42398 +                       clean_fsdata(f);
42399 +                       reiser4_free_file_fsdata(f);
42400 +                       f->private_data = fsdata;
42401 +                       fsdata->cursor = cursor;
42402 +                       spin_unlock_inode(inode);
42403 +                       spin_lock(&d_lock);
42404 +                       /* insert cursor into hash table */
42405 +                       d_cursor_hash_insert(&info->table, cursor);
42406 +                       /* and chain it into radix-tree */
42407 +                       bind_cursor(cursor, (unsigned long)oid);
42408 +                       spin_unlock(&d_lock);
42409 +                       radix_tree_preload_end();
42410 +                       f->f_pos = ((__u64)cursor->key.cid) << CID_SHIFT;
42411 +               } else
42412 +                       /* preload failed: fsdata was never published */
42413 +                       reiser4_free_fsdata(fsdata);
42414 +       } else
42415 +               result = RETERR(-ENOMEM);
42416 +       return result;
42417 +}
42418 +
42419 +/* find or create cursor for readdir-over-nfs. No-op (returns 0) for files
42420 + * that are not stateless. Returns 0, or -errno if new cursor can't be made */
42421 +static int
42422 +try_to_attach_fsdata(struct file *f, struct inode *inode)
42423 +{
42424 +       loff_t pos;
42425 +       int    result;
42426 +       dir_cursor *cursor;
42427 +
42428 +       /* we are serialized by inode->i_sem */
42429 +
42430 +       if (!file_is_stateless(f))
42431 +               return 0;
42432 +
42433 +       pos = f->f_pos;
42434 +       result = 0;
42435 +       if (pos == 0) {
42436 +               /* first call to readdir (or rewind to the beginning of
42437 +                * directory) */
42438 +               cursor = kmem_cache_alloc(d_cursor_slab, GFP_KERNEL);
42439 +               if (cursor != NULL) {
42440 +                       result = insert_cursor(cursor, f, inode);
42441 +                       if (result != 0)
42442 +                               /* cursor was not published: free it */
42443 +                               kmem_cache_free(d_cursor_slab, cursor);
42444 +               } else
42445 +                       result = RETERR(-ENOMEM);
42446 +       } else {
42447 +               /* try to find existing cursor */
42448 +               d_cursor_key key;
42449 +
42450 +               key.cid = pos >> CID_SHIFT;
42451 +               key.oid = get_inode_oid(inode);
42452 +               spin_lock(&d_lock);
42453 +               cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
42454 +               if (cursor != NULL) {
42455 +                       /* cursor was found */
42456 +                       if (cursor->ref == 0) {
42457 +                               /* move it from unused list */
42458 +                               a_cursor_list_remove_clean(cursor);
42459 +                               -- d_cursor_unused;
42460 +                       }
42461 +                       ++ cursor->ref;
42462 +               }
42463 +               spin_unlock(&d_lock);
42464 +               if (cursor != NULL) {
42465 +                       spin_lock_inode(inode);
42466 +                       assert("nikita-3556", cursor->fsdata->back == NULL);
42467 +                       clean_fsdata(f);
42468 +                       reiser4_free_file_fsdata(f);
42469 +                       f->private_data = cursor->fsdata;
42470 +                       spin_unlock_inode(inode);
42471 +               }
42472 +       }
42473 +       return result;
42474 +}
42475 +
42476 +/* release readdir state of stateless @f under its inode's spin lock */
42477 +static void
42478 +detach_fsdata(struct file *f)
42479 +{
42480 +       if (file_is_stateless(f)) {
42481 +               struct inode *host = f->f_dentry->d_inode;
42482 +
42483 +               /* clean_fsdata() is called with the inode spin lock held,
42484 +                * as at its other call sites */
42485 +               spin_lock_inode(host);
42486 +               clean_fsdata(f);
42487 +               spin_unlock_inode(host);
42488 +       }
42489 +}
42490 +
42491 +/* prepare for readdir: attach (possibly detached) readdir state to @f, put
42492 + * it on the directory's readdir list, return it in *@pos and position @tap
42493 + * there via dir_rewind(). Returns 0 or -errno (-ENOTDIR for non-directory) */
42494 +static int
42495 +dir_readdir_init(struct file *f, tap_t * tap, readdir_pos ** pos)
42496 +{
42497 +       struct inode *inode;
42498 +       reiser4_file_fsdata *fsdata;
42499 +       int result;
42500 +
42501 +       assert("nikita-1359", f != NULL);
42502 +       inode = f->f_dentry->d_inode;
42503 +       assert("nikita-1360", inode != NULL);
42504 +
42505 +       if (!S_ISDIR(inode->i_mode))
42506 +               return RETERR(-ENOTDIR);
42507 +
42508 +       /* try to find detached readdir state */
42509 +       result = try_to_attach_fsdata(f, inode);
42510 +       if (result != 0)
42511 +               return result;
42512 +
42513 +       fsdata = reiser4_get_file_fsdata(f);
42514 +       assert("nikita-2571", fsdata != NULL);
42515 +       if (IS_ERR(fsdata))
42516 +               return PTR_ERR(fsdata);
42517 +
42518 +       /* add file descriptor to the readdir list hanging of directory
42519 +        * inode. This list is used to scan "readdirs-in-progress" while
42520 +        * inserting or removing names in the directory. */
42521 +       spin_lock_inode(inode);
42522 +       if (readdir_list_is_clean(fsdata))
42523 +               readdir_list_push_front(get_readdir_list(inode), fsdata);
42524 +       *pos = &fsdata->dir.readdir;
42525 +       spin_unlock_inode(inode);
42526 +       /* arguments follow the order of the labels in the format string */
42527 +       ON_TRACE(TRACE_DIR, " fpos: %llu entry_no: %llu\n",
42528 +                (*pos)->fpos, (*pos)->entry_no);
42529 +
42530 +       /* move @tap to the current position */
42531 +       return dir_rewind(f, *pos, tap);
42532 +}
42533 +
42534 +/*
42535 + * ->readdir method of directory plugin
42536 + *
42537 + * readdir problems:
42538 + *
42539 + *     Traditional UNIX API for scanning through directory
42540 + *     (readdir/seekdir/telldir/opendir/closedir/rewinddir/getdents) is based
42541 + *     on the assumption that directory is structured very much like regular
42542 + *     file, in particular, it is implied that each name within given
42543 + *     directory (directory entry) can be uniquely identified by scalar offset
42544 + *     and that such offset is stable across the life-time of the name it
42545 + *     identifies.
42546 + *
42547 + *     This is manifestly not so for reiser4. In reiser4 the only stable
42548 + *     unique identifier for the directory entry is its key that doesn't fit
42549 + *     into seekdir/telldir API.
42550 + *
42551 + * solution:
42552 + *
42553 + *     Within each file descriptor participating in readdir-ing of directory
42554 + *     plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track
42555 + *     of the "current" directory entry that file descriptor looks at. It
42556 + *     contains a key of directory entry (plus some additional info to deal
42557 + *     with non-unique keys that we wouldn't dwell on here) and a logical
42558 + *     position of this directory entry starting from the beginning of the
42559 + *     directory, that is ordinal number of this entry in the readdir order.
42560 + *
42561 + *     Obviously this logical position is not stable in the face of directory
42562 + *     modifications. To work around this, on each addition or removal of
42563 + *     directory entry all file descriptors for directory inode are scanned
42564 + *     and their readdir_pos are updated accordingly (adjust_dir_pos()).
42565 + * Returns 0 or a negative error code; positive internal results become 0.
42566 + */
42567 +static int
42568 +readdir_common(struct file *f /* directory file being read */ ,
42569 +              void *dirent /* opaque data passed to us by VFS */ ,
42570 +              filldir_t filld  /* filler function passed to us
42571 +                                  * by VFS */ )
42572 +{
42573 +       int result;
42574 +       struct inode *inode;
42575 +       coord_t coord;
42576 +       lock_handle lh;
42577 +       tap_t tap;
42578 +       readdir_pos *pos;
42579 +
42580 +       assert("nikita-1359", f != NULL);
42581 +       inode = f->f_dentry->d_inode;
42582 +       assert("nikita-1360", inode != NULL);
42583 +
42584 +       reiser4_stat_inc(dir.readdir.calls);
42585 +
42586 +       if (!S_ISDIR(inode->i_mode))
42587 +               return RETERR(-ENOTDIR);
42588 +
42589 +       coord_init_zero(&coord);
42590 +       init_lh(&lh);
42591 +       tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
42592 +
42593 +       reiser4_readdir_readahead_init(inode, &tap);
42594 +
42595 +       ON_TRACE(TRACE_DIR | TRACE_VFS_OPS,
42596 +                "readdir: inode: %llu offset: %#llx\n",
42597 +                get_inode_oid(inode), f->f_pos);
42598 +
42599 + repeat:        /* restart point used when feed_entry() returns -E_REPEAT */
42600 +       result = dir_readdir_init(f, &tap, &pos);
42601 +       if (result == 0) {
42602 +               result = tap_load(&tap);
42603 +               /* scan entries one by one feeding them to @filld */
42604 +               while (result == 0) {
42605 +                       coord_t *coord; /* NOTE: shadows the function-level coord */
42606 +
42607 +                       coord = tap.coord;
42608 +                       assert("nikita-2572", coord_is_existing_unit(coord));
42609 +                       assert("nikita-3227", is_valid_dir_coord(inode, coord));
42610 +
42611 +                       result = feed_entry(f, pos, &tap, filld, dirent);
42612 +                       ON_TRACE(TRACE_DIR | TRACE_VFS_OPS,
42613 +                                "readdir: entry: offset: %#llx\n", f->f_pos);
42614 +                       if (result > 0) { /* ends the scan; reported as 0 on return */
42615 +                               break;
42616 +                       } else if (result == 0) {
42617 +                               ++ f->f_pos;
42618 +                               result = go_next_unit(&tap);
42619 +                               if (result == -E_NO_NEIGHBOR ||
42620 +                                   result == -ENOENT) {
42621 +                                       result = 0; /* these two codes are not reported as errors */
42622 +                                       break;
42623 +                               } else if (result == 0) {
42624 +                                       if (is_valid_dir_coord(inode, coord))
42625 +                                               move_entry(pos, coord);
42626 +                                       else
42627 +                                               break; /* new coord is not valid for this directory */
42628 +                               }
42629 +                       } else if (result == -E_REPEAT) {
42630 +                               /* feed_entry() had to restart. */
42631 +                               ++ f->f_pos;
42632 +                               tap_relse(&tap);
42633 +                               goto repeat;
42634 +                       } else
42635 +                               warning("vs-1617",
42636 +                                       "readdir_common: unexpected error %d",
42637 +                                       result);
42638 +               }
42639 +               tap_relse(&tap);
42640 +
42641 +               if (result >= 0)
42642 +                       f->f_version = inode->i_version;
42643 +       } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
42644 +               result = 0; /* same two codes from dir_readdir_init() are swallowed too */
42645 +       tap_done(&tap);
42646 +       ON_TRACE(TRACE_DIR | TRACE_VFS_OPS,
42647 +                "readdir_exit: offset: %#llx\n", f->f_pos);
42648 +       detach_fsdata(f);
42649 +       return (result <= 0) ? result : 0; /* never return a positive value */
42650 +}
42651 +
42652 +/*
42653 + * seek method for directory: default_llseek() moves ->f_pos, then readdir
42654 + * state is re-initialized. See comment before readdir_common().
42655 + */
42656 +loff_t
42657 +seek_dir(struct file *file, loff_t off, int origin)
42658 +{
42659 +       struct inode *dir = file->f_dentry->d_inode;
42660 +       loff_t new_off;
42661 +
42662 +       ON_TRACE(TRACE_DIR | TRACE_VFS_OPS, "seek_dir: %s: %lli -> %lli/%i\n",
42663 +                file->f_dentry->d_name.name, file->f_pos, off, origin);
42664 +
42665 +       /* ->f_pos and readdir position are updated under ->i_sem */
42666 +       down(&dir->i_sem);
42667 +
42668 +       new_off = default_llseek(file, off, origin);
42669 +       if (new_off >= 0) {
42670 +               coord_t coord;
42671 +               lock_handle lh;
42672 +               readdir_pos *pos;
42673 +               tap_t tap;
42674 +               int err;
42675 +
42676 +               coord_init_zero(&coord);
42677 +               init_lh(&lh);
42678 +               tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
42679 +
42680 +               err = dir_readdir_init(file, &tap, &pos);
42681 +               detach_fsdata(file);
42682 +               /* on failure report the error instead of the new offset */
42683 +               new_off = (err != 0) ? (loff_t) err : new_off;
42684 +               tap_done(&tap);
42685 +       }
42686 +       detach_fsdata(file);
42687 +       up(&dir->i_sem);
42688 +       return new_off;
42689 +}
42690 +
42691 +/* ->attach method of directory plugin: only sanity-checks its arguments and returns 0 */
42692 +static int
42693 +attach_common(struct inode *child UNUSED_ARG, struct inode *parent UNUSED_ARG)
42694 +{
42695 +       assert("nikita-2647", child != NULL);
42696 +       assert("nikita-2648", parent != NULL);
42697 +
42698 +       return 0;
42699 +}
42700 +
42701 +/* ->estimate.add_entry method of directory plugin.
42702 +   Adding an entry is estimated as insertion of one unit into an item of @inode's tree.
42703 +*/
42704 +static reiser4_block_nr
42705 +estimate_add_entry_common(struct inode *inode)
42706 +{
42707 +       return estimate_one_insert_into_item(tree_by_inode(inode));
42708 +}
42709 +
42710 +/* ->estimate.rem_entry method of directory plugin: delegates to estimate_one_item_removal() on @inode's tree */
42711 +static reiser4_block_nr
42712 +estimate_rem_entry_common(struct inode *inode)
42713 +{
42714 +       return estimate_one_item_removal(tree_by_inode(inode));
42715 +}
42716 +
42717 +/* placeholder for VFS methods not-applicable to the object: always fails with -EPERM */
42718 +static ssize_t
42719 +noperm(void)
42720 +{
42721 +       return RETERR(-EPERM);
42722 +}
42723 +
42724 +#define dir_eperm ((void *)noperm) /* untyped alias used to fill method slots in dir_plugins[] */
42725 +
42726 +static int
42727 +_noop(void) /* placeholder method: does nothing and returns 0 */
42728 +{
42729 +       return 0;
42730 +}
42731 +
42732 +#define enoop ((void *)_noop) /* untyped alias used to fill method slots in dir_plugins[] */
42733 +
42734 +static int
42735 +change_dir(struct inode * inode, reiser4_plugin * plugin)
42736 +{
42737 +       /* cannot change dir plugin of already existing object: always refused with -EINVAL */
42738 +       return RETERR(-EINVAL);
42739 +}
42740 +
42741 +static reiser4_plugin_ops dir_plugin_ops = { /* ->pops shared by every entry of dir_plugins[] */
42742 +       .init     = NULL,
42743 +       .load     = NULL,
42744 +       .save_len = NULL,
42745 +       .save     = NULL,
42746 +       .change   = change_dir /* the only operation provided */
42747 +};
42748 +
42749 +/*
42750 + * definition of directory plugins: method table indexed by directory plugin id
42751 + */
42752 +
42753 +dir_plugin dir_plugins[LAST_DIR_ID] = {
42754 +       /* standard hashed directory plugin */
42755 +       [HASHED_DIR_PLUGIN_ID] = {
42756 +               .h = {
42757 +                       .type_id = REISER4_DIR_PLUGIN_TYPE,
42758 +                       .id = HASHED_DIR_PLUGIN_ID,
42759 +                       .pops = &dir_plugin_ops,
42760 +                       .label = "dir",
42761 +                       .desc = "hashed directory",
42762 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
42763 +               },
42764 +               .get_parent = get_parent_hashed,
42765 +               .lookup = lookup_hashed,
42766 +               .unlink = unlink_common,
42767 +               .link = link_common,
42768 +               .is_name_acceptable = is_name_acceptable,
42769 +               .build_entry_key = build_entry_key_common,
42770 +               .build_readdir_key = build_readdir_key_common,
42771 +               .add_entry = add_entry_hashed,
42772 +               .rem_entry = rem_entry_hashed,
42773 +               .create_child = create_child_common,
42774 +               .rename = rename_hashed,
42775 +               .readdir = readdir_common,
42776 +               .init = init_hashed,
42777 +               .done = done_hashed,
42778 +               .attach = attach_common,
42779 +               .detach = detach_hashed,
42780 +               .estimate = {
42781 +                       .add_entry = estimate_add_entry_common,
42782 +                       .rem_entry = estimate_rem_entry_common,
42783 +                       .unlink    = estimate_unlink_hashed
42784 +               }
42785 +       },
42786 +       /* hashed directory for which seekdir/telldir are guaranteed to work.
42787 +        * Brain-damage. Apart from ->h, differs from the plugin above only in ->build_entry_key. */
42788 +       [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
42789 +               .h = {
42790 +                       .type_id = REISER4_DIR_PLUGIN_TYPE,
42791 +                       .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
42792 +                       .pops = &dir_plugin_ops,
42793 +                       .label = "dir32",
42794 +                       .desc = "directory hashed with 31 bit hash",
42795 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
42796 +               },
42797 +               .get_parent = get_parent_hashed,
42798 +               .lookup = lookup_hashed,
42799 +               .unlink = unlink_common,
42800 +               .link = link_common,
42801 +               .is_name_acceptable = is_name_acceptable,
42802 +               .build_entry_key = build_entry_key_stable_entry,
42803 +               .build_readdir_key = build_readdir_key_common,
42804 +               .add_entry = add_entry_hashed,
42805 +               .rem_entry = rem_entry_hashed,
42806 +               .create_child = create_child_common,
42807 +               .rename = rename_hashed,
42808 +               .readdir = readdir_common,
42809 +               .init = init_hashed,
42810 +               .done = done_hashed,
42811 +               .attach = attach_common,
42812 +               .detach = detach_hashed,
42813 +               .estimate = {
42814 +                       .add_entry = estimate_add_entry_common,
42815 +                       .rem_entry = estimate_rem_entry_common,
42816 +                       .unlink    = estimate_unlink_hashed
42817 +               }
42818 +       },
42819 +       /* pseudo directory: modifying methods are dir_eperm, life-cycle hooks are enoop */
42820 +       [PSEUDO_DIR_PLUGIN_ID] = {
42821 +               .h = {
42822 +                       .type_id = REISER4_DIR_PLUGIN_TYPE,
42823 +                       .id = PSEUDO_DIR_PLUGIN_ID,
42824 +                       .pops = &dir_plugin_ops,
42825 +                       .label = "pseudo",
42826 +                       .desc = "pseudo directory",
42827 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
42828 +               },
42829 +               .get_parent = get_parent_pseudo,
42830 +               .lookup = lookup_pseudo,
42831 +               .unlink = dir_eperm,
42832 +               .link = dir_eperm,
42833 +               .is_name_acceptable = NULL,
42834 +               .build_entry_key = NULL,
42835 +               .build_readdir_key = NULL,
42836 +               .add_entry = dir_eperm,
42837 +               .rem_entry = dir_eperm,
42838 +               .create_child = NULL,
42839 +               .rename = dir_eperm,
42840 +               .readdir = readdir_pseudo,
42841 +               .init = enoop,
42842 +               .done = enoop,
42843 +               .attach = enoop,
42844 +               .detach = enoop,
42845 +               .estimate = {
42846 +                       .add_entry = NULL,
42847 +                       .rem_entry = NULL,
42848 +                       .unlink    = NULL
42849 +               }
42850 +       }
42851 +};
42852 +
42853 +/* Make Linus happy.
42854 +   Local variables:
42855 +   c-indentation-style: "K&R"
42856 +   mode-name: "LC"
42857 +   c-basic-offset: 8
42858 +   tab-width: 8
42859 +   fill-column: 120
42860 +   End:
42861 +*/
42862 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/dir/dir.h linux-2.6.8-rc3-a/fs/reiser4/plugin/dir/dir.h
42863 --- linux-2.6.8-rc3/fs/reiser4/plugin/dir/dir.h 1970-01-01 03:00:00.000000000 +0300
42864 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/dir/dir.h       2004-08-05 21:20:53.069663381 +0400
42865 @@ -0,0 +1,106 @@
42866 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
42867 + * reiser4/README */
42868 +
42869 +/* Directory plugin's methods. See dir.c for more details. */
42870 +
42871 +#if !defined( __REISER4_DIR_H__ )
42872 +#define __REISER4_DIR_H__
42873 +
42874 +#include "../../forward.h"
42875 +#include "../../kassign.h"
42876 +#include "../../type_safe_hash.h"
42877 +
42878 +#include <linux/types.h>       /* for __u??  */
42879 +#include <linux/fs.h>          /* for struct file  */
42880 +#include <linux/radix-tree.h>
42881 +
42882 +/* locking: fields of per file descriptor readdir_pos and ->f_pos are
42883 + * protected by ->i_sem on inode. Under this lock following invariant
42884 + * holds:
42885 + *
42886 + *     file descriptor is "looking" at the entry_no-th directory entry from
42887 + *     the beginning of directory. This entry has key dir_entry_key and is
42888 + *     pos-th entry within duplicate-key sequence.
42889 + *
42890 + */
42891 +
42892 +/* logical position within directory: entry key plus ordinal among entries sharing that key */
42893 +typedef struct {
42894 +       /* key of directory entry (actually, part of a key sufficient to
42895 +          identify directory entry)  */
42896 +       de_id dir_entry_key;
42897 +       /* ordinal number of directory entry among all entries with the same
42898 +          key. (Starting from 0.) */
42899 +       unsigned pos;
42900 +} dir_pos;
42901 +
42902 +typedef struct { /* per file descriptor readdir state; protected by ->i_sem on inode */
42903 +       /* f_pos corresponding to this readdir position */
42904 +       __u64 fpos;
42905 +       /* logical position within directory */
42906 +       dir_pos position;
42907 +       /* logical number of directory entry within
42908 +          directory, i.e. its ordinal from the beginning of the directory */
42909 +       __u64 entry_no;
42910 +} readdir_pos;
42911 +
42912 +extern void adjust_dir_file(struct inode *dir, const struct dentry *de,
42913 +                           int offset, int adj);
42914 +extern loff_t seek_dir(struct file *file, loff_t off, int origin);
42915 +
42916 +/* description of directory entry being created/destroyed/sought for
42917 +
42918 +   It is passed down to the directory plugin and farther to the
42919 +   directory item plugin methods. Creation of new directory entry is done
42920 +   in several stages: first we search for an entry with the same name, then
42921 +   create new one. reiser4_dir_entry_desc is used to store some information
42922 +   collected at some stage of this process and required later: key of
42923 +   item that we want to insert/delete and pointer to an object that will
42924 +   be bound by the new directory entry. Probably some more fields will
42925 +   be added there.
42926 +
42927 +*/
42928 +struct reiser4_dir_entry_desc {
42929 +       /* key of directory entry */
42930 +       reiser4_key key;
42931 +       /* object bound by this entry. */
42932 +       struct inode *obj;
42933 +};
42934 +
42935 +int is_name_acceptable(const struct inode *inode, const char *name UNUSED_ARG, int len);
42936 +int is_dir_empty(const struct inode *dir);
42937 +int reiser4_update_dir(struct inode *dir);
42938 +
42939 +void dispose_cursors(struct inode *inode);
42940 +void load_cursors(struct inode *inode);
42941 +void kill_cursors(struct inode *inode);
42942 +
42943 +typedef struct dir_cursor dir_cursor;
42944 +
42945 +TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
42946 +
42947 +int d_cursor_init_at(struct super_block *s);
42948 +void d_cursor_done_at(struct super_block *s);
42949 +
42950 +/*
42951 + * information about d_cursors (detached readdir state) maintained in reiser4
42952 + * specific portion of reiser4 super-block. See dir.c for more information on
42953 + * d_cursors.
42954 + */
42955 +typedef struct d_cursor_info {
42956 +       d_cursor_hash_table    table; /* hash table of d_cursors (declared by TYPE_SAFE_HASH_DECLARE above) */
42957 +       struct radix_tree_root tree;  /* radix tree; its use is not visible in this header, see dir.c */
42958 +} d_cursor_info;
42959 +
42960 +/* __REISER4_DIR_H__ */
42961 +#endif
42962 +
42963 +/*
42964 +   Local variables:
42965 +   c-indentation-style: "K&R"
42966 +   mode-name: "LC"
42967 +   c-basic-offset: 8
42968 +   tab-width: 8
42969 +   fill-column: 120
42970 +   End:
42971 +*/
42972 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/dir/hashed_dir.c linux-2.6.8-rc3-a/fs/reiser4/plugin/dir/hashed_dir.c
42973 --- linux-2.6.8-rc3/fs/reiser4/plugin/dir/hashed_dir.c  1970-01-01 03:00:00.000000000 +0300
42974 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/dir/hashed_dir.c        2004-08-05 21:20:53.162643769 +0400
42975 @@ -0,0 +1,1473 @@
42976 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
42977 + * reiser4/README */
42978 +
42979 +/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
42980 +   names to the files. */
42981 +
42982 +/* See fs/reiser4/doc/directory-service for initial design note. */
42983 +
42984 +/*
42985 + * Hashed directory logically consists of persistent directory
42986 + * entries. Directory entry is a pair of a file name and a key of stat-data of
42987 + * a file that has this name in the given directory.
42988 + *
42989 + * Directory entries are stored in the tree in the form of directory
42990 + * items. Directory item should implement dir_entry_ops portion of item plugin
42991 + * interface (see plugin/item/item.h). Hashed directory interacts with
42992 + * directory item plugin exclusively through dir_entry_ops operations.
42993 + *
42994 + * Currently there are two implementations of directory items: "simple
42995 + * directory item" (plugin/item/sde.[ch]), and "compound directory item"
42996 + * (plugin/item/cde.[ch]) with the latter being the default.
42997 + *
42998 + * There is, however, one delicate way in which directory code interferes
42999 + * with item plugin: key assignment policy. A key for a directory item is
43000 + * chosen by directory code, and as described in kassign.c, this key contains
43001 + * a portion of file name. Directory item uses this knowledge to avoid storing
43002 + * this portion of file name twice: in the key and in the directory item body.
43003 + *
43004 + */
43005 +
43006 +#include "../../forward.h"
43007 +#include "../../debug.h"
43008 +#include "../../spin_macros.h"
43009 +#include "../../key.h"
43010 +#include "../../kassign.h"
43011 +#include "../../coord.h"
43012 +#include "../../seal.h"
43013 +#include "dir.h"
43014 +#include "../item/item.h"
43015 +#include "../security/perm.h"
43016 +#include "../pseudo/pseudo.h"
43017 +#include "../plugin.h"
43018 +#include "../object.h"
43019 +#include "../../jnode.h"
43020 +#include "../../znode.h"
43021 +#include "../../tree.h"
43022 +#include "../../vfs_ops.h"
43023 +#include "../../inode.h"
43024 +#include "../../reiser4.h"
43025 +#include "../../safe_link.h"
43026 +
43027 +#include <linux/fs.h>          /* for struct inode */
43028 +#include <linux/dcache.h>      /* for struct dentry */
43029 +
43030 +static int create_dot_dotdot(struct inode *object, struct inode *parent);
43031 +static int find_entry(struct inode *dir, struct dentry *name,
43032 +                     lock_handle * lh, znode_lock_mode mode,
43033 +                     reiser4_dir_entry_desc * entry);
43034 +static int check_item(const struct inode *dir,
43035 +                     const coord_t * coord, const char *name);
43036 +
43037 +reiser4_internal reiser4_block_nr
43038 +hashed_estimate_init(struct inode *parent, struct inode *object)
43039 +{
43040 +       reiser4_block_nr res = 0;
43041 +
43042 +       assert("vpf-321", parent != NULL);
43043 +       assert("vpf-322", object != NULL);
43044 +
43045 +       /* blocks needed by create_dot_dotdot(): add_entry_hashed(".") */
43046 +       res += inode_dir_plugin(object)->estimate.add_entry(object);
43047 +       /* reiser4_add_nlink(object) */
43048 +       res += inode_file_plugin(object)->estimate.update(object);
43049 +       /* add_entry_hashed("..") */
43050 +       res += inode_dir_plugin(object)->estimate.add_entry(object);
43051 +       /* reiser4_add_nlink(parent) */
43052 +       res += inode_file_plugin(parent)->estimate.update(parent);
43053 +       /* return the sum: it used to be discarded in favour of constant 0 */
43054 +       return res;
43055 +}
43056 +
43057 +/* plugin->u.dir.init
43058 +   Reserve space for, and then create, dot and dotdot of a new directory. */
43059 +reiser4_internal int
43060 +init_hashed(struct inode *object /* new directory */ ,
43061 +           struct inode *parent /* parent directory */ ,
43062 +           reiser4_object_create_data * data UNUSED_ARG        /* info passed
43063 +                                                                * to us, this
43064 +                                                                * is filled by
43065 +                                                                * reiser4()
43066 +                                                                * syscall in
43067 +                                                                * particular */ )
43068 +{
43069 +       reiser4_block_nr reserve;
43070 +
43071 +       assert("nikita-680", object != NULL);
43072 +       assert("nikita-681", S_ISDIR(object->i_mode));
43073 +       assert("nikita-682", parent != NULL);
43074 +       assert("nikita-684", data != NULL);
43075 +       assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
43076 +       assert("nikita-687", object->i_mode & S_IFDIR);
43077 +       trace_stamp(TRACE_DIR);
43078 +
43079 +       reserve = hashed_estimate_init(parent, object);
43080 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
43081 +               return RETERR(-ENOSPC); /* any grab failure is reported as -ENOSPC */
43082 +
43083 +       return create_dot_dotdot(object, parent);
43084 +}
43085 +
43086 +static reiser4_block_nr
43087 +hashed_estimate_done(struct inode *object)
43088 +{
43089 +       /*
43090 +        * the only operation accounted for is removal of one directory
43091 +        * entry: hashed_rem_entry(object)
43092 +        */
43093 +       return inode_dir_plugin(object)->estimate.rem_entry(object);
43094 +}
43095 +
43096 +/* plugin->u.dir.estimate.unlink: entry removal plus two stat-data updates */
43097 +reiser4_internal reiser4_block_nr
43098 +estimate_unlink_hashed(struct inode *parent, struct inode *object)
43099 +{
43100 +       reiser4_block_nr entry_cost;
43101 +       reiser4_block_nr nlink_cost;
43102 +
43103 +       /* hashed_rem_entry(object) */
43104 +       entry_cost = inode_dir_plugin(object)->estimate.rem_entry(object);
43105 +       /* del_nlink(parent), counted twice */
43106 +       nlink_cost = 2 * inode_file_plugin(parent)->estimate.update(parent);
43107 +       return entry_cost + nlink_cost;
43108 +}
43109 +
43110 +/* ->delete() method of directory plugin
43111 +   plugin->u.dir.done
43112 +   Delete dot. Failure to remove it is only warned about: 0 is returned unless space reservation fails.
43113 +*/
43114 +reiser4_internal int
43115 +done_hashed(struct inode *object /* object being deleted */)
43116 +{
43117 +       int result;
43118 +       reiser4_block_nr reserve;
43119 +       struct dentry goodby_dots;
43120 +       reiser4_dir_entry_desc entry;
43121 +
43122 +       assert("nikita-1449", object != NULL);
43123 +
43124 +       if (inode_get_flag(object, REISER4_NO_SD))
43125 +               return 0; /* nothing is removed for an object flagged REISER4_NO_SD */
43126 +
43127 +       /* of course, this can be rewritten to sweep everything in one
43128 +          cut_tree(). */
43129 +       xmemset(&entry, 0, sizeof entry);
43130 +
43131 +       /* FIXME: this done method is called from delete_directory_common which
43132 +        * reserved space already */
43133 +       reserve = hashed_estimate_done(object);
43134 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED))
43135 +               return RETERR(-ENOSPC);
43136 +
43137 +       xmemset(&goodby_dots, 0, sizeof goodby_dots); /* on-stack dentry naming "." */
43138 +       entry.obj = goodby_dots.d_inode = object;
43139 +       goodby_dots.d_name.name = ".";
43140 +       goodby_dots.d_name.len = 1;
43141 +       result = rem_entry_hashed(object, &goodby_dots, &entry);
43142 +       reiser4_free_dentry_fsdata(&goodby_dots);
43143 +       if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
43144 +               /* only worth a warning
43145 +
43146 +                       "values of \ eB\ f will give rise to dom!\n"
43147 +                            -- v6src/s2/mv.c:89
43148 +               */
43149 +               warning("nikita-2252", "Cannot remove dot of %lli: %i",
43150 +                       get_inode_oid(object), result);
43151 +       return 0; /* result of rem_entry_hashed() is deliberately not propagated */
43152 +}
43154 +/* ->detach() method of directory plugin
43155 +   plugin->u.dir.detach
43156 +   Delete dotdot, decrease nlink on parent. Returns result of rem_entry_hashed().
43157 +*/
43158 +reiser4_internal int
43159 +detach_hashed(struct inode *object, struct inode *parent)
43160 +{
43161 +       int result;
43162 +       struct dentry goodby_dots;
43163 +       reiser4_dir_entry_desc entry;
43164 +
43165 +       assert("nikita-2885", object != NULL);
43166 +       assert("nikita-2886", !inode_get_flag(object, REISER4_NO_SD));
43167 +
43168 +       xmemset(&entry, 0, sizeof entry);
43169 +
43170 +       /* NOTE-NIKITA this only works if @parent is -the- parent of
43171 +          @object, viz. object whose key is stored in dotdot
43172 +          entry. Wouldn't work with hard-links on directories. */
43173 +       xmemset(&goodby_dots, 0, sizeof goodby_dots); /* on-stack dentry naming ".." */
43174 +       entry.obj = goodby_dots.d_inode = parent;
43175 +       goodby_dots.d_name.name = "..";
43176 +       goodby_dots.d_name.len = 2;
43177 +       result = rem_entry_hashed(object, &goodby_dots, &entry);
43178 +       reiser4_free_dentry_fsdata(&goodby_dots);
43179 +       if (result == 0) {
43180 +               /* the dot should be the only entry remaining at this time... */
43181 +               assert("nikita-3400", object->i_size == 1);
43182 +               /* and, together with the only name directory can have, they
43183 +                * provides for the last 2 remaining references. If we get
43184 +                * here as part of error handling during mkdir, @object
43185 +                * possibly has no name yet, so its nlink == 1. If we get here
43186 +                * from rename (targeting empty directory), it has no name
43187 +                * already, so its nlink == 1. */
43188 +               assert("nikita-3401",
43189 +                      object->i_nlink == 2 || object->i_nlink == 1);
43190 +
43191 +               reiser4_del_nlink(parent, object, 0); /* return value is not checked */
43192 +       }
43193 +       return result;
43194 +}
43195 +
43196 +
43197 +/* ->owns_item() for hashed directory object plugin. */
43198 +reiser4_internal int
43199 +owns_item_hashed(const struct inode *inode /* object to check against */ ,
43200 +                const coord_t * coord /* coord of item to check */ )
43201 +{
43202 +       reiser4_key key;
43203 +
43204 +       assert("nikita-1335", inode != NULL);
43205 +       assert("nikita-1334", coord != NULL);
43206 +
43207 +       /* items of any other type are delegated to the common check */
43208 +       if (item_type_by_coord(coord) != DIR_ENTRY_ITEM_TYPE)
43209 +               return owns_item_common(inode, coord);
43210 +       return get_key_locality(item_key_by_coord(coord, &key)) == get_inode_oid(inode);
43211 +}
43212 +
43213 +/* helper function for init_hashed(). Create "." and ".." */
43214 +static int
43215 +create_dot_dotdot(struct inode *object /* object to create dot and
43216 +                                        * dotdot for */ ,
43217 +                 struct inode *parent /* parent of @object */ )
43218 +{
43219 +       int result;
43220 +       struct dentry dots_entry;
43221 +       reiser4_dir_entry_desc entry;
43222 +
43223 +       assert("nikita-688", object != NULL);
43224 +       assert("nikita-689", S_ISDIR(object->i_mode));
43225 +       assert("nikita-691", parent != NULL);
43226 +       trace_stamp(TRACE_DIR);
43227 +
43228 +       /* We store dot and dotdot as normal directory entries. This is
43229 +          not necessary, because almost all information stored in them
43230 +          is already in the stat-data of directory, the only thing
43231 +          being missed is objectid of grand-parent directory that can
43232 +          easily be added there as extension.
43233 +
43234 +          But it is done the way it is done, because not storing dot
43235 +          and dotdot will lead to the following complications:
43236 +
43237 +          . special case handling in ->lookup().
43238 +          . addition of another extension to the sd.
43239 +          . dependency on key allocation policy for stat data.
43240 +
43241 +       */
43242 +
43243 +       xmemset(&entry, 0, sizeof entry);
43244 +       xmemset(&dots_entry, 0, sizeof dots_entry);
43245 +       entry.obj = dots_entry.d_inode = object; /* "." is bound to the directory itself */
43246 +       dots_entry.d_name.name = ".";
43247 +       dots_entry.d_name.len = 1;
43248 +       result = add_entry_hashed(object, &dots_entry, NULL, &entry);
43249 +       reiser4_free_dentry_fsdata(&dots_entry);
43250 +
43251 +       if (result == 0) {
43252 +               result = reiser4_add_nlink(object, object, 0);
43253 +               if (result == 0) {
43254 +                       entry.obj = dots_entry.d_inode = parent; /* ".." is bound to @parent; dots_entry is reused */
43255 +                       dots_entry.d_name.name = "..";
43256 +                       dots_entry.d_name.len = 2;
43257 +                       result = add_entry_hashed(object,
43258 +                                                 &dots_entry, NULL, &entry);
43259 +                       reiser4_free_dentry_fsdata(&dots_entry);
43260 +                       /* if creation of ".." failed, iput() will delete
43261 +                          object with ".". */
43262 +                       if (result == 0) {
43263 +                               result = reiser4_add_nlink(parent, object, 0);
43264 +                               if (result != 0)
43265 +                                       /*
43266 +                                        * if we failed to bump i_nlink, try
43267 +                                        * to remove ".."
43268 +                                        */
43269 +                                       detach_hashed(object, parent);
43270 +                       }
43271 +               }
43272 +       }
43273 +
43274 +       if (result != 0) {
43275 +               /*
43276 +                * in the case of error, at least update stat-data so that,
43277 +                * ->i_nlink updates are not lingering.
43278 +                */
43279 +               reiser4_update_sd(object);
43280 +               reiser4_update_sd(parent);
43281 +       }
43282 +
43283 +       return result; /* 0, or the first error met above */
43284 +}
43285 +
43286 +/* looks for name specified in @dentry in directory @parent and if name is
43287 +   found - key of the object the entry points to is stored in @key. Returns 0 on success (@key is valid), error code otherwise. */
43288 +static int
43289 +lookup_name_hashed(struct inode *parent /* inode of directory to lookup for
43290 +                                        * name in */,
43291 +                  struct dentry *dentry /* name to look for */,
43292 +                  reiser4_key *key /* place to store key */)
43293 +{
43294 +       int result;
43295 +       coord_t *coord;
43296 +       lock_handle lh;
43297 +       const char *name;
43298 +       int len;
43299 +       reiser4_dir_entry_desc entry;
43300 +       reiser4_dentry_fsdata *fsdata;
43301 +
43302 +       assert("nikita-1247", parent != NULL);
43303 +       assert("nikita-1248", dentry != NULL);
43304 +       assert("nikita-1123", dentry->d_name.name != NULL);
43305 +       assert("vs-1486",
43306 +              dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
43307 +       /* a failed permission check must be propagated: returning 0 here would tell callers that @key was filled in */
43308 +       result = perm_chk(parent, lookup, parent, dentry);
43309 +       if (result != 0)
43310 +               return result;
43311 +
43312 +       name = dentry->d_name.name;
43313 +       len = dentry->d_name.len;
43314 +
43315 +       if (!is_name_acceptable(parent, name, len))
43316 +               /* some arbitrary error code to return */
43317 +               return RETERR(-ENAMETOOLONG);
43318 +
43319 +       fsdata = reiser4_get_dentry_fsdata(dentry);
43320 +       if (IS_ERR(fsdata))
43321 +               return PTR_ERR(fsdata);
43322 +
43323 +       coord = &fsdata->dec.entry_coord; /* NOTE(review): presumably find_entry() leaves the entry position here - confirm */
43324 +       coord_clear_iplug(coord);
43325 +       init_lh(&lh);
43326 +
43327 +       ON_TRACE(TRACE_DIR | TRACE_VFS_OPS, "lookup inode: %lli \"%s\"\n", get_inode_oid(parent), dentry->d_name.name);
43328 +
43329 +       /* find entry in a directory. This is plugin method. */
43330 +       result = find_entry(parent, dentry, &lh, ZNODE_READ_LOCK, &entry);
43331 +       if (result == 0) {
43332 +               /* entry was found, extract object key from it. */
43333 +               result = WITH_COORD(coord, item_plugin_by_coord(coord)->s.dir.extract_key(coord, key));
43334 +       }
43335 +       done_lh(&lh); /* lock handle is released on both success and failure */
43336 +       return result;
43337 +
43338 +}
43339 +
43340 +/*
43341 + * helper for ->lookup() and ->get_parent() methods: if @inode is a
43342 + * light-weight file, copy uid and gid from @parent, because in this case
43343 + * they are not stored in the stat-data
43344 + */
43345 +static void
43346 +check_light_weight(struct inode *inode, struct inode *parent)
43347 +{
43348 +       if (inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) {
43349 +               inode->i_uid = parent->i_uid;
43350 +               inode->i_gid = parent->i_gid;
43351 +               /* clear light-weight flag, so that if inode is later reached
43352 +                  by any other name, its [ug]id do not change. */
43353 +               inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
43354 +       }
43355 +}
43356 +
43357 +/* implementation of ->lookup() method for hashed directories. Returns 0 on success; when the name is not found (-ENOENT) the result of lookup_pseudo_file() is returned instead. */
43358 +reiser4_internal int
43359 +lookup_hashed(struct inode * parent    /* inode of directory to
43360 +                                        * lookup into */ ,
43361 +             struct dentry **dentryloc /* in: name to look for; out: on success replaced with result of d_splice_alias() */ )
43362 +{
43363 +       int result;
43364 +       struct inode *inode;
43365 +       struct dentry *dentry;
43366 +       reiser4_dir_entry_desc entry; /* only ->key is used, as storage for the looked up key */
43367 +
43368 +       dentry = *dentryloc;
43369 +       /* set up operations on dentry. */
43370 +       dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
43371 +
43372 +       result = lookup_name_hashed(parent, dentry, &entry.key);
43373 +       if (result == 0) {
43374 +               inode = reiser4_iget(parent->i_sb, &entry.key, 0);
43375 +               if (!IS_ERR(inode)) {
43376 +                       check_light_weight(inode, parent);
43377 +                       /* success */
43378 +                       *dentryloc = d_splice_alias(inode, dentry);
43379 +                       reiser4_iget_complete(inode);
43380 +               } else
43381 +                       result = PTR_ERR(inode);
43382 +       } else if (result == -ENOENT) /* no such directory entry: try pseudo files */
43383 +               result = lookup_pseudo_file(parent, dentryloc);
43384 +
43385 +       return result;
43386 +}
43387 +
43388 +/*
43389 + * ->get_parent() method of hashed directory. This is used by NFS kernel
43390 + * server to "climb" up directory tree to check permissions. Looks up dotdot in @child; returns dentry from d_alloc_anon() or ERR_PTR().
43391 + */
43392 +reiser4_internal struct dentry *
43393 +get_parent_hashed(struct inode *child)
43394 +{
43395 +       struct super_block *s;
43396 +       struct inode  *parent;
43397 +       struct dentry  dotdot;
43398 +       struct dentry *dentry;
43399 +       reiser4_key key;
43400 +       int         result;
43401 +
43402 +       /*
43403 +        * lookup dotdot entry, using a temporary dentry on the stack.
43404 +        */
43405 +
43406 +       s = child->i_sb;
43407 +       memset(&dotdot, 0, sizeof(dotdot));
43408 +       dotdot.d_name.name = "..";
43409 +       dotdot.d_name.len = 2;
43410 +       dotdot.d_op = &get_super_private(s)->ops.dentry;
43411 +
43412 +       result = lookup_name_hashed(child, &dotdot, &key);
43413 +       if (result != 0)
43414 +               return ERR_PTR(result);
43415 +
43416 +       parent = reiser4_iget(s, &key, 1);
43417 +       if (!IS_ERR(parent)) {
43418 +               /*
43419 +                * FIXME-NIKITA dubious: attributes are inherited from @child
43420 +                * to @parent. But:
43421 +                *
43422 +                *     (*) this is the only thing we can do
43423 +                *
43424 +                *     (*) attributes of light-weight object are inherited
43425 +                *     from a parent through which object was looked up first,
43426 +                *     so it is ambiguous anyway.
43427 +                *
43428 +                */
43429 +               check_light_weight(parent, child);
43430 +               reiser4_iget_complete(parent);
43431 +               dentry = d_alloc_anon(parent);
43432 +               if (dentry == NULL) {
43433 +                       iput(parent); /* drop the inode reference, no dentry took it over */
43434 +                       dentry = ERR_PTR(RETERR(-ENOMEM));
43435 +               } else
43436 +                       dentry->d_op = &get_super_private(s)->ops.dentry;
43437 +       } else if (PTR_ERR(parent) == -ENOENT) /* map -ENOENT from reiser4_iget() to -ESTALE */
43438 +               dentry = ERR_PTR(RETERR(-ESTALE));
43439 +       else
43440 +               dentry = (void *)parent; /* @parent already is an ERR_PTR() value: pass it on */
43441 +       return dentry;
43442 +}
43443 +
43444 +static const char *possible_leak = "Possible disk space leak."; /* common tail of warning() messages below */
43445 +
43446 +/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
43447 +   Adds a link to @to_inode, drops one from @from_inode. Returns 0 once the entry was updated, error code otherwise.
43448 +   Helper function called from rename_hashed() */
43449 +static int
43450 +replace_name(struct inode *to_inode    /* inode where @from_coord is
43451 +                                        * to be re-targeted at */ ,
43452 +            struct inode *from_dir     /* directory where @from_coord
43453 +                                        * lives */ ,
43454 +            struct inode *from_inode   /* inode @from_coord
43455 +                                        * originally points to */ ,
43456 +            coord_t * from_coord       /* where directory entry is in
43457 +                                        * the tree */ ,
43458 +            lock_handle * from_lh /* lock handle on @from_coord */ )
43459 +{
43460 +       item_plugin *from_item;
43461 +       int result;
43462 +       znode *node;
43463 +
43464 +       coord_clear_iplug(from_coord);
43465 +       node = from_coord->node;
43466 +       result = zload(node); /* paired with zrelse() on every exit path below */
43467 +       if (result != 0)
43468 +               return result;
43469 +       from_item = item_plugin_by_coord(from_coord);
43470 +       if (item_type_by_coord(from_coord) == DIR_ENTRY_ITEM_TYPE) {
43471 +               reiser4_key to_key;
43472 +
43473 +               build_sd_key(to_inode, &to_key);
43474 +
43475 +               /* everything is found and prepared to change directory entry
43476 +                  at @from_coord to point to @to_inode.
43477 +
43478 +                  @to_inode is just about to get new name, so bump its link
43479 +                  counter.
43480 +
43481 +               */
43482 +               result = reiser4_add_nlink(to_inode, from_dir, 0);
43483 +               if (result != 0) {
43484 +                       /* Don't issue warning: this may be plain -EMLINK */
43485 +                       zrelse(node);
43486 +                       return result;
43487 +               }
43488 +
43489 +               result = from_item->s.dir.update_key(from_coord, &to_key, from_lh);
43490 +               if (result != 0) {
43491 +                       reiser4_del_nlink(to_inode, from_dir, 0); /* undo the link added above */
43492 +                       zrelse(node);
43493 +                       return result;
43494 +               }
43495 +
43496 +               /* @from_inode just lost its name.
43497 +
43498 +                  If @from_inode was directory, it contained dotdot pointing
43499 +                  to @from_dir. @from_dir i_nlink will be decreased when
43500 +                  iput() will be called on @from_inode.
43501 +
43502 +                  If file-system is not ADG (hard-links are
43503 +                  supported on directories), iput(from_inode) will not remove
43504 +                  @from_inode, and thus above is incorrect, but hard-links on
43505 +                  directories are problematic in many other respects.
43506 +               */
43507 +               result = reiser4_del_nlink(from_inode, from_dir, 0);
43508 +               if (result != 0) {
43509 +                       warning("nikita-2330",
43510 +                               "Cannot remove link from source: %i. %s",
43511 +                               result, possible_leak);
43512 +               }
43513 +               /* Has to return success, because entry is already
43514 +                * modified. */
43515 +               result = 0;
43516 +
43517 +               /* NOTE-NIKITA consider calling plugin method instead of
43518 +                  accessing inode fields directly. */
43519 +               from_dir->i_mtime = CURRENT_TIME;
43520 +       } else {
43521 +               warning("nikita-2326", "Unexpected item type");
43522 +               print_plugin("item", item_plugin_to_plugin(from_item));
43523 +               result = RETERR(-EIO);
43524 +       }
43525 +       zrelse(node);
43526 +       return result;
43527 +}
43528 +
43529 +/* add new entry pointing to @inode into @dir at @coord, locked by @lh
43530 +   Adds a link to @inode first and drops it again if insertion fails; on success increments @dir i_size. Returns result of ->add_entry().
43531 +   Helper function used by rename_hashed(). */
43532 +static int
43533 +add_name(struct inode *inode   /* inode the new entry will
43534 +                                * point to */ ,
43535 +        struct inode *dir /* directory where @coord lives */ ,
43536 +        struct dentry *name /* new name */ ,
43537 +        coord_t * coord /* where directory entry is in the tree */ ,
43538 +        lock_handle * lh /* lock handle on @coord */ ,
43539 +        int is_dir /* true, if @inode is directory */ )
43540 +{
43541 +       int result;
43542 +       reiser4_dir_entry_desc entry;
43543 +
43544 +       assert("nikita-2333", lh->node == coord->node);
43545 +       assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
43546 +
43547 +       xmemset(&entry, 0, sizeof entry);
43548 +       entry.obj = inode;
43549 +       /* build key of directory entry description */
43550 +       inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
43551 +
43552 +       /* ext2 does this in different order: first inserts new entry,
43553 +          then increases directory nlink. We don't want to do this,
43554 +          because reiser4_add_nlink() calls ->add_link() plugin
43555 +          method that can fail for whatever reason, leaving us with
43556 +          cleanup problems.
43557 +       */
43558 +       /* @inode is getting new name. NOTE(review): result of reiser4_add_nlink() is not checked here, although the comment above says it can fail - confirm this is intended */
43559 +       reiser4_add_nlink(inode, dir, 0);
43560 +       /* create @name in @dir pointing to
43561 +          @inode */
43562 +       result = WITH_COORD(coord,
43563 +                           inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
43564 +                                                                       coord,
43565 +                                                                       lh,
43566 +                                                                       name,
43567 +                                                                       &entry));
43568 +       if (result != 0) {
43569 +               int result2; /* kept separate so that the add_entry() error is what gets returned */
43570 +               result2 = reiser4_del_nlink(inode, dir, 0);
43571 +               if (result2 != 0) {
43572 +                       warning("nikita-2327", "Cannot drop link on %lli %i. %s",
43573 +                               get_inode_oid(inode),
43574 +                               result2, possible_leak);
43575 +               }
43576 +       } else
43577 +               INODE_INC_FIELD(dir, i_size);
43578 +       return result;
43579 +}
43580 +
43581 +static reiser4_block_nr /* estimate that hashed_rename_estimate_and_grab() passes to reiser4_grab_space() */
43582 +hashed_estimate_rename(
43583 +       struct inode  *old_dir  /* directory where @old is located */,
43584 +       struct dentry *old_name /* old name */,
43585 +       struct inode  *new_dir  /* directory where @new is located */,
43586 +       struct dentry *new_name /* new name */)
43587 +{
43588 +       reiser4_block_nr res1, res2; /* res1: replace_name() path, res2: add_name() path */
43589 +       dir_plugin *p_parent_old, *p_parent_new;
43590 +       file_plugin *p_child_old, *p_child_new;
43591 +
43592 +       assert("vpf-311", old_dir != NULL);
43593 +       assert("vpf-312", new_dir != NULL);
43594 +       assert("vpf-313", old_name != NULL);
43595 +       assert("vpf-314", new_name != NULL);
43596 +
43597 +       p_parent_old = inode_dir_plugin(old_dir);
43598 +       p_parent_new = inode_dir_plugin(new_dir);
43599 +       p_child_old = inode_file_plugin(old_name->d_inode);
43600 +       if (new_name->d_inode)
43601 +               p_child_new = inode_file_plugin(new_name->d_inode);
43602 +       else
43603 +               p_child_new = 0; /* @new_name has no inode: its updates are left out of the estimate */
43604 +
43605 +       /* find_entry - can insert one leaf. */
43606 +       res1 = res2 = 1;
43607 +
43608 +       /* replace_name */
43609 +       {
43610 +               /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */
43611 +               res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
43612 +               /* update key */
43613 +               res1 += 1;
43614 +               /* reiser4_del_nlink(p_child_new) */
43615 +               if (p_child_new)
43616 +                   res1 += p_child_new->estimate.update(new_name->d_inode);
43617 +       }
43618 +
43619 +       /* else add_name */
43620 +       {
43621 +               /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */
43622 +               res2 += 2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
43623 +               /* reiser4_add_nlink(p_parent_old) */
43624 +               res2 += p_child_old->estimate.update(old_name->d_inode);
43625 +               /* add_entry(p_parent_new) */
43626 +               res2 += p_parent_new->estimate.add_entry(new_dir);
43627 +               /* reiser4_del_nlink(p_parent_old) */
43628 +               res2 += p_child_old->estimate.update(old_name->d_inode);
43629 +       }
43630 +
43631 +       res1 = res1 < res2 ? res2 : res1; /* the two paths are alternatives: keep the larger estimate */
43632 +
43633 +
43634 +       /* reiser4_write_sd(p_parent_new) */
43635 +       res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43636 +
43637 +       /* reiser4_write_sd(p_child_new) */
43638 +       if (p_child_new)
43639 +           res1 += p_child_new->estimate.update(new_name->d_inode);
43640 +
43641 +       /* hashed_rem_entry(p_parent_old) */
43642 +       res1 += p_parent_old->estimate.rem_entry(old_dir);
43643 +
43644 +       /* reiser4_del_nlink(p_child_old) */
43645 +       res1 += p_child_old->estimate.update(old_name->d_inode);
43646 +
43647 +       /* replace_name (second call, presumably the dotdot update of a renamed directory) */
43648 +       {
43649 +           /* reiser4_add_nlink(p_parent_dir_new) */
43650 +           res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43651 +           /* update_key */
43652 +           res1 += 1;
43653 +           /* reiser4_del_nlink(p_parent_new) */
43654 +           res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43655 +           /* reiser4_del_nlink(p_parent_old) */
43656 +           res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43657 +       }
43658 +
43659 +       /* reiser4_write_sd(p_parent_old) */
43660 +       res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43661 +
43662 +       /* reiser4_write_sd(p_child_old) */
43663 +       res1 += p_child_old->estimate.update(old_name->d_inode);
43664 +
43665 +       return res1;
43666 +}
43667 +
43668 +static int /* 0 if the estimated space was grabbed, -ENOSPC otherwise */
43669 +hashed_rename_estimate_and_grab(
43670 +       struct inode *old_dir /* directory where @old is located */ ,
43671 +       struct dentry *old_name /* old name */ ,
43672 +       struct inode *new_dir /* directory where @new is located */ ,
43673 +       struct dentry *new_name /* new name */ )
43674 +{
43675 +       reiser4_block_nr reserve;
43676 +
43677 +       reserve = hashed_estimate_rename(old_dir, old_name, new_dir, new_name);
43678 +
43679 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) /* any non-zero result is reported as -ENOSPC */
43680 +               return RETERR(-ENOSPC);
43681 +
43682 +       return 0;
43683 +}
43684 +
43685 +/* check whether @old_inode can get a name in @new_dir and whether @new_inode (if any) can lose its link.
43686 + * This singles out attempts to rename pseudo-files, for example. Returns 0 or a negative error code; @old_dir is not examined. */
43687 +static int
43688 +can_rename(struct inode *old_dir, struct inode *old_inode,
43689 +          struct inode *new_dir, struct inode *new_inode)
43690 +{
43691 +       file_plugin *fplug;
43692 +       dir_plugin  *dplug;
43693 +
43694 +       assert("nikita-3370", old_inode != NULL);
43695 +
43696 +       dplug = inode_dir_plugin(new_dir);
43697 +       fplug = inode_file_plugin(old_inode);
43698 +
43699 +       if (dplug == NULL) /* @new_dir has no directory plugin */
43700 +               return RETERR(-ENOTDIR);
43701 +       else if (dplug->create_child == NULL) /* @new_dir plugin has no ->create_child() method */
43702 +               return RETERR(-EPERM);
43703 +       else if (!fplug->can_add_link(old_inode)) /* @old_inode cannot take one more link */
43704 +               return RETERR(-EMLINK);
43705 +       else if (new_inode != NULL) {
43706 +               fplug = inode_file_plugin(new_inode);
43707 +               if (fplug->can_rem_link != NULL && /* ->can_rem_link() is optional */
43708 +                   !fplug->can_rem_link(new_inode))
43709 +                       return RETERR(-EBUSY);
43710 +       }
43711 +       return 0;
43712 +}
43713 +
43714 +/* ->rename directory plugin method implementation for hashed directories.
43715 +   plugin->u.dir.rename
43716 +   See comments in the body.
43717 +
43718 +   It is arguable that this function can be made generic so, that it will be
43719 +   applicable to any kind of directory plugin that deals with directories
43720 +   composed out of directory entries. The only obstacle here is that we don't
43721 +   have any data-type to represent directory entry. This should be
43722 +   re-considered when more than one different directory plugin will be
43723 +   implemented.
43724 +*/
43725 +reiser4_internal int
43726 +rename_hashed(struct inode *old_dir /* directory where @old is located */ ,
43727 +             struct dentry *old_name /* old name */ ,
43728 +             struct inode *new_dir /* directory where @new is located */ ,
43729 +             struct dentry *new_name /* new name */ )
43730 +{
43731 +       /* From `The Open Group Base Specifications Issue 6'
43732 +
43733 +
43734 +          If either the old or new argument names a symbolic link, rename()
43735 +          shall operate on the symbolic link itself, and shall not resolve
43736 +          the last component of the argument. If the old argument and the new
43737 +          argument resolve to the same existing file, rename() shall return
43738 +          successfully and perform no other action.
43739 +
43740 +          [this is done by VFS: vfs_rename()]
43741 +
43742 +
43743 +          If the old argument points to the pathname of a file that is not a
43744 +          directory, the new argument shall not point to the pathname of a
43745 +          directory.
43746 +
43747 +          [checked by VFS: vfs_rename->may_delete()]
43748 +
43749 +                     If the link named by the new argument exists, it shall
43750 +          be removed and old renamed to new. In this case, a link named new
43751 +          shall remain visible to other processes throughout the renaming
43752 +          operation and refer either to the file referred to by new or old
43753 +          before the operation began.
43754 +
43755 +          [we should assure this]
43756 +
43757 +                                      Write access permission is required for
43758 +          both the directory containing old and the directory containing new.
43759 +
43760 +          [checked by VFS: vfs_rename->may_delete(), may_create()]
43761 +
43762 +          If the old argument points to the pathname of a directory, the new
43763 +          argument shall not point to the pathname of a file that is not a
43764 +          directory.
43765 +
43766 +          [checked by VFS: vfs_rename->may_delete()]
43767 +
43768 +                     If the directory named by the new argument exists, it
43769 +          shall be removed and old renamed to new. In this case, a link named
43770 +          new shall exist throughout the renaming operation and shall refer
43771 +          either to the directory referred to by new or old before the
43772 +          operation began.
43773 +
43774 +          [we should assure this]
43775 +
43776 +                           If new names an existing directory, it shall be
43777 +          required to be an empty directory.
43778 +
43779 +          [we should check this]
43780 +
43781 +          If the old argument points to a pathname of a symbolic link, the
43782 +          symbolic link shall be renamed. If the new argument points to a
43783 +          pathname of a symbolic link, the symbolic link shall be removed.
43784 +
43785 +          The new pathname shall not contain a path prefix that names
43786 +          old. Write access permission is required for the directory
43787 +          containing old and the directory containing new. If the old
43788 +          argument points to the pathname of a directory, write access
43789 +          permission may be required for the directory named by old, and, if
43790 +          it exists, the directory named by new.
43791 +
43792 +          [checked by VFS: vfs_rename(), vfs_rename_dir()]
43793 +
43794 +          If the link named by the new argument exists and the file's link
43795 +          count becomes 0 when it is removed and no process has the file
43796 +          open, the space occupied by the file shall be freed and the file
43797 +          shall no longer be accessible. If one or more processes have the
43798 +          file open when the last link is removed, the link shall be removed
43799 +          before rename() returns, but the removal of the file contents shall
43800 +          be postponed until all references to the file are closed.
43801 +
43802 +          [iput() handles this, but we can do this manually, a la
43803 +          reiser4_unlink()]
43804 +
43805 +          Upon successful completion, rename() shall mark for update the
43806 +          st_ctime and st_mtime fields of the parent directory of each file.
43807 +
43808 +          [N/A]
43809 +
43810 +       */
43811 +
43812 +       int result;
43813 +       int is_dir;             /* is @old_name directory */
43814 +
43815 +       struct inode *old_inode;
43816 +       struct inode *new_inode;
43817 +
43818 +       reiser4_dir_entry_desc old_entry;
43819 +       reiser4_dir_entry_desc new_entry;
43820 +
43821 +       coord_t *new_coord;
43822 +
43823 +       reiser4_dentry_fsdata *new_fsdata;
43824 +
43825 +       lock_handle new_lh;
43826 +
43827 +       dir_plugin  *dplug;
43828 +       file_plugin *fplug;
43829 +
43830 +       assert("nikita-2318", old_dir != NULL);
43831 +       assert("nikita-2319", new_dir != NULL);
43832 +       assert("nikita-2320", old_name != NULL);
43833 +       assert("nikita-2321", new_name != NULL);
43834 +
43835 +       old_inode = old_name->d_inode;
43836 +       new_inode = new_name->d_inode;
43837 +
43838 +       dplug = inode_dir_plugin(old_dir);
43839 +       fplug = NULL;
43840 +
43841 +       new_fsdata = reiser4_get_dentry_fsdata(new_name);
43842 +       if (IS_ERR(new_fsdata))
43843 +               return PTR_ERR(new_fsdata);
43844 +
43845 +       new_coord = &new_fsdata->dec.entry_coord;
43846 +       coord_clear_iplug(new_coord);
43847 +
43848 +       is_dir = S_ISDIR(old_inode->i_mode);
43849 +
43850 +       assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
43851 +
43852 +       /* if target is existing directory and it's not empty---return error.
43853 +
43854 +          This check is done specifically, because is_dir_empty() requires
43855 +          tree traversal and have to be done before locks are taken.
43856 +       */
43857 +       if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0)
43858 +               return RETERR(-ENOTEMPTY);
43859 +
43860 +       result = can_rename(old_dir, old_inode, new_dir, new_inode);
43861 +       if (result != 0)
43862 +               return result;
43863 +
43864 +       result = hashed_rename_estimate_and_grab(old_dir, old_name,
43865 +                                                new_dir, new_name);
43866 +       if (result != 0)
43867 +           return result;
43868 +
43869 +       init_lh(&new_lh);
43870 +
43871 +       /* find entry for @new_name */
43872 +       result = find_entry(new_dir,
43873 +                           new_name, &new_lh, ZNODE_WRITE_LOCK, &new_entry);
43874 +
43875 +       if (IS_CBKERR(result)) {
43876 +               done_lh(&new_lh);
43877 +               return result;
43878 +       }
43879 +
43880 +       seal_done(&new_fsdata->dec.entry_seal);
43881 +
43882 +       /* add or replace name for @old_inode as @new_name */
43883 +       if (new_inode != NULL) {
43884 +               /* target (@new_name) exists. */
43885 +               /* Not clear what to do with objects that are
43886 +                  both directories and files at the same time. */
43887 +               if (result == CBK_COORD_FOUND) {
43888 +                       result = replace_name(old_inode,
43889 +                                             new_dir,
43890 +                                             new_inode,
43891 +                                             new_coord,
43892 +                                             &new_lh);
43893 +                       if (result == 0)
43894 +                               fplug = inode_file_plugin(new_inode);
43895 +               } else if (result == CBK_COORD_NOTFOUND) {
43896 +                       /* VFS told us that @new_name is bound to existing
43897 +                          inode, but we failed to find directory entry. */
43898 +                       warning("nikita-2324", "Target not found");
43899 +                       result = RETERR(-ENOENT);
43900 +               }
43901 +       } else {
43902 +               /* target (@new_name) doesn't exists. */
43903 +               if (result == CBK_COORD_NOTFOUND)
43904 +                       result = add_name(old_inode,
43905 +                                         new_dir,
43906 +                                         new_name,
43907 +                                         new_coord,
43908 +                                         &new_lh, is_dir);
43909 +               else if (result == CBK_COORD_FOUND) {
43910 +                       /* VFS told us that @new_name is "negative" dentry,
43911 +                          but we found directory entry. */
43912 +                       warning("nikita-2331", "Target found unexpectedly");
43913 +                       result = RETERR(-EIO);
43914 +               }
43915 +       }
43916 +
43917 +       assert("nikita-3462", ergo(result == 0,
43918 +                                  old_inode->i_nlink >= 2 + !!is_dir));
43919 +
43920 +       /* We are done with all modifications to the @new_dir, release lock on
43921 +          node. */
43922 +       done_lh(&new_lh);
43923 +
43924 +       if (fplug != NULL) {
43925 +               /* detach @new_inode from name-space */
43926 +               result = fplug->detach(new_inode, new_dir);
43927 +               if (result != 0)
43928 +                       warning("nikita-2330", "Cannot detach %lli: %i. %s",
43929 +                               get_inode_oid(new_inode), result, possible_leak);
43930 +       }
43931 +
43932 +       if (new_inode != NULL)
43933 +               reiser4_mark_inode_dirty(new_inode);
43934 +
43935 +       if (result == 0) {
43936 +               xmemset(&old_entry, 0, sizeof old_entry);
43937 +               old_entry.obj = old_inode;
43938 +
43939 +               dplug->build_entry_key(old_dir,
43940 +                                      &old_name->d_name, &old_entry.key);
43941 +
43942 +               /* At this stage new name was introduced for
43943 +                  @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
43944 +                  counters were updated.
43945 +
43946 +                  We want to remove @old_name now. If @old_inode wasn't
43947 +                  directory this is simple.
43948 +               */
43949 +               result = rem_entry_hashed(old_dir, old_name, &old_entry);
43950 +               if (result != 0 && result != -ENOMEM) {
43951 +                       warning("nikita-2335",
43952 +                               "Cannot remove old name: %i", result);
43953 +               } else {
43954 +                       result = reiser4_del_nlink(old_inode, old_dir, 0);
43955 +                       if (result != 0 && result != -ENOMEM) {
43956 +                               warning("nikita-2337",
43957 +                                       "Cannot drop link on old: %i", result);
43958 +                       }
43959 +               }
43960 +
43961 +               if (result == 0 && is_dir) {
43962 +                       /* @old_inode is directory. We also have to update
43963 +                          dotdot entry. */
43964 +                       coord_t *dotdot_coord;
43965 +                       lock_handle dotdot_lh;
43966 +                       struct dentry dotdot_name;
43967 +                       reiser4_dir_entry_desc dotdot_entry;
43968 +                       reiser4_dentry_fsdata  dataonstack;
43969 +                       reiser4_dentry_fsdata *fsdata;
43970 +
43971 +                       xmemset(&dataonstack, 0, sizeof dataonstack);
43972 +                       xmemset(&dotdot_entry, 0, sizeof dotdot_entry);
43973 +                       dotdot_entry.obj = old_dir;
43974 +                       xmemset(&dotdot_name, 0, sizeof dotdot_name);
43975 +                       dotdot_name.d_name.name = "..";
43976 +                       dotdot_name.d_name.len = 2;
43977 +                       /*
43978 +                        * allocate ->d_fsdata on the stack to avoid using
43979 +                        * reiser4_get_dentry_fsdata(). Locking is not needed,
43980 +                        * because dentry is private to the current thread.
43981 +                        */
43982 +                       dotdot_name.d_fsdata = &dataonstack;
43983 +                       init_lh(&dotdot_lh);
43984 +
43985 +                       fsdata = &dataonstack;
43986 +                       dotdot_coord = &fsdata->dec.entry_coord;
43987 +                       coord_clear_iplug(dotdot_coord);
43988 +
43989 +                       result = find_entry(old_inode, &dotdot_name, &dotdot_lh,
43990 +                                           ZNODE_WRITE_LOCK, &dotdot_entry);
43991 +                       if (result == 0) {
43992 +                               /* replace_name() decreases i_nlink on
43993 +                                * @old_dir */
43994 +                               result = replace_name(new_dir,
43995 +                                                     old_inode,
43996 +                                                     old_dir,
43997 +                                                     dotdot_coord,
43998 +                                                     &dotdot_lh);
43999 +                       } else
44000 +                               result = RETERR(-EIO);
44001 +                       done_lh(&dotdot_lh);
44002 +               }
44003 +       }
44004 +       reiser4_update_dir(new_dir);
44005 +       reiser4_update_dir(old_dir);
44006 +       reiser4_mark_inode_dirty(old_inode);
44007 +       if (result == 0) {
44008 +               file_plugin *fplug;
44009 +
44010 +               if (new_inode != NULL) {
44011 +                       /* add safe-link for target file (in case we removed
44012 +                        * last reference to the poor fellow */
44013 +                       fplug = inode_file_plugin(new_inode);
44014 +                       if (fplug->not_linked(new_inode))
44015 +                               result = safe_link_add(new_inode, SAFE_UNLINK);
44016 +               }
44017 +       }
44018 +       return result;
44019 +}
44020 +
44021 +/* ->add_entry() method for hashed directory object plugin
44022 +   (plugin->u.dir.add_entry): insert name @where into directory @object.
44023 +   Returns 0 on success, -EEXIST if the name is already present, -ENOSPC if space cannot be reserved, or another lookup/insertion error. */
44024 +reiser4_internal int
44025 +add_entry_hashed(struct inode *object  /* directory to add new name
44026 +                                        * in */ ,
44027 +                struct dentry *where /* new name */ ,
44028 +                reiser4_object_create_data * data UNUSED_ARG   /* parameters
44029 +                                                                * of new
44030 +                                                                * object */ ,
44031 +                reiser4_dir_entry_desc * entry /* parameters of new
44032 +                                                * directory entry */ )
44033 +{
44034 +       int result;
44035 +       coord_t *coord;
44036 +       lock_handle lh;
44037 +       reiser4_dentry_fsdata *fsdata;
44038 +       reiser4_block_nr       reserve;
44039 +
44040 +       assert("nikita-1114", object != NULL);
44041 +       assert("nikita-1250", where != NULL);
44042 +       /* per-dentry private data: it holds the coord, seal and position of the entry */
44043 +       fsdata = reiser4_get_dentry_fsdata(where);
44044 +       if (unlikely(IS_ERR(fsdata)))
44045 +               return PTR_ERR(fsdata);
44046 +       /* reserve as much space as the directory plugin estimates the insertion needs */
44047 +       reserve = inode_dir_plugin(object)->estimate.add_entry(object);
44048 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
44049 +               return RETERR(-ENOSPC);
44050 +
44051 +       init_lh(&lh);
44052 +       ON_TRACE(TRACE_DIR, "[%i]: creating \"%s\" in %llu\n", current->pid, where->d_name.name, get_inode_oid(object));
44053 +       coord = &fsdata->dec.entry_coord;
44054 +       coord_clear_iplug(coord);
44055 +
44056 +       /* look the name up under a write lock; -ENOENT is the expected outcome for a new name */
44057 +       result = find_entry(object, where, &lh, ZNODE_WRITE_LOCK, entry);
44058 +       if (likely(result == -ENOENT)) {
44059 +               /* name is free: add new entry. Just pass control to the directory
44060 +                  item plugin, handing it the coord and lock handle filled in by find_entry(). */
44061 +               assert("nikita-1709", inode_dir_item_plugin(object));
44062 +               assert("nikita-2230", coord->node == lh.node);
44063 +               seal_done(&fsdata->dec.entry_seal);
44064 +               result = inode_dir_item_plugin(object)->s.dir.add_entry(object, coord, &lh, where, entry);
44065 +               if (result == 0) {
44066 +                       adjust_dir_file(object, where, fsdata->dec.pos + 1, +1);
44067 +                       INODE_INC_FIELD(object, i_size); /* i_size is bumped once per added entry */
44068 +               }
44069 +       } else if (result == 0) { /* the name is already in the directory */
44070 +               assert("nikita-2232", coord->node == lh.node);
44071 +               result = RETERR(-EEXIST);
44072 +       }
44073 +       done_lh(&lh);
44074 +       /* any other lookup error is returned to the caller unchanged */
44075 +       return result;
44076 +}
44077 +
44078 +/* ->rem_entry() method for hashed directory object plugin
44079 +   (plugin->u.dir.rem_entry): remove name @where from directory @object.
44080 +   Returns 0 on success, -ENOSPC if space cannot be reserved, -EIO on a detected inconsistency, or another lookup/removal error. */
44081 +reiser4_internal int
44082 +rem_entry_hashed(struct inode *object  /* directory from which entry
44083 +                                        * is being removed */ ,
44084 +                struct dentry *where   /* name that is being
44085 +                                        * removed */ ,
44086 +                reiser4_dir_entry_desc * entry /* description of entry being
44087 +                                                * removed */ )
44088 +{
44089 +       int result;
44090 +       coord_t *coord;
44091 +       lock_handle lh;
44092 +       reiser4_dentry_fsdata *fsdata;
44093 +       __u64 tograb;
44094 +
44095 +       /* yes, nested function (GCC extension), so what? Sue me. It uses @object, @where, @coord, @lh and @entry of the enclosing call. */
44096 +       int rem_entry(void) {
44097 +               item_plugin *iplug;
44098 +               struct inode *child;
44099 +
44100 +               iplug = inode_dir_item_plugin(object);
44101 +               child = where->d_inode;
44102 +               assert("nikita-3399", child != NULL);
44103 +
44104 +               /* check that we are really destroying an entry for @child */
44105 +               if (REISER4_DEBUG) {
44106 +                       int result;
44107 +                       reiser4_key key;
44108 +
44109 +                       result = iplug->s.dir.extract_key(coord, &key);
44110 +                       if (result != 0)
44111 +                               return result;
44112 +                       if (get_key_objectid(&key) != get_inode_oid(child)) {
44113 +                               warning("nikita-3397",
44114 +                                       "rem_entry: %#llx != %#llx\n",
44115 +                                       get_key_objectid(&key),
44116 +                                       get_inode_oid(child));
44117 +                               return RETERR(-EIO);
44118 +                       }
44119 +               }
44120 +               return iplug->s.dir.rem_entry(object,
44121 +                                             &where->d_name, coord, &lh, entry);
44122 +       }
44123 +
44124 +       assert("nikita-1124", object != NULL);
44125 +       assert("nikita-1125", where != NULL);
44126 +       /* reserve as much space as the directory plugin estimates the removal needs */
44127 +       tograb = inode_dir_plugin(object)->estimate.rem_entry(object);
44128 +       result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED);
44129 +       if (result != 0)
44130 +               return RETERR(-ENOSPC);
44131 +
44132 +       init_lh(&lh);
44133 +       /* check for this entry in a directory. This is plugin method. */
44134 +       result = find_entry(object, where, &lh, ZNODE_WRITE_LOCK, entry);
44135 +       fsdata = reiser4_get_dentry_fsdata(where);
44136 +       if (IS_ERR(fsdata)) {
44137 +               /* find_entry() may have left a node locked through @lh: do not leak it */
44138 +               done_lh(&lh);
44139 +               return PTR_ERR(fsdata);
44140 +       }
44141 +       coord = &fsdata->dec.entry_coord;
44142 +       assert("nikita-3404",
44143 +              get_inode_oid(where->d_inode) != get_inode_oid(object) ||
44144 +              object->i_size <= 1);
44145 +
44146 +       coord_clear_iplug(coord);
44147 +       if (result == 0) {
44148 +               /* entry was found: remove it. Just pass control to the directory item
44149 +                  plugin (through the nested rem_entry() above). */
44150 +               assert("vs-542", inode_dir_item_plugin(object));
44151 +               seal_done(&fsdata->dec.entry_seal);
44152 +               adjust_dir_file(object, where, fsdata->dec.pos, -1);
44153 +               result = WITH_COORD(coord, rem_entry());
44154 +               if (result == 0) {
44155 +                       if (object->i_size >= 1)
44156 +                               INODE_DEC_FIELD(object, i_size);
44157 +                       else {
44158 +                               warning("nikita-2509", "Dir %llu is runt",
44159 +                                       get_inode_oid(object));
44160 +                               result = RETERR(-EIO);
44161 +                       }
44162 +                       write_current_logf(WRITE_TREE_LOG,
44163 +                                          "..de k %#llx %#llx %i %lli",
44164 +                                          get_inode_oid(where->d_inode),
44165 +                                          get_inode_oid(object),
44166 +                                          where->d_inode->i_nlink,
44167 +                                          where->d_inode->i_size);
44168 +                       assert("nikita-3405", where->d_inode->i_nlink != 1 ||
44169 +                              where->d_inode->i_size != 2 ||
44170 +                              inode_dir_plugin(where->d_inode) == NULL);
44171 +               }
44172 +       }
44173 +       done_lh(&lh);
44174 +
44175 +       return result;
44176 +}
44177 +
44178 +static int entry_actor(reiser4_tree * tree /* tree being scanned */ ,
44179 +                      coord_t * coord /* current coord */ ,
44180 +                      lock_handle * lh /* current lock handle */ ,
44181 +                      void *args /* argument to scan */ );
44182 +
44183 +/*
44184 + * argument package passed by find_entry() to entry_actor() to scan entries with identical keys.
44185 + */
44186 +typedef struct entry_actor_args {
44187 +       /* name we are looking for */
44188 +       const char *name;
44189 +       /* key of directory entry. entry_actor() scans through sequence of
44190 +        * items/units having the same key */
44191 +       reiser4_key *key;
44192 +       /* how many entries were visited by entry_actor() so far; it is incremented on every call. */
44193 +       int non_uniq;
44194 +#if REISER4_USE_COLLISION_LIMIT || REISER4_STATS
44195 +       /* scan limit: entry_actor() gives up once ->non_uniq exceeds it */
44196 +       int max_non_uniq;
44197 +#endif
44198 +       /* return parameter: set to true, if ->name wasn't found */
44199 +       int not_found;
44200 +       /* what type of lock to take when moving to the next node during
44201 +        * scan */
44202 +       znode_lock_mode mode;
44203 +
44204 +       /* last coord with a key equal to ->key that was visited during scan */
44205 +       coord_t last_coord;
44206 +       /* lock on the last node visited during scan, taken in ->mode */
44207 +       lock_handle last_lh;
44208 +       /* inode of directory being scanned */
44209 +       const struct inode *inode;
44210 +} entry_actor_args;
44211 +/* Run check_item() on @coord through WITH_COORD(); same result as check_item(): 0 iff the entry at @coord is named @name. */
44212 +static int
44213 +check_entry(const struct inode *dir, coord_t *coord, const struct qstr *name)
44214 +{
44215 +       return WITH_COORD(coord, check_item(dir, coord, name->name));
44216 +}
44217 +
44218 +/* Look for the name of dentry @de within directory @dir.
44219 +
44220 +   This is called during lookup, creation and removal of directory
44221 +   entries.
44222 +
44223 +   First calculate key that directory entry for the name would have (it is left in
44224 +   @entry->key). Search for this key in the tree. If such key is found, scan all items with
44225 +   the same key, checking name in each directory entry along the way. Returns 0 if the name was found, -ENOENT if the scan ended without a match, or another lookup error.
44226 +*/
44227 +static int
44228 +find_entry(struct inode *dir /* directory to scan */,
44229 +          struct dentry *de /* name to search for */,
44230 +          lock_handle * lh /* resulting lock handle */,
44231 +          znode_lock_mode mode /* required lock mode */,
44232 +          reiser4_dir_entry_desc * entry /* parameters of found directory
44233 +                                          * entry */)
44234 +{
44235 +       const struct qstr *name;
44236 +       seal_t *seal;
44237 +       coord_t *coord;
44238 +       int result;
44239 +       __u32 flags;
44240 +       de_location *dec;
44241 +       reiser4_dentry_fsdata *fsdata;
44242 +
44243 +       assert("nikita-1130", lh != NULL);
44244 +       assert("nikita-1128", dir != NULL);
44245 +
44246 +       name = &de->d_name;
44247 +       assert("nikita-1129", name != NULL);
44248 +
44249 +       /* dentry private data don't require lock, because dentry
44250 +          manipulations are protected by i_sem on parent.
44251 +
44252 +          This is not so for inodes, because there is no -the- parent in
44253 +          inode case.
44254 +       */
44255 +       fsdata = reiser4_get_dentry_fsdata(de);
44256 +       if (IS_ERR(fsdata))
44257 +               return PTR_ERR(fsdata);
44258 +       dec = &fsdata->dec;
44259 +       /* the coord and seal of the entry live in the dentry's private data */
44260 +       coord = &dec->entry_coord;
44261 +       coord_clear_iplug(coord);
44262 +       seal = &dec->entry_seal;
44263 +       /* compose key of directory entry for @name */
44264 +       inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
44265 +       /* first try the coord remembered under the seal by an earlier search, if any */
44266 +       if (seal_is_set(seal)) {
44267 +               /* check seal */
44268 +               result = seal_validate(seal, coord, &entry->key, LEAF_LEVEL,
44269 +                                      lh, FIND_EXACT, mode, ZNODE_LOCK_LOPRI);
44270 +               if (result == 0) {
44271 +                       /* key was found. Check that it is really item we are
44272 +                          looking for. Note that dec->pos is not updated on this path. */
44273 +                       result = check_entry(dir, coord, name);
44274 +                       if (result == 0)
44275 +                               return 0;
44276 +               }
44277 +       }
44278 +       flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
44279 +       /*
44280 +        * seal was not set or did not lead to the name: find place in the tree where directory item should be located.
44281 +        */
44282 +       result = object_lookup(dir,
44283 +                              &entry->key,
44284 +                              coord,
44285 +                              lh,
44286 +                              mode,
44287 +                              FIND_EXACT,
44288 +                              LEAF_LEVEL,
44289 +                              LEAF_LEVEL,
44290 +                              flags,
44291 +                              0/*ra_info*/);
44292 +       /* any result other than CBK_COORD_FOUND is returned unchanged, with dec->pos set to -1 */
44293 +       if (result == CBK_COORD_FOUND) {
44294 +               entry_actor_args arg;
44295 +
44296 +               /* fast path: no hash collisions. check_entry() is 0 on match, > 0 on a different name, < 0 on error */
44297 +               result = check_entry(dir, coord, name);
44298 +               if (result == 0) {
44299 +                       seal_init(seal, coord, &entry->key);
44300 +                       dec->pos = 0;
44301 +               } else if (result > 0) {
44302 +                       /* Same key but another name: iterate through all units with the same keys. */
44303 +                       arg.name = name->name;
44304 +                       arg.key = &entry->key;
44305 +                       arg.not_found = 0;
44306 +                       arg.non_uniq = 0;
44307 +#if REISER4_USE_COLLISION_LIMIT
44308 +                       arg.max_non_uniq = max_hash_collisions(dir);
44309 +                       assert("nikita-2851", arg.max_non_uniq > 1);
44310 +#endif
44311 +                       arg.mode = mode;
44312 +                       arg.inode = dir;
44313 +                       coord_init_zero(&arg.last_coord);
44314 +                       init_lh(&arg.last_lh);
44315 +
44316 +                       result = iterate_tree(tree_by_inode(dir), coord, lh,
44317 +                                             entry_actor, &arg, mode, 1);
44318 +                       /* if entry_actor() gave up (->not_found), or iterate_tree()
44319 +                          returned -E_NO_NEIGHBOR during scanning. */
44320 +                       if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
44321 +                               /* step back: hand the last coord and lock recorded by entry_actor() over to the caller and report -ENOENT */
44322 +                               done_lh(lh);
44323 +
44324 +                               result = zload(arg.last_coord.node);
44325 +                               if (result == 0) {
44326 +                                       coord_clear_iplug(&arg.last_coord);
44327 +                                       coord_dup(coord, &arg.last_coord);
44328 +                                       move_lh(lh, &arg.last_lh);
44329 +                                       result = RETERR(-ENOENT);
44330 +                                       zrelse(arg.last_coord.node);
44331 +                                       --arg.non_uniq;
44332 +                               }
44333 +                       }
44334 +
44335 +                       done_lh(&arg.last_lh);
44336 +                       if (result == 0)
44337 +                               seal_init(seal, coord, &entry->key);
44338 +                       /* dec->pos presumably is the ordinal of the entry among entries with the same key -- TODO confirm */
44339 +                       if (result == 0 || result == -ENOENT) {
44340 +                               assert("nikita-2580", arg.non_uniq > 0);
44341 +                               dec->pos = arg.non_uniq - 1;
44342 +                       }
44343 +               }
44344 +       } else
44345 +               dec->pos = -1;
44346 +       return result;
44347 +}
44348 +
44349 +/* Function passed by find_entry() to iterate_tree() to look for given name in the directory. Returns the result of check_item() (0 on match), 0 with ->not_found set once the key changes, or a negative error. */
44350 +static int
44351 +entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
44352 +           coord_t * coord /* current coord */ ,
44353 +           lock_handle * lh /* current lock handle */ ,
44354 +           void *entry_actor_arg /* argument to scan */ )
44355 +{
44356 +       reiser4_key unit_key;
44357 +       entry_actor_args *args;
44358 +
44359 +       assert("nikita-1131", tree != NULL);
44360 +       assert("nikita-1132", coord != NULL);
44361 +       assert("nikita-1133", entry_actor_arg != NULL);
44362 +
44363 +       args = entry_actor_arg;
44364 +       ++args->non_uniq;
44365 +#if REISER4_USE_COLLISION_LIMIT
44366 +       if (args->non_uniq > args->max_non_uniq) {
44367 +               args->not_found = 1;
44368 +               /* hash collision overflow: too many entries share this key. */
44369 +               return RETERR(-EBUSY);
44370 +       }
44371 +#endif
44372 +
44373 +       /*
44374 +        * did we just reach the end of the sequence of items/units with
44375 +        * identical keys? If so, the name is not in the directory.
44376 +        */
44377 +       if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
44378 +               assert("nikita-1791", keylt(args->key, unit_key_by_coord(coord, &unit_key)));
44379 +               args->not_found = 1;
44380 +               args->last_coord.between = AFTER_UNIT;
44381 +               return 0;
44382 +       }
44383 +       /* key still matches: remember this coord as the last one visited */
44384 +       coord_dup(&args->last_coord, coord);
44385 +       /*
44386 +        * did the scan just move to the next node?
44387 +        */
44388 +       if (args->last_lh.node != lh->node) {
44389 +               int lock_result;
44390 +
44391 +               /*
44392 +                * if so, release the previous node and lock the new one with the mode requested by the caller
44393 +                */
44394 +               done_lh(&args->last_lh);
44395 +               assert("nikita-1896", znode_is_any_locked(lh->node));
44396 +               lock_result = longterm_lock_znode(&args->last_lh, lh->node,
44397 +                                                 args->mode, ZNODE_LOCK_HIPRI);
44398 +               if (lock_result != 0)
44399 +                       return lock_result;
44400 +       }
44401 +       return check_item(args->inode, coord, args->name);
44402 +}
44403 +
44404 +/*
44405 + * Compare @name with the name stored in the directory entry at @coord.
44406 + * Returns 0 on match, 1 on mismatch, RETERR(-EIO) if the item at @coord is not a directory item of @dir.
44407 + */
44408 +static int
44409 +check_item(const struct inode *dir, const coord_t * coord, const char *name)
44410 +{
44411 +       char buf[DE_NAME_BUF_LEN];
44412 +       const char *stored;
44413 +       item_plugin *iplug = item_plugin_by_coord(coord);
44414 +
44415 +       if (iplug == NULL) {
44416 +               warning("nikita-1135", "Cannot get item plugin");
44417 +               print_coord("coord", coord, 1);
44418 +               return RETERR(-EIO);
44419 +       }
44420 +       if (item_id_by_coord(coord) != item_id_by_plugin(inode_dir_item_plugin(dir))) {
44421 +               /* the item at @coord is not of the kind this directory is built of */
44422 +               warning("nikita-1136", "Wrong item plugin");
44423 +               print_coord("coord", coord, 1);
44424 +               print_plugin("plugin", item_plugin_to_plugin(iplug));
44425 +               return RETERR(-EIO);
44426 +       }
44427 +       assert("nikita-1137", iplug->s.dir.extract_name);
44428 +
44429 +       ON_TRACE(TRACE_DIR, "[%i]: check_item: \"%s\", \"%s\" in %lli (%lli)\n",
44430 +                current->pid, name, iplug->s.dir.extract_name(coord, buf),
44431 +                get_inode_oid(dir), *znode_get_block(coord->node));
44432 +       /* Fetch the name stored in this entry and compare it with the one wanted.
44433 +          NOTE-NIKITA Here should go code for support of something like
44434 +          unicode, code tables, etc. */
44435 +       stored = iplug->s.dir.extract_name(coord, buf);
44436 +       /* collapse the strcmp() result to 0 (same name) or 1 (different name) */
44437 +       return strcmp(name, stored) != 0;
44438 +}
44439 +
44440 +/* Make Linus happy.
44441 +   Local variables:
44442 +   c-indentation-style: "K&R"
44443 +   mode-name: "LC"
44444 +   c-basic-offset: 8
44445 +   tab-width: 8
44446 +   fill-column: 120
44447 +   End:
44448 +*/
44449 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/dir/hashed_dir.h linux-2.6.8-rc3-a/fs/reiser4/plugin/dir/hashed_dir.h
44450 --- linux-2.6.8-rc3/fs/reiser4/plugin/dir/hashed_dir.h  1970-01-01 03:00:00.000000000 +0300
44451 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/dir/hashed_dir.h        2004-08-05 21:20:53.483576077 +0400
44452 @@ -0,0 +1,46 @@
44453 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
44454 + * reiser4/README */
44455 +
44456 +/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map
44457 +   file names to files. See hashed_dir.c */
44458 +
44459 +#if !defined( __HASHED_DIR_H__ )
44460 +#define __HASHED_DIR_H__
44461 +
44462 +#include "../../forward.h"
44463 +
44464 +#include <linux/fs.h>          /* for struct inode */
44465 +#include <linux/dcache.h>      /* for struct dentry */
44466 +
44467 +/* create sd for directory file. Create stat-data, dot, and dotdot. */
44468 +extern int init_hashed(struct inode *object, struct inode *parent, reiser4_object_create_data *);
44469 +extern int done_hashed(struct inode *object);
44470 +extern int detach_hashed(struct inode *object, struct inode *parent);
44471 +extern int owns_item_hashed(const struct inode *inode, const coord_t * coord);
44472 +extern int lookup_hashed(struct inode *inode, struct dentry **dentry);
44473 +extern int rename_hashed(struct inode *old_dir,
44474 +                        struct dentry *old_name, struct inode *new_dir, struct dentry *new_name);
44475 +extern int add_entry_hashed(struct inode *object,
44476 +                           struct dentry *where, reiser4_object_create_data *, reiser4_dir_entry_desc * entry);
44477 +extern int rem_entry_hashed(struct inode *object, struct dentry *where, reiser4_dir_entry_desc * entry);
44478 +extern reiser4_block_nr        estimate_rename_hashed(struct inode  *old_dir,
44479 +                                              struct dentry *old_name,
44480 +                                              struct inode  *new_dir,
44481 +                                              struct dentry *new_name);
44482 +extern reiser4_block_nr estimate_unlink_hashed(struct inode *parent,
44483 +                                              struct inode *object);
44484 +
44485 +extern struct dentry *get_parent_hashed(struct inode *child);
44486 +
44487 +/* __HASHED_DIR_H__ */
44488 +#endif
44489 +
44490 +/* Make Linus happy.
44491 +   Local variables:
44492 +   c-indentation-style: "K&R"
44493 +   mode-name: "LC"
44494 +   c-basic-offset: 8
44495 +   tab-width: 8
44496 +   fill-column: 120
44497 +   End:
44498 +*/
44499 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/dir/pseudo_dir.c linux-2.6.8-rc3-a/fs/reiser4/plugin/dir/pseudo_dir.c
44500 --- linux-2.6.8-rc3/fs/reiser4/plugin/dir/pseudo_dir.c  1970-01-01 03:00:00.000000000 +0300
44501 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/dir/pseudo_dir.c        2004-08-05 21:20:53.009676033 +0400
44502 @@ -0,0 +1,97 @@
44503 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
44504 +
44505 +/* Directory plugin for pseudo files that operate like a directory. */
44506 +
44507 +#include "../../debug.h"
44508 +#include "../../inode.h"
44509 +#include "../pseudo/pseudo.h"
44510 +#include "dir.h"
44511 +
44512 +#include <linux/fs.h>          /* for struct inode */
44513 +#include <linux/dcache.h>      /* for struct dentry */
44514 +
44515 +/* ->lookup() method for pseudo files: ask the pseudo plugin first, then lookup_pseudo_file(). */
44516 +reiser4_internal int lookup_pseudo(struct inode * parent, struct dentry **dentry)
44517 +{
44518 +       int rc;
44519 +       pseudo_plugin *plug;
44520 +
44521 +       plug = reiser4_inode_data(parent)->file_plugin_data.pseudo_info.plugin;
44522 +       assert("nikita-3222", plug->lookup != NULL);
44523 +
44524 +       /* the ->lookup() method of the pseudo plugin gets the first try */
44525 +       rc = plug->lookup(parent, dentry);
44526 +       if (rc != -ENOENT)
44527 +               return rc;
44528 +
44529 +       /* the plugin does not know the name: fall back to lookup_pseudo_file() */
44530 +       return lookup_pseudo_file(parent, dentry);
44531 +}
44532 +
44533 +
44534 +/* ->readdir() method for a pseudo file that behaves like a directory */
44535 +reiser4_internal int
44536 +readdir_pseudo(struct file *f, void *dirent, filldir_t filld)
44537 +{
44538 +       struct dentry *dentry = f->f_dentry;
44539 +       struct inode  *inode = dentry->d_inode;
44540 +       pseudo_plugin *pplug;
44541 +       ino_t ino;
44542 +       int pos;
44543 +
44544 +       pplug = reiser4_inode_data(inode)->file_plugin_data.pseudo_info.plugin;
44545 +
44546 +       /*
44547 +        * if pseudo plugin defines ->readdir() method---let it do the
44548 +        * actual work and pass its result back.
44549 +        */
44550 +       if (pplug->readdir != NULL)
44551 +               return pplug->readdir(f, dirent, filld);
44552 +
44553 +       /*
44554 +        * there is no ->readdir() method in the pseudo plugin: make sure
44555 +        * that at least dot and dotdot are returned to keep user-level
44556 +        * happy. A refusal by filld() ends the listing; 0 is returned anyway.
44557 +        */
44558 +       pos = f->f_pos;
44559 +
44560 +       if (pos == 0) {
44561 +               /* position 0: "." */
44562 +               ino = get_inode_oid(dentry->d_inode);
44563 +               if (filld(dirent, ".", 1, pos, ino, DT_DIR) < 0)
44564 +                       return 0;
44565 +               f->f_pos++;
44566 +               pos++;
44567 +       }
44568 +
44569 +       if (pos == 1) {
44570 +               /* position 1: "..", reported with parent_ino() of the dentry */
44571 +               ino = parent_ino(dentry);
44572 +               if (filld(dirent, "..", 2, pos, ino, DT_DIR) < 0)
44573 +                       return 0;
44574 +               f->f_pos++;
44575 +       }
44576 +
44577 +       /*
44578 +        * nothing (more) is emitted at any other position.
44579 +        */
44580 +       return 0;
44581 +}
44582 +
44583 +/* pseudo files are not serializable (currently). So, this ignores @child and just returns an
44584 + * error pointer. NOTE(review): ENOTSUPP is used here rather than EOPNOTSUPP -- confirm this is intended. */
44585 +reiser4_internal struct dentry *
44586 +get_parent_pseudo(struct inode *child)
44587 +{
44588 +       return ERR_PTR(RETERR(-ENOTSUPP));
44589 +}
44590 +
44591 +/* Make Linus happy.
44592 +   Local variables:
44593 +   c-indentation-style: "K&R"
44594 +   mode-name: "LC"
44595 +   c-basic-offset: 8
44596 +   tab-width: 8
44597 +   fill-column: 120
44598 +   End:
44599 +*/
44600 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/dir/pseudo_dir.h linux-2.6.8-rc3-a/fs/reiser4/plugin/dir/pseudo_dir.h
44601 --- linux-2.6.8-rc3/fs/reiser4/plugin/dir/pseudo_dir.h  1970-01-01 03:00:00.000000000 +0300
44602 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/dir/pseudo_dir.h        2004-08-05 21:20:53.429587464 +0400
44603 @@ -0,0 +1,29 @@
44604 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
44605 + * reiser4/README */
44606 +
44607 +/* Directory plugin for pseudo files. See pseudo_dir.c for details. */
44608 +
44609 +#if !defined( __PSEUDO_DIR_H__ )
44610 +#define __PSEUDO_DIR_H__
44611 +
44612 +#include "../../forward.h"
44613 +
44614 +#include <linux/fs.h>          /* for struct inode */
44615 +#include <linux/dcache.h>      /* for struct dentry */
44616 +
44617 +extern int lookup_pseudo(struct inode * parent, struct dentry **dentry);
44618 +extern int readdir_pseudo(struct file *f, void *dirent, filldir_t filld);
44619 +extern struct dentry *get_parent_pseudo(struct inode *child);
44620 +
44621 +/* __PSEUDO_DIR_H__ */
44622 +#endif
44623 +
44624 +/* Make Linus happy.
44625 +   Local variables:
44626 +   c-indentation-style: "K&R"
44627 +   mode-name: "LC"
44628 +   c-basic-offset: 8
44629 +   tab-width: 8
44630 +   fill-column: 120
44631 +   End:
44632 +*/
44633 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/disk_format/disk_format.c linux-2.6.8-rc3-a/fs/reiser4/plugin/disk_format/disk_format.c
44634 --- linux-2.6.8-rc3/fs/reiser4/plugin/disk_format/disk_format.c 1970-01-01 03:00:00.000000000 +0300
44635 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/disk_format/disk_format.c       2004-08-05 21:20:53.040669496 +0400
44636 @@ -0,0 +1,38 @@
44637 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44638 +
44639 +#include "../../debug.h"
44640 +#include "../plugin_header.h"
44641 +#include "disk_format40.h"
44642 +#include "disk_format.h"
44643 +#include "../plugin.h"
44644 +
44645 +/* table of disk layout plugins, indexed by disk_format_id; FORMAT40_ID is the only entry initialized here */
44646 +disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
44647 +       [FORMAT40_ID] = {
44648 +               .h = {
44649 +                       .type_id = REISER4_FORMAT_PLUGIN_TYPE,
44650 +                       .id = FORMAT40_ID,
44651 +                       .pops = NULL,
44652 +                       .label = "reiser40",
44653 +                       .desc = "standard disk layout for reiser40",
44654 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO,
44655 +               },
44656 +               .get_ready = get_ready_format40,
44657 +               .root_dir_key = root_dir_key_format40,
44658 +               .release = release_format40,
44659 +               .log_super = log_super_format40,
44660 +               .print_info = print_info_format40,
44661 +               .check_open = check_open_format40
44662 +       }
44663 +};
44664 +
44665 +/* Make Linus happy.
44666 +   Local variables:
44667 +   c-indentation-style: "K&R"
44668 +   mode-name: "LC"
44669 +   c-basic-offset: 8
44670 +   tab-width: 8
44671 +   fill-column: 120
44672 +   scroll-step: 1
44673 +   End:
44674 +*/
44675 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/disk_format/disk_format.h linux-2.6.8-rc3-a/fs/reiser4/plugin/disk_format/disk_format.h
44676 --- linux-2.6.8-rc3/fs/reiser4/plugin/disk_format/disk_format.h 1970-01-01 03:00:00.000000000 +0300
44677 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/disk_format/disk_format.h       2004-08-05 21:20:53.391595478 +0400
44678 @@ -0,0 +1,41 @@
44679 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44680 +
44681 +/* identifiers for disk layouts, they are also used as indexes in array of disk
44682 +   plugins */
44683 +
44684 +#if !defined( __REISER4_DISK_FORMAT_H__ )
44685 +#define __REISER4_DISK_FORMAT_H__
44686 +
44687 +typedef enum {
44688 +       /* standard reiser4 disk layout plugin id */
44689 +       FORMAT40_ID,
44690 +       LAST_FORMAT_ID  /* number of layout ids; used as the size of format_plugins[] */
44691 +} disk_format_id;
44692 +
44693 +/* __REISER4_DISK_FORMAT_H__ */
44694 +#endif
44695 +
44696 +/* Make Linus happy.
44697 +   Local variables:
44698 +   c-indentation-style: "K&R"
44699 +   mode-name: "LC"
44700 +   c-basic-offset: 8
44701 +   tab-width: 8
44702 +   fill-column: 120
44703 +   scroll-step: 1
44704 +   End:
44705 +*/
44706 +
44707 +
44708 +
44709 +
44710 +
44711 +
44712 +
44713 +
44714 +
44715 +
44716 +
44717 +
44718 +
44719 +
44720 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/disk_format/disk_format40.c linux-2.6.8-rc3-a/fs/reiser4/plugin/disk_format/disk_format40.c
44721 --- linux-2.6.8-rc3/fs/reiser4/plugin/disk_format/disk_format40.c       1970-01-01 03:00:00.000000000 +0300
44722 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/disk_format/disk_format40.c     2004-08-05 21:20:53.246626055 +0400
44723 @@ -0,0 +1,549 @@
44724 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44725 +
44726 +#include "../../debug.h"
44727 +#include "../../dformat.h"
44728 +#include "../../key.h"
44729 +#include "../node/node.h"
44730 +#include "../space/space_allocator.h"
44731 +#include "disk_format40.h"
44732 +#include "../plugin.h"
44733 +#include "../../txnmgr.h"
44734 +#include "../../jnode.h"
44735 +#include "../../tree.h"
44736 +#include "../../super.h"
44737 +#include "../../wander.h"
44738 +#include "../../diskmap.h"
44739 +#include "../../inode.h"
44740 +#include "../../ktxnmgrd.h"
44741 +#include "../../status_flags.h"
44742 +
44743 +#include <linux/types.h>       /* for __u??  */
44744 +#include <linux/fs.h>          /* for struct super_block  */
44745 +#include <linux/buffer_head.h>
44746 +
44747 +/* reiser 4.0 default disk layout */
44748 +
44749 +/* functions to access fields of format40_disk_super_block */
44750 +static __u64
44751 +get_format40_block_count(const format40_disk_super_block * sb)
44752 +{
44753 +       return d64tocpu(&sb->block_count);      /* number of blocks in the filesystem */
44754 +}
44755 +
44756 +static __u64
44757 +get_format40_free_blocks(const format40_disk_super_block * sb)
44758 +{
44759 +       return d64tocpu(&sb->free_blocks);      /* number of free blocks */
44760 +}
44761 +
44762 +static __u64
44763 +get_format40_root_block(const format40_disk_super_block * sb)
44764 +{
44765 +       return d64tocpu(&sb->root_block);       /* block number of the filesystem tree root */
44766 +}
44767 +
44768 +static __u16
44769 +get_format40_tree_height(const format40_disk_super_block * sb)
44770 +{
44771 +       return d16tocpu(&sb->tree_height);      /* height of the filesystem tree */
44772 +}
44773 +
44774 +static __u64
44775 +get_format40_file_count(const format40_disk_super_block * sb)
44776 +{
44777 +       return d64tocpu(&sb->file_count);       /* number of files in the filesystem */
44778 +}
44779 +
44780 +static __u64
44781 +get_format40_oid(const format40_disk_super_block * sb)
44782 +{
44783 +       return d64tocpu(&sb->oid);      /* smallest free objectid */
44784 +}
44785 +
44786 +static __u32
44787 +get_format40_mkfs_id(const format40_disk_super_block * sb)
44788 +{
44789 +       return d32tocpu(&sb->mkfs_id);  /* unique identifier of the filesystem */
44790 +}
44791 +
44792 +static __u64
44793 +get_format40_flags(const format40_disk_super_block * sb)
44794 +{
44795 +       return d64tocpu(&sb->flags);    /* flag bits; bit numbers are given by format40_flags */
44796 +}
44797 +
44798 +static format40_super_info *
44799 +get_sb_info(struct super_block *super)
44800 +{
44801 +       return &get_super_private(super)->u.format40;   /* format40 specific part of the private super info */
44802 +}
44803 +
44804 +static int
44805 +consult_diskmap(struct super_block *s)
44806 +{
44807 +       format40_super_info *info;
44808 +       journal_location    *jloc;
44809 +       /* set block numbers of journal footer, journal header and super block; always returns 0 */
44810 +       info = get_sb_info(s);
44811 +       jloc = &get_super_private(s)->jloc;
44812 +       /* Default format-specific locations, used if there is nothing in the
44813 +        * diskmap (which is consulted only with CONFIG_REISER4_BADBLOCKS) */
44814 +       jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
44815 +       jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
44816 +       info->loc.super = FORMAT40_OFFSET / s->s_blocksize;     /* byte offset converted to a block number */
44817 +#ifdef CONFIG_REISER4_BADBLOCKS
44818 +        reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
44819 +                                 &jloc->footer);
44820 +        reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
44821 +                                 &jloc->header);
44822 +        reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
44823 +                                 &info->loc.super);
44824 +#endif
44825 +       return 0;
44826 +}
44827 +
44828 +/* read the super block found at info->loc.super, check its magic and take the
44829 +   block counters from it. NOTE(review): despite the name no other super block
44830 +   copy is tried and journal header/footer (jf/jh) block numbers are not changed */
44831 +static struct buffer_head *
44832 +find_a_disk_format40_super_block(struct super_block *s)
44833 +{
44834 +       struct buffer_head *super_bh;
44835 +       format40_disk_super_block *disk_sb;
44836 +       format40_super_info *info;
44837 +
44838 +       assert("umka-487", s != NULL);
44839 +
44840 +       info = get_sb_info(s);
44841 +
44842 +       super_bh = sb_bread(s, info->loc.super);
44843 +       if (super_bh == NULL)
44844 +               return ERR_PTR(RETERR(-EIO));
44845 +       /* sizeof counts the terminating NUL of FORMAT40_MAGIC, so the on-disk magic has to match exactly */
44846 +       disk_sb = (format40_disk_super_block *) super_bh->b_data;
44847 +       if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) {
44848 +               brelse(super_bh);
44849 +               return ERR_PTR(RETERR(-EINVAL));
44850 +       }
44851 +       /* data blocks are accounted as total blocks minus free blocks */
44852 +       reiser4_set_block_count(s, d64tocpu(&disk_sb->block_count));
44853 +       reiser4_set_data_blocks(s, d64tocpu(&disk_sb->block_count) -
44854 +                               d64tocpu(&disk_sb->free_blocks));
44855 +       reiser4_set_free_blocks(s, (d64tocpu(&disk_sb->free_blocks)));
44856 +       /* on success the buffer is returned still held; the callers in this file brelse() it */
44857 +       return super_bh;
44858 +}
44859 +
44860 +/* find the most recent version of super block. This is called after journal is
44861 +   replayed. NOTE(review): s is tagged UNUSED_ARG although it is used below */
44862 +static struct buffer_head *
44863 +read_super_block(struct super_block *s UNUSED_ARG)
44864 +{
44865 +       /* Here the most recent superblock copy has to be read. However, as
44866 +          journal replay isn't complete, we are using
44867 +          find_a_disk_format40_super_block() function. */
44868 +       return find_a_disk_format40_super_block(s);
44869 +}
44870 +
44871 +static int
44872 +get_super_jnode(struct super_block *s)
44873 +{
44874 +       reiser4_super_info_data *sbinfo = get_super_private(s);
44875 +       jnode *sb_jnode;
44876 +       int ret;
44877 +       /* create a jnode for the super block location and load it; returns 0 or the jload() error */
44878 +       sb_jnode = alloc_io_head(&get_sb_info(s)->loc.super);
44879 +       /* NOTE(review): the alloc_io_head() result is used unchecked - confirm that it cannot be NULL */
44880 +       ret = jload(sb_jnode);
44881 +
44882 +       if (ret) {
44883 +               drop_io_head(sb_jnode);
44884 +               return ret;
44885 +       }
44886 +       /* pin the data before the jload() reference is released - presumably keeps it in memory, confirm */
44887 +       pin_jnode_data(sb_jnode);
44888 +       jrelse(sb_jnode);
44889 +       /* remembered for log_super_format40() and done_super_jnode() */
44890 +       sbinfo->u.format40.sb_jnode = sb_jnode;
44891 +
44892 +       return 0;
44893 +}
44894 +
44895 +static void
44896 +done_super_jnode(struct super_block *s)
44897 +{
44898 +       jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode;
44899 +       /* counterpart of get_super_jnode(): unpin and drop the super block jnode, if one was set up */
44900 +       if (sb_jnode) {
44901 +               unpin_jnode_data(sb_jnode);
44902 +               drop_io_head(sb_jnode);
44903 +       }
44904 +}
44905 +
44906 +typedef enum format40_init_stage {      /* last step completed by try_init_format40() */
44907 +       NONE_DONE = 0,          /* nothing completed yet */
44908 +       CONSULT_DISKMAP,        /* consult_diskmap() succeeded */
44909 +       FIND_A_SUPER,           /* find_a_disk_format40_super_block() succeeded */
44910 +       INIT_JOURNAL_INFO,      /* init_journal_info() succeeded */
44911 +       INIT_EFLUSH,            /* eflush_init_at() succeeded */
44912 +       INIT_STATUS,            /* reiser4_status_init() returned 0 or -EINVAL */
44913 +       JOURNAL_REPLAY,         /* reiser4_journal_replay() succeeded */
44914 +       READ_SUPER,             /* read_super_block() succeeded */
44915 +       KEY_CHECK,              /* key format check passed */
44916 +       INIT_OID,               /* oid_init_allocator() succeeded */
44917 +       INIT_TREE,              /* init_tree() succeeded */
44918 +       JOURNAL_RECOVER,        /* reiser4_journal_recover_sb_data() succeeded */
44919 +       INIT_SA,                /* sa_init_allocator() succeeded */
44920 +       INIT_JNODE,             /* NOTE(review): never stored by try_init_format40() */
44921 +       ALL_DONE                /* get_super_jnode() succeeded, initialization is complete */
44922 +} format40_init_stage;
44923 +
44924 +static int
44925 +try_init_format40(struct super_block *s, format40_init_stage *stage)
44926 +{
44927 +       int result;
44928 +       struct buffer_head *super_bh;
44929 +       reiser4_super_info_data *sbinfo;
44930 +       format40_disk_super_block  sb;
44931 +       /* FIXME-NIKITA ugly work-around: keep copy of on-disk super-block */
44932 +       format40_disk_super_block *sb_copy = &sb;
44933 +       tree_level height;
44934 +       reiser4_block_nr root_block;
44935 +       node_plugin *nplug;
44936 +
44937 +       cassert(sizeof sb == 512);      /* the on-disk super block must be exactly 512 bytes */
44938 +
44939 +       assert("vs-475", s != NULL);
44940 +       assert("vs-474", get_super_private(s));
44941 +
44942 +       /* initialize reiser4_super_info_data */
44943 +       sbinfo = get_super_private(s);
44944 +       /* *stage always names the last completed step, so that the caller can undo a partial initialization */
44945 +       *stage = NONE_DONE;
44946 +
44947 +       result = consult_diskmap(s);
44948 +       if (result)
44949 +               return result;
44950 +       *stage = CONSULT_DISKMAP;
44951 +
44952 +       super_bh = find_a_disk_format40_super_block(s);
44953 +       if (IS_ERR(super_bh))
44954 +               return PTR_ERR(super_bh);
44955 +       brelse(super_bh);
44956 +       *stage = FIND_A_SUPER;
44957 +
44958 +       /* map jnodes for journal control blocks (header, footer) to disk  */
44959 +       result = init_journal_info(s);
44960 +       if (result)
44961 +               return result;
44962 +       *stage = INIT_JOURNAL_INFO;
44963 +
44964 +       result = eflush_init_at(s);
44965 +       if (result)
44966 +               return result;
44967 +       *stage = INIT_EFLUSH;
44968 +
44969 +       /* ok, we are sure that filesystem format is a format40 format */
44970 +       /* Now check its state */
44971 +       result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR);
44972 +       if (result != 0 && result != -EINVAL)
44973 +               /* -EINVAL means there is no magic, so probably just old
44974 +                * fs. */
44975 +               return result;
44976 +       *stage = INIT_STATUS;
44977 +
44978 +       result = reiser4_status_query(NULL, NULL);
44979 +       if (result == REISER4_STATUS_MOUNT_WARN)
44980 +               printk("Warning, mounting filesystem with errors\n");
44981 +       if (result == REISER4_STATUS_MOUNT_RO) {
44982 +               printk("Warning, mounting filesystem with fatal errors, forcing read-only mount\n");
44983 +               /* FIXME: here we should actually enforce read-only mount,
44984 +                * only it is unsupported yet. */
44985 +       }
44986 +
44987 +       result = reiser4_journal_replay(s);
44988 +       if (result)
44989 +               return result;
44990 +       *stage = JOURNAL_REPLAY;
44991 +
44992 +       super_bh = read_super_block(s);
44993 +       if (IS_ERR(super_bh))
44994 +               return PTR_ERR(super_bh);
44995 +       *stage = READ_SUPER;
44996 +       /* work on a private copy of the super block, so that the buffer can be released right away */
44997 +       xmemcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data), sizeof (*sb_copy));
44998 +       brelse(super_bh);
44999 +       /* the FORMAT40_LARGE_KEYS flag on disk has to agree with the REISER4_LARGE_KEY build setting */
45000 +       if (!equi(REISER4_LARGE_KEY,
45001 +                 get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) {
45002 +               warning("nikita-3228", "Key format mismatch. "
45003 +                       "Only %s keys are supported.",
45004 +                       REISER4_LARGE_KEY ? "large" : "small");
45005 +               return RETERR(-EINVAL);
45006 +       }
45007 +       *stage = KEY_CHECK;
45008 +
45009 +       result = oid_init_allocator(s, get_format40_file_count(sb_copy), get_format40_oid(sb_copy));
45010 +       if (result)
45011 +               return result;
45012 +       *stage = INIT_OID;
45013 +
45014 +       /* get things necessary to init reiser4_tree */
45015 +       root_block = get_format40_root_block(sb_copy);
45016 +       height = get_format40_tree_height(sb_copy);
45017 +       nplug = node_plugin_by_id(NODE40_ID);
45018 +
45019 +       sbinfo->tree.super = s;
45020 +       /* init reiser4_tree for the filesystem */
45021 +       result = init_tree(&sbinfo->tree, &root_block, height, nplug);
45022 +       if (result)
45023 +               return result;
45024 +       *stage = INIT_TREE;
45025 +
45026 +       /* initialize reiser4_super_info_data */
45027 +       sbinfo->default_uid = 0;
45028 +       sbinfo->default_gid = 0;
45029 +
45030 +       reiser4_set_mkfs_id(s, get_format40_mkfs_id(sb_copy));
45031 +       reiser4_set_block_count(s, get_format40_block_count(sb_copy));
45032 +       reiser4_set_free_blocks(s, get_format40_free_blocks(sb_copy));
45033 +
45034 +       sbinfo->fsuid = 0;
45035 +       sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories
45036 +                                                        * are not supported */
45037 +       sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN);     /* all nodes in
45038 +                                                                * layout 40 are
45039 +                                                                * of one
45040 +                                                                * plugin */
45041 +       /* sbinfo->tmgr is initialized already */
45042 +
45043 +       /* recover sb data which were logged separately from sb block */
45044 +
45045 +       /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
45046 +        * oid_init_allocator() and reiser4_set_free_blocks() with new
45047 +        * data. What's the reason to call them above? */
45048 +       result = reiser4_journal_recover_sb_data(s);
45049 +       if (result != 0)
45050 +               return result;
45051 +       *stage = JOURNAL_RECOVER;
45052 +
45053 +       /* Set number of used blocks.  The number of used blocks is stored
45054 +          neither in on-disk super block nor in the journal footer blocks.  At
45055 +          this moment actual values of total blocks and free block counters are
45056 +          set in the reiser4 super block (in-memory structure) and we can
45057 +          calculate number of used blocks from them. */
45058 +       reiser4_set_data_blocks(s,
45059 +                               reiser4_block_count(s) - reiser4_free_blocks(s));
45060 +
45061 +#if REISER4_DEBUG
45062 +       sbinfo->min_blocks_used =
45063 +               16 /* reserved area */ +
45064 +               2 /* super blocks */ +
45065 +               2 /* journal footer and header */;
45066 +#endif
45067 +
45068 +       /* init disk space allocator */
45069 +       result = sa_init_allocator(get_space_allocator(s), s, 0);
45070 +       if (result)
45071 +               return result;
45072 +       *stage = INIT_SA;
45073 +       /* last step: on success the stage goes from INIT_SA straight to ALL_DONE, INIT_JNODE is never reported */
45074 +       result = get_super_jnode(s);
45075 +       if (result == 0)
45076 +               *stage = ALL_DONE;
45077 +       return result;
45078 +}
45079 +
45080 +/* plugin->u.format.get_ready: run try_init_format40() and undo a partial initialization if it fails */
45081 +reiser4_internal int
45082 +get_ready_format40(struct super_block *s, void *data UNUSED_ARG)
45083 +{
45084 +       int result;
45085 +       format40_init_stage stage;
45086 +       /* stage tells which steps completed; the cases below fall through on purpose to undo them in reverse */
45087 +       result = try_init_format40(s, &stage);
45088 +       switch (stage) {
45089 +       case ALL_DONE:
45090 +               assert("nikita-3458", result == 0);
45091 +               break;
45092 +       case INIT_JNODE:        /* not reported by try_init_format40() as it stands */
45093 +               done_super_jnode(s);    /* fall through */
45094 +       case INIT_SA:
45095 +               sa_destroy_allocator(get_space_allocator(s), s);        /* fall through */
45096 +       case JOURNAL_RECOVER:
45097 +       case INIT_TREE:
45098 +               done_tree(&get_super_private(s)->tree); /* fall through */
45099 +       case INIT_OID:
45100 +       case KEY_CHECK:
45101 +       case READ_SUPER:
45102 +       case JOURNAL_REPLAY:
45103 +       case INIT_STATUS:
45104 +               reiser4_status_finish();        /* fall through */
45105 +       case INIT_EFLUSH:
45106 +               eflush_done_at(s);      /* fall through */
45107 +       case INIT_JOURNAL_INFO:
45108 +               done_journal_info(s);   /* fall through */
45109 +       case FIND_A_SUPER:
45110 +       case CONSULT_DISKMAP:
45111 +       case NONE_DONE:
45112 +               break;  /* nothing to undo for the earliest stages */
45113 +       default:
45114 +               impossible("nikita-3457", "init stage: %i", stage);
45115 +       }
45116 +       return result;
45117 +}
45118 +
45119 +static void
45120 +pack_format40_super(const struct super_block *s, char *data)
45121 +{
45122 +       format40_disk_super_block *super_data = (format40_disk_super_block *) data;
45123 +       reiser4_super_info_data *sbinfo = get_super_private(s);
45124 +       /* store the current in-memory values into the on-disk super block image that data points to */
45125 +       assert("zam-591", data != NULL);
45126 +       /* only free_blocks, root_block, oid, file_count and tree_height are written; other fields stay as they are */
45127 +       cputod64(reiser4_free_committed_blocks(s), &super_data->free_blocks);
45128 +       cputod64(sbinfo->tree.root_block, &super_data->root_block);
45129 +
45130 +       cputod64(oid_next(s), &super_data->oid);
45131 +       cputod64(oids_used(s), &super_data->file_count);
45132 +
45133 +       cputod16(sbinfo->tree.height, &super_data->tree_height);
45134 +}
45135 +
45136 +/* plugin->u.format.log_super
45137 +   return a jnode which should be added to transaction when the super block
45138 +   gets logged; its data is refreshed by pack_format40_super() first */
45139 +reiser4_internal jnode *
45140 +log_super_format40(struct super_block *s)
45141 +{
45142 +       jnode *sb_jnode;
45143 +       /* the jnode that get_super_jnode() stored at initialization */
45144 +       sb_jnode = get_super_private(s)->u.format40.sb_jnode;
45145 +       /* NOTE(review): the jload() result is ignored; jdata() is used below even if loading failed */
45146 +       jload(sb_jnode);
45147 +
45148 +       pack_format40_super(s, jdata(sb_jnode));
45149 +
45150 +       jrelse(sb_jnode);
45151 +
45152 +       return sb_jnode;
45153 +}
45154 +
45155 +/* plugin->u.format.release: commit the super block (unless read-only) and tear down format40 state */
45156 +reiser4_internal int
45157 +release_format40(struct super_block *s)
45158 +{
45159 +       int ret;
45160 +       reiser4_super_info_data *sbinfo;
45161 +
45162 +       sbinfo = get_super_private(s);
45163 +       assert("zam-579", sbinfo != NULL);
45164 +
45165 +       /* FIXME-UMKA: Should we tell block transaction manager to commit all if
45166 +        * we will have no space left? */
45167 +       if (reiser4_grab_space(1, BA_RESERVED))
45168 +               return RETERR(-ENOSPC);
45169 +       /* NOTE(review): the early return above skips the whole teardown at the end of this function */
45170 +       if (!rofs_super(s)) {
45171 +               ret = capture_super_block(s);
45172 +               if (ret != 0)
45173 +                       warning("vs-898", "capture_super_block failed: %d", ret);
45174 +
45175 +               ret = txnmgr_force_commit_all(s, 1);
45176 +               if (ret != 0)
45177 +                       warning("jmacd-74438", "txn_force failed: %d", ret);
45178 +       }
45179 +       if (reiser4_is_debugged(s, REISER4_STATS_ON_UMOUNT))
45180 +               print_fs_info("umount ok", s);
45181 +
45182 +       /*done_tree(&sbinfo->tree);*/
45183 +       /* failures above are only logged: the teardown always runs and 0 is returned */
45184 +       sa_destroy_allocator(&sbinfo->space_allocator, s);
45185 +       done_journal_info(s);
45186 +       eflush_done_at(s);
45187 +       done_super_jnode(s);
45188 +
45189 +       return 0;
45190 +}
45191 +
45192 +#define FORMAT40_ROOT_LOCALITY 41
45193 +#define FORMAT40_ROOT_OBJECTID 42
45194 +
45195 +/* plugin->u.format.root_dir_key: key built from FORMAT40_ROOT_LOCALITY and FORMAT40_ROOT_OBJECTID */
45196 +reiser4_internal const reiser4_key *
45197 +root_dir_key_format40(const struct super_block *super UNUSED_ARG)
45198 +{
45199 +       static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
45200 +               .el = {{(FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR},
45201 +#if REISER4_LARGE_KEY
45202 +                      {0ull},
45203 +#endif
45204 +                      {FORMAT40_ROOT_OBJECTID}, {0ull}}
45205 +       };
45206 +       /* the same static constant is returned for every super block */
45207 +       return &FORMAT40_ROOT_DIR_KEY;
45208 +}
45209 +
45210 +/* plugin->u.format.print_info: currently does nothing, the whole body is compiled out with #if 0 */
45211 +reiser4_internal void
45212 +print_info_format40(const struct super_block *s)
45213 +{
45214 +#if 0
45215 +       format40_disk_super_block *sb_copy;
45216 +
45217 +       sb_copy = &get_super_private(s)->u.format40.actual_sb;
45218 +
45219 +       printk("\tblock count %llu\n"
45220 +              "\tfree blocks %llu\n"
45221 +              "\troot_block %llu\n"
45222 +              "\ttail policy %s\n"
45223 +              "\tmin free oid %llu\n"
45224 +              "\tfile count %llu\n"
45225 +              "\ttree height %d\n",
45226 +              get_format40_block_count(sb_copy),
45227 +              get_format40_free_blocks(sb_copy),
45228 +              get_format40_root_block(sb_copy),
45229 +              formatting_plugin_by_id(get_format40_formatting_policy(sb_copy))->h.label,
45230 +              get_format40_oid(sb_copy), get_format40_file_count(sb_copy), get_format40_tree_height(sb_copy));
45231 +#endif
45232 +}
45233 +
45234 +/* plugin->u.format.check_open.
45235 +   Check the opened object for validity: neither its oid nor its locality id may
45236 +   exceed the max used oid (oid_next() - 1). Returns 0 if both are in range and
45237 +   -EIO after a warning otherwise. oid_t is cast explicitly to match %llu. */
45238 +reiser4_internal int
45239 +check_open_format40(const struct inode *object) {
45240 +       oid_t max, oid;
45241 +
45242 +       max = oid_next(object->i_sb) - 1;
45243 +
45244 +       /* Check the oid. */
45245 +       oid = get_inode_oid(object);
45246 +       if (oid > max) {
45247 +               warning("vpf-1360", "The object with the oid %llu greater then the "
45248 +                       "max used oid %llu found.", (unsigned long long)oid, (unsigned long long)max);
45249 +               return RETERR(-EIO);
45250 +       }
45251 +
45252 +       /* Check the locality; it has a warning label of its own to tell the two failures apart. */
45253 +       oid = reiser4_inode_data(object)->locality_id;
45254 +       if (oid > max) {
45255 +               warning("vpf-1361", "The object with the locality %llu greater then the "
45256 +                       "max used oid %llu found.", (unsigned long long)oid, (unsigned long long)max);
45257 +               return RETERR(-EIO);
45258 +       }
45259 +
45260 +       return 0;
45261 +}
45262 +
45263 +/* Make Linus happy.
45264 +   Local variables:
45265 +   c-indentation-style: "K&R"
45266 +   mode-name: "LC"
45267 +   c-basic-offset: 8
45268 +   tab-width: 8
45269 +   fill-column: 120
45270 +   scroll-step: 1
45271 +   End:
45272 +*/
45273 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/disk_format/disk_format40.h linux-2.6.8-rc3-a/fs/reiser4/plugin/disk_format/disk_format40.h
45274 --- linux-2.6.8-rc3/fs/reiser4/plugin/disk_format/disk_format40.h       1970-01-01 03:00:00.000000000 +0300
45275 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/disk_format/disk_format40.h     2004-08-05 21:20:53.248625633 +0400
45276 @@ -0,0 +1,100 @@
45277 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45278 +
45279 +/* this file contains:
45280 +   - definition of ondisk super block of standard disk layout for
45281 +     reiser 4.0 (layout 40)
45282 +   - definition of layout 40 specific portion of in-core super block
45283 +   - declarations of functions implementing methods of layout plugin
45284 +     for layout 40
45285 +   - declarations of functions used to get/set fields in layout 40 super block
45286 +*/
45287 +
45288 +#ifndef __DISK_FORMAT40_H__
45289 +#define __DISK_FORMAT40_H__
45290 +
45291 +/* magic for default reiser4 layout */
45292 +#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
45293 +#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
45294 +
45295 +#include "../../dformat.h"
45296 +
45297 +#include <linux/fs.h>          /* for struct super_block  */
45298 +
45299 +typedef enum {
45300 +       FORMAT40_LARGE_KEYS     /* bit number in the super block flags field, checked against REISER4_LARGE_KEY */
45301 +} format40_flags;
45302 +
45303 +/* ondisk super block for format 40. It is 512 bytes long */
45304 +typedef struct format40_disk_super_block {
45305 +       /*   0 */ d64 block_count;
45306 +       /* number of blocks in a filesystem */
45307 +       /*   8 */ d64 free_blocks;
45308 +       /* number of free blocks */
45309 +       /*  16 */ d64 root_block;
45310 +       /* filesystem tree root block */
45311 +       /*  24 */ d64 oid;
45312 +       /* smallest free objectid */
45313 +       /*  32 */ d64 file_count;
45314 +       /* number of files in a filesystem */
45315 +       /*  40 */ d64 flushes;
45316 +       /* number of times super block was
45317 +          flushed. Needed if format 40
45318 +          will have few super blocks */
45319 +       /*  48 */ d32 mkfs_id;
45320 +       /* unique identifier of fs */
45321 +       /*  52 */ char magic[16];
45322 +       /* magic string ReIsEr40FoRmAt */
45323 +       /*  68 */ d16 tree_height;
45324 +       /* height of filesystem tree */
45325 +       /*  70 */ d16 formatting_policy;        /* presumably id of the formatting (tail) policy plugin - confirm */
45326 +       /*  72 */ d64 flags;    /* flag bits, bit numbers are given by format40_flags */
45327 +       /*  80 */ char not_used[432];   /* pads the structure to 512 bytes */
45328 +} format40_disk_super_block;
45329 +
45330 +/* format 40 specific part of reiser4_super_info_data */
45331 +typedef struct format40_super_info {
45332 +/*     format40_disk_super_block actual_sb; */
45333 +       jnode *sb_jnode;        /* jnode of the super block, stored by get_super_jnode() */
45334 +       struct {
45335 +               reiser4_block_nr super; /* block number of the super block, set by consult_diskmap() */
45336 +       } loc;
45337 +} format40_super_info;
45338 +
45339 +/* Defines for journal header and footer respectively. */
45340 +#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
45341 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
45342 +
45343 +#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
45344 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
45345 +
45346 +#define FORMAT40_STATUS_BLOCKNR \
45347 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
45348 +
45349 +/* Diskmap declarations */
45350 +#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
45351 +#define FORMAT40_SUPER 1
45352 +#define FORMAT40_JH 2
45353 +#define FORMAT40_JF 3
45354 +
45355 +/* declarations of functions implementing methods of layout plugin for
45356 +   format 40. The functions themselves are defined in disk_format40.c */
45357 +int get_ready_format40(struct super_block *, void *data);
45358 +const reiser4_key *root_dir_key_format40(const struct super_block *);
45359 +int release_format40(struct super_block *s);
45360 +jnode *log_super_format40(struct super_block *s);
45361 +void print_info_format40(const struct super_block *s);
45362 +int check_open_format40(const struct inode *object);
45363 +
45364 +/* __DISK_FORMAT40_H__ */
45365 +#endif
45366 +
45367 +/* Make Linus happy.
45368 +   Local variables:
45369 +   c-indentation-style: "K&R"
45370 +   mode-name: "LC"
45371 +   c-basic-offset: 8
45372 +   tab-width: 8
45373 +   fill-column: 120
45374 +   scroll-step: 1
45375 +   End:
45376 +*/
45377 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/fibration.c linux-2.6.8-rc3-a/fs/reiser4/plugin/fibration.c
45378 --- linux-2.6.8-rc3/fs/reiser4/plugin/fibration.c       1970-01-01 03:00:00.000000000 +0300
45379 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/fibration.c     2004-08-05 21:20:52.783723693 +0400
45380 @@ -0,0 +1,173 @@
45381 +/* Copyright 2004 by Hans Reiser, licensing governed by
45382 + * reiser4/README */
45383 +
45384 +/* Directory fibrations */
45385 +
45386 +/*
45387 + * Suppose we have a directory tree with sources of some project. During
45388 + * compilation .o files are created within this tree. This makes access
45389 + * to the original source files less efficient, because source files are
45390 + * now "diluted" by object files: default directory plugin uses prefix
45391 + * of a file name as a part of the key for directory entry (and this
45392 + * part is also inherited by the key of file body). This means that
45393 + * foo.o will be located close to foo.c and foo.h in the tree.
45394 + *
45395 + * To avoid this effect the directory plugin fills the highest 7
45396 + * (originally unused) bits of the second component of the directory entry
45397 + * key with a bit-pattern depending on the file name (see
45398 + * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called
45399 + * the "fibre". The fibre of the file name key is inherited by the key of
45400 + * the stat data and the keys of the file body (if REISER4_LARGE_KEY is set).
45401 + *
45402 + * Fibre for a given file is chosen by per-directory fibration
45403 + * plugin. Names within given fibre are ordered lexicographically.
45404 + */
45405 +
45406 +#include "../debug.h"
45407 +#include "plugin_header.h"
45408 +#include "plugin.h"
45409 +#include "../super.h"
45410 +#include "../inode.h"
45411 +
45412 +#include <linux/types.h>
45413 +
45414 +static const int fibre_shift = 57;     /* the fibre occupies the 64 - 57 = 7 highest bits of a __u64 */
45415 +/* move fibre number n into those highest bits; bits of n above the lowest 7 are shifted out */
45416 +#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)
45417 +
45418 +/*
45419 + * Trivial fibration: every name gets fibre 0, so all files of a directory are
45420 + * just ordered lexicographically. None of the arguments is used.
45421 + */
45422 +static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
45423 +{
45424 +       return FIBRE_NO(0);
45425 +}
45426 +
45427 +/*
45428 + * dot-o fibration: place .o files (fibre 1) after all others (fibre 0).
45429 + */
45430 +static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
45431 +{
45432 +       /* at least one character is required in front of the ".o" suffix */
45433 +       if (len <= 2)
45434 +               return FIBRE_NO(0);
45435 +       /* the fibre number is 1 exactly when the name ends in ".o" */
45436 +       return FIBRE_NO(name[len - 2] == '.' && name[len - 1] == 'o');
45437 +}
45438 +
45439 +/*
45440 + * ext.1 fibration: a name with a one-character extension goes into the
45441 + * fibre numbered by that character (file "foo.h" goes into fibre "h", 128
45442 + * fibres for 7bit characters); all other names go into default fibre 0.
45443 + */
45444 +static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
45445 +{
45446 +       const int has_ext = len > 2 && name[len - 2] == '.';
45447 +
45448 +       /* the extension character itself serves as the fibre number */
45449 +       return has_ext ? FIBRE_NO(name[len - 1]) : FIBRE_NO(0);
45450 +}
45451 +
45452 +/*
45453 + * ext.3 fibration: names with a 3-character extension are spread over
45454 + * fibres by the sum of the extension characters; the rest get fibre 0.
45455 + */
45456 +static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
45457 +{
45458 +       /* no three-character extension: default fibre */
45459 +       if (len <= 4 || name[len - 4] != '.')
45460 +               return FIBRE_NO(0);
45461 +       return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]);
45462 +}
45463 +
45464 +static int
45465 +change_fibration(struct inode * inode, reiser4_plugin * plugin)
45466 +{
45467 +       int result;
45468 +       /* ->change() method of the fibration plugins: make plugin the fibration of directory inode */
45469 +       assert("nikita-3503", inode != NULL);
45470 +       assert("nikita-3504", plugin != NULL);
45471 +
45472 +       assert("nikita-3505", is_reiser4_inode(inode));
45473 +       assert("nikita-3506", inode_dir_plugin(inode) != NULL);
45474 +       assert("nikita-3507", plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
45475 +       /* 0 is returned if the plugin is already set; it is replaced only if is_dir_empty() returns 0 */
45476 +       result = 0;
45477 +       if (inode_fibration_plugin(inode) == NULL ||
45478 +           inode_fibration_plugin(inode)->h.id != plugin->h.id) {
45479 +               if (is_dir_empty(inode) == 0)
45480 +                       result = plugin_set_fibration(&reiser4_inode_data(inode)->pset,
45481 +                                                     &plugin->fibration);
45482 +               else
45483 +                       result = RETERR(-ENOTEMPTY);
45484 +
45485 +       }
45486 +       return result;
45487 +}
45488 +
45489 +static reiser4_plugin_ops fibration_plugin_ops = {      /* generic plugin operations of the fibration plugins */
45490 +       .init     = NULL,
45491 +       .load     = NULL,
45492 +       .save_len = NULL,
45493 +       .save     = NULL,
45494 +       .change   = change_fibration    /* the only operation provided */
45495 +};
45496 +
45497 +/* fibration plugins, indexed by fibration id; all share fibration_plugin_ops and differ in ->fibre only */
45498 +fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
45499 +       [FIBRATION_LEXICOGRAPHIC] = {
45500 +               .h = {
45501 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
45502 +                       .id = FIBRATION_LEXICOGRAPHIC,
45503 +                       .pops = &fibration_plugin_ops,
45504 +                       .label = "lexicographic",
45505 +                       .desc = "no fibration",
45506 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
45507 +               },
45508 +               .fibre = fibre_trivial  /* always fibre 0 */
45509 +       },
45510 +       [FIBRATION_DOT_O] = {
45511 +               .h = {
45512 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
45513 +                       .id = FIBRATION_DOT_O,
45514 +                       .pops = &fibration_plugin_ops,
45515 +                       .label = "dot-o",
45516 +                       .desc = "fibrate .o files separately",
45517 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
45518 +               },
45519 +               .fibre = fibre_dot_o    /* fibre 1 for names ending in ".o", fibre 0 otherwise */
45520 +       },
45521 +       [FIBRATION_EXT_1] = {
45522 +               .h = {
45523 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
45524 +                       .id = FIBRATION_EXT_1,
45525 +                       .pops = &fibration_plugin_ops,
45526 +                       .label = "ext-1",
45527 +                       .desc = "fibrate file by single character extension",
45528 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
45529 +               },
45530 +               .fibre = fibre_ext_1    /* fibre taken from a one-character extension */
45531 +       },
45532 +       [FIBRATION_EXT_3] = {
45533 +               .h = {
45534 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
45535 +                       .id = FIBRATION_EXT_3,
45536 +                       .pops = &fibration_plugin_ops,
45537 +                       .label = "ext-3",
45538 +                       .desc = "fibrate file by three character extension",
45539 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
45540 +               },
45541 +               .fibre = fibre_ext_3    /* fibre taken from the sum of a three-character extension */
45542 +       }
45543 +};
45544 +
45545 +/* Make Linus happy.
45546 +   Local variables:
45547 +   c-indentation-style: "K&R"
45548 +   mode-name: "LC"
45549 +   c-basic-offset: 8
45550 +   tab-width: 8
45551 +   fill-column: 120
45552 +   End:
45553 +*/
45554 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/fibration.h linux-2.6.8-rc3-a/fs/reiser4/plugin/fibration.h
45555 --- linux-2.6.8-rc3/fs/reiser4/plugin/fibration.h       1970-01-01 03:00:00.000000000 +0300
45556 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/fibration.h     2004-08-05 21:20:53.435586199 +0400
45557 @@ -0,0 +1,37 @@
45558 +/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
45559 +
45560 +/* Fibration plugin used by hashed directory plugin to segment content
45561 + * of directory. See fs/reiser4/plugin/fibration.c for more on this. */
45562 +
45563 +#if !defined( __FS_REISER4_PLUGIN_FIBRATION_H__ )
45564 +#define __FS_REISER4_PLUGIN_FIBRATION_H__
45565 +
45566 +#include "plugin_header.h"
45567 +
45568 +typedef struct fibration_plugin {
45569 +       /* generic fields */
45570 +       plugin_header h;
45571 +       /* method mapping a name (@name, @len bytes) within directory @dir to its fibre */
45572 +       __u64 (*fibre)(const struct inode *dir, const char *name, int len);
45573 +} fibration_plugin;
45574 +
45575 +typedef enum {
45576 +       FIBRATION_LEXICOGRAPHIC, /* "no fibration" */
45577 +       FIBRATION_DOT_O,         /* "fibrate .o files separately" */
45578 +       FIBRATION_EXT_1,         /* "fibrate file by single character extension" */
45579 +       FIBRATION_EXT_3,         /* "fibrate file by three character extension" */
45580 +       LAST_FIBRATION_ID        /* number of ids above; dimension of fibration_plugins[] */
45581 +} reiser4_fibration_id;
45582 +
45583 +/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
45584 +#endif
45585 +
45586 +/* Make Linus happy.
45587 +   Local variables:
45588 +   c-indentation-style: "K&R"
45589 +   mode-name: "LC"
45590 +   c-basic-offset: 8
45591 +   tab-width: 8
45592 +   fill-column: 120
45593 +   End:
45594 +*/
45595 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/file/file.c linux-2.6.8-rc3-a/fs/reiser4/plugin/file/file.c
45596 --- linux-2.6.8-rc3/fs/reiser4/plugin/file/file.c       1970-01-01 03:00:00.000000000 +0300
45597 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/file/file.c     2004-08-05 21:20:53.416590206 +0400
45598 @@ -0,0 +1,2531 @@
45599 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
45600 + * reiser4/README */
45601 +
45602 +#include "../../inode.h"
45603 +#include "../../super.h"
45604 +#include "../../tree_walk.h"
45605 +#include "../../carry.h"
45606 +#include "../../page_cache.h"
45607 +#include "../../ioctl.h"
45608 +#include "../object.h"
45609 +#include "../../prof.h"
45610 +#include "../../safe_link.h"
45611 +#include "funcs.h"
45612 +
45613 +#include <linux/writeback.h>
45614 +#include <linux/pagevec.h>
45615 +
45616 +/* this file contains file plugin methods of reiser4 unix files.
45617 +
45618 + Those files are either built of tail items only (FORMATTING_ID) or of extent
45619 + items only (EXTENT_POINTER_ID) or empty (have no items but stat data) */
45620 +
45621 +static int unpack(struct inode *inode, int forever);
45622 +
45623 +/* get unix file plugin specific portion of @inode: address of unix_file_info member of its reiser4 inode data */
45624 +reiser4_internal unix_file_info_t *
45625 +unix_file_inode_data(const struct inode * inode)
45626 +{
45627 +       return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
45628 +}
45629 +
45630 +static int
45631 +file_is_built_of_tails(const struct inode *inode) /* nonzero iff container of @inode is UF_CONTAINER_TAILS */
45632 +{
45633 +       return unix_file_inode_data(inode)->container == UF_CONTAINER_TAILS;
45634 +}
45635 +
45636 +reiser4_internal int
45637 +file_is_built_of_extents(const struct inode *inode) /* nonzero iff container of @inode is UF_CONTAINER_EXTENTS */
45638 +{
45639 +       return unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS;
45640 +}
45641 +
45642 +reiser4_internal int
45643 +file_is_empty(const struct inode *inode) /* nonzero iff container of @inode is UF_CONTAINER_EMPTY */
45644 +{
45645 +       return unix_file_inode_data(inode)->container == UF_CONTAINER_EMPTY;
45646 +}
45647 +
45648 +reiser4_internal int
45649 +file_state_is_unknown(const struct inode *inode) /* nonzero iff container of @inode is UF_CONTAINER_UNKNOWN */
45650 +{
45651 +       return unix_file_inode_data(inode)->container == UF_CONTAINER_UNKNOWN;
45652 +}
45653 +
45654 +reiser4_internal void
45655 +set_file_state_extents(struct inode *inode) /* record that file @inode is built of extents */
45656 +{
45657 +       unix_file_inode_data(inode)->container = UF_CONTAINER_EXTENTS;
45658 +}
45659 +
45660 +reiser4_internal void
45661 +set_file_state_tails(struct inode *inode) /* record that file @inode is built of tails */
45662 +{
45663 +       unix_file_inode_data(inode)->container = UF_CONTAINER_TAILS;
45664 +}
45665 +
45666 +static void
45667 +set_file_state_empty(struct inode *inode) /* record that file @inode is empty */
45668 +{
45669 +       unix_file_inode_data(inode)->container = UF_CONTAINER_EMPTY;
45670 +}
45671 +
45672 +static void
45673 +set_file_state_unknown(struct inode *inode) /* forget what file @inode is built of */
45674 +{
45675 +       unix_file_inode_data(inode)->container = UF_CONTAINER_UNKNOWN;
45676 +}
45677 +static int
45678 +less_than_ldk(znode *node, const reiser4_key *key) /* is @key less than znode_get_ld_key(@node)? */
45679 +{      /* NOTE(review): UNDER_RW presumably evaluates its last argument under read lock "dk" of the tree - confirm */
45680 +       return UNDER_RW(dk, current_tree, read, keylt(key, znode_get_ld_key(node)));
45681 +}
45682 +
45683 +reiser4_internal int
45684 +equal_to_rdk(znode *node, const reiser4_key *key) /* is @key equal to znode_get_rd_key(@node)? */
45685 +{
45686 +       return UNDER_RW(dk, current_tree, read, keyeq(key, znode_get_rd_key(node)));
45687 +}
45688 +
45689 +#if REISER4_DEBUG
45690 +
45691 +static int
45692 +less_than_rdk(znode *node, const reiser4_key *key) /* is @key less than znode_get_rd_key(@node)? Debugging only */
45693 +{
45694 +       return UNDER_RW(dk, current_tree, read, keylt(key, znode_get_rd_key(node)));
45695 +}
45696 +
45697 +int
45698 +equal_to_ldk(znode *node, const reiser4_key *key) /* is @key equal to znode_get_ld_key(@node)? Debugging only */
45699 +{
45700 +       return UNDER_RW(dk, current_tree, read, keyeq(key, znode_get_ld_key(node)));
45701 +}
45702 +
45703 +/* get key of item next to one @coord is set to. Key is stored into @next_key, which is also the return value */
45704 +static reiser4_key *
45705 +get_next_item_key(const coord_t *coord, reiser4_key *next_key)
45706 +{
45707 +       if (coord->item_pos == node_num_items(coord->node) - 1) { /* @coord is at the last item of its node */
45708 +               /* get key of next item if it is in right neighbor */
45709 +               UNDER_RW_VOID(dk, znode_get_tree(coord->node), read,
45710 +                             *next_key = *znode_get_rd_key(coord->node));
45711 +       } else {
45712 +               /* get key of next item if it is in the same node */
45713 +               coord_t next;
45714 +
45715 +               coord_dup_nocheck(&next, coord);
45716 +               next.unit_pos = 0;
45717 +               check_me("vs-730", coord_next_item(&next) == 0);
45718 +               item_key_by_coord(&next, next_key);
45719 +       }
45720 +       return next_key;
45721 +}
45722 +
45723 +static int
45724 +item_of_that_file(const coord_t *coord, const reiser4_key *key) /* is @key < ->b.max_key_inside() of item at @coord? */
45725 +{
45726 +       reiser4_key max_possible;
45727 +       item_plugin *iplug;
45728 +
45729 +       iplug = item_plugin_by_coord(coord);
45730 +       assert("vs-1011", iplug->b.max_key_inside);
45731 +       return keylt(key, iplug->b.max_key_inside(coord, &max_possible));
45732 +}
45733 +
45734 +static int
45735 +check_coord(const coord_t *coord, const reiser4_key *key) /* does node lookup of @key lead to the same coord? */
45736 +{
45737 +       coord_t twin;
45738 +
45739 +       if (!REISER4_DEBUG) /* never true here: this function is only compiled under #if REISER4_DEBUG */
45740 +               return 1;
45741 +       node_plugin_by_node(coord->node)->lookup(coord->node, key, FIND_MAX_NOT_MORE_THAN, &twin);
45742 +       return coords_equal(coord, &twin);
45743 +}
45744 +
45745 +#endif /* REISER4_DEBUG */
45746 +
45747 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh) /* reset @uf_coord and attach (freshly initialized) @lh */
45748 +{
45749 +       coord_init_zero(&uf_coord->base_coord);
45750 +        coord_clear_iplug(&uf_coord->base_coord);
45751 +       uf_coord->lh = lh;
45752 +       init_lh(lh);
45753 +       memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
45754 +       uf_coord->valid = 0; /* coord extension has to be set up before it can be used */
45755 +}
45756 +
45757 +static inline void
45758 +invalidate_extended_coord(uf_coord_t *uf_coord) /* mark coord extension invalid and drop item plugin of base coord */
45759 +{
45760 +        coord_clear_iplug(&uf_coord->base_coord);
45761 +       uf_coord->valid = 0;
45762 +}
45763 +
45764 +static inline void
45765 +validate_extended_coord(uf_coord_t *uf_coord, loff_t offset) /* let item plugin set up coord extension for @offset */
45766 +{
45767 +       assert("vs-1333", uf_coord->valid == 0);
45768 +       assert("vs-1348", item_plugin_by_coord(&uf_coord->base_coord)->s.file.init_coord_extension);
45769 +       /* NOTE(review): ->init_coord_extension() presumably sets uf_coord->valid (how_to_write() asserts it) - confirm */
45770 +       /* FIXME: */
45771 +       item_body_by_coord(&uf_coord->base_coord);
45772 +       item_plugin_by_coord(&uf_coord->base_coord)->s.file.init_coord_extension(uf_coord, offset);
45773 +}
45774 +
45775 +reiser4_internal write_mode_t /* FIRST_ITEM, APPEND_ITEM or OVERWRITE_ITEM; uf_coord->valid is 1 on return */
45776 +how_to_write(uf_coord_t *uf_coord, const reiser4_key *key)
45777 +{
45778 +       write_mode_t result;
45779 +       coord_t *coord;
45780 +       ON_DEBUG(reiser4_key check);
45781 +
45782 +       coord = &uf_coord->base_coord;
45783 +
45784 +       assert("vs-1252", znode_is_wlocked(coord->node));
45785 +       assert("vs-1253", znode_is_loaded(coord->node));
45786 +
45787 +       if (uf_coord->valid == 1) { /* coord extension is already set up: decide by coord->between only */
45788 +               assert("vs-1332", check_coord(coord, key));
45789 +               return (coord->between == AFTER_UNIT) ? APPEND_ITEM : OVERWRITE_ITEM;
45790 +       }
45791 +
45792 +       if (less_than_ldk(coord->node, key)) { /* @key precedes everything in the node: new item goes first */
45793 +               assert("vs-1014", get_key_offset(key) == 0);
45794 +
45795 +               coord_init_before_first_item(coord, coord->node);
45796 +               uf_coord->valid = 1;
45797 +               result = FIRST_ITEM;
45798 +               goto ok;
45799 +       }
45800 +
45801 +       assert("vs-1335", less_than_rdk(coord->node, key));
45802 +
45803 +       if (node_is_empty(coord->node)) {
45804 +               assert("vs-879", znode_get_level(coord->node) == LEAF_LEVEL);
45805 +               assert("vs-880", get_key_offset(key) == 0);
45806 +               /*
45807 +                * Situation that check below tried to handle is follows: some
45808 +                * other thread writes to (other) file and has to insert empty
45809 +                * leaf between two adjacent extents. Generally, we are not
45810 +                * supposed to muck with this node. But it is possible that
45811 +                * said other thread fails due to some error (out of disk
45812 +                * space, for example) and leaves empty leaf
45813 +                * lingering. Nothing prevents us from reusing it.
45814 +                */
45815 +               assert("vs-1000", UNDER_RW(dk, current_tree, read,
45816 +                                          keylt(key, znode_get_rd_key(coord->node))));
45817 +               assert("vs-1002", coord->between == EMPTY_NODE);
45818 +               result = FIRST_ITEM;
45819 +               uf_coord->valid = 1;
45820 +               goto ok;
45821 +       }
45822 +
45823 +       assert("vs-1336", coord->item_pos < node_num_items(coord->node));
45824 +       assert("vs-1007", ergo(coord->between == AFTER_UNIT || coord->between == AT_UNIT, keyle(item_key_by_coord(coord, &check), key)));
45825 +       assert("vs-1008", ergo(coord->between == AFTER_UNIT || coord->between == AT_UNIT, keylt(key, get_next_item_key(coord, &check))));
45826 +
45827 +       switch(coord->between) {
45828 +       case AFTER_ITEM:
45829 +               uf_coord->valid = 1;
45830 +               result = FIRST_ITEM;
45831 +               break;
45832 +       case AFTER_UNIT:
45833 +               assert("vs-1323", (item_is_tail(coord) || item_is_extent(coord)) && item_of_that_file(coord, key));
45834 +               assert("vs-1208", keyeq(item_plugin_by_coord(coord)->s.file.append_key(coord, &check), key));
45835 +               result = APPEND_ITEM;
45836 +               validate_extended_coord(uf_coord, get_key_offset(key));
45837 +               break;
45838 +       case AT_UNIT:
45839 +               /* FIXME: it would be nice to check that coord matches to key */
45840 +               assert("vs-1324", (item_is_tail(coord) || item_is_extent(coord)) && item_of_that_file(coord, key));
45841 +               validate_extended_coord(uf_coord, get_key_offset(key));
45842 +               result = OVERWRITE_ITEM;
45843 +               break;
45844 +       default: /* unexpected value of coord->between */
45845 +               assert("vs-1337", 0);
45846 +               result = OVERWRITE_ITEM;
45847 +               break;
45848 +       }
45849 +
45850 +ok:
45851 +       assert("vs-1349", uf_coord->valid == 1);
45852 +       assert("vs-1332", check_coord(coord, key)); /* NOTE(review): label "vs-1332" is also used by the assert above */
45853 +       return result;
45854 +}
45855 +
45856 +/* obtain lock (of the same mode) on right neighbor and drop lock on current node; @coord, @lh change only on success */
45857 +reiser4_internal int
45858 +goto_right_neighbor(coord_t * coord, lock_handle * lh)
45859 +{
45860 +       int result;
45861 +       lock_handle lh_right;
45862 +
45863 +       assert("vs-1100", znode_is_locked(coord->node));
45864 +
45865 +       init_lh(&lh_right);
45866 +       result = reiser4_get_right_neighbor(
45867 +               &lh_right, coord->node,
45868 +               znode_is_wlocked(coord->node) ? ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
45869 +               GN_CAN_USE_UPPER_LEVELS);
45870 +       if (result) {
45871 +               done_lh(&lh_right);
45872 +               return result;
45873 +       }
45874 +
45875 +       done_lh(lh); /* the neighbor is locked already, so the current node can be released */
45876 +
45877 +       coord_init_first_unit_nocheck(coord, lh_right.node);
45878 +       move_lh(lh, &lh_right);
45879 +
45880 +       return 0;
45881 +
45882 +}
45883 +
45884 +/* this is to be used after find_file_item and in find_file_item_nohint to determine real state of file */
45885 +static void
45886 +set_file_state(struct inode *inode, int cbk_result, tree_level level) /* @level: level of node lookup stopped at */
45887 +{
45888 +       assert("vs-1649", inode != NULL);
45889 +
45890 +       if (cbk_errored(cbk_result))
45891 +               /* error happened in find_file_item */
45892 +               return;
45893 +
45894 +       assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
45895 +
45896 +       if (inode_get_flag(inode, REISER4_PART_CONV)) { /* while this flag is set the state is kept unknown */
45897 +               set_file_state_unknown(inode);
45898 +               return;
45899 +       }
45900 +
45901 +       if (file_state_is_unknown(inode)) {
45902 +               if (cbk_result == CBK_COORD_NOTFOUND)
45903 +                       set_file_state_empty(inode);
45904 +               else if (level == LEAF_LEVEL) /* item was found on leaf level: tails */
45905 +                       set_file_state_tails(inode);
45906 +               else /* item was found on twig level: extents */
45907 +                       set_file_state_extents(inode);
45908 +       } else {
45909 +               /* file state is known, check that it is set correctly */
45910 +               assert("vs-1161", ergo(cbk_result == CBK_COORD_NOTFOUND,
45911 +                                      file_is_empty(inode)));
45912 +               assert("vs-1162", ergo(level == LEAF_LEVEL && cbk_result == CBK_COORD_FOUND,
45913 +                                      file_is_built_of_tails(inode)));
45914 +               assert("vs-1165", ergo(level == TWIG_LEVEL && cbk_result == CBK_COORD_FOUND,
45915 +                                      file_is_built_of_extents(inode)));
45916 +       }
45917 +}
45918 +
45919 +/* find item of file at @key: try @hint first, then do tree lookup. @hint must not be NULL; @inode may be NULL */
45920 +reiser4_internal int
45921 +find_file_item(hint_t *hint, /* coord, lock handle and seal are here */
45922 +              const reiser4_key *key, /* key of position in a file of next read/write */
45923 +              znode_lock_mode lock_mode, /* which lock (read/write) to put on returned node */
45924 +              ra_info_t *ra_info,
45925 +              struct inode *inode)
45926 +{
45927 +       int result;
45928 +       coord_t *coord;
45929 +       lock_handle *lh;
45930 +       __u32 cbk_flags;
45931 +
45932 +       assert("nikita-3030", schedulable());
45933 +
45934 +       /* collect statistics on the number of calls to this function */
45935 +       reiser4_stat_inc(file.find_file_item);
45936 +
45937 +       coord = &hint->coord.base_coord;
45938 +       lh = hint->coord.lh;
45939 +       init_lh(lh);
45940 +       /* @hint is dereferenced above and below, so it can not be NULL: no need to test it before validation */
45941 +       result = hint_validate(hint, key, 1/*check key*/, lock_mode);
45942 +       if (!result) {
45943 +               if (coord->between == AFTER_UNIT && equal_to_rdk(coord->node, key)) {
45944 +                       result = goto_right_neighbor(coord, lh);
45945 +                       if (result == -E_NO_NEIGHBOR)
45946 +                               return RETERR(-EIO);
45947 +                       if (result)
45948 +                               return result;
45949 +                       assert("vs-1152", equal_to_ldk(coord->node, key));
45950 +                       /* moved to different node: invalidate coord extension, zload is needed to init it again */
45951 +                       hint->coord.valid = 0;
45952 +                       reiser4_stat_inc(file.find_file_item_via_right_neighbor);
45953 +               } else {
45954 +                       reiser4_stat_inc(file.find_file_item_via_seal);
45955 +               }
45956 +               if (inode != NULL) /* set_file_state() requires an inode */
45957 +                       set_file_state(inode, CBK_COORD_FOUND, znode_get_level(coord->node));
45958 +               return CBK_COORD_FOUND;
45959 +       }
45960 +
45961 +       /* collect statistics on the number of calls to this function which did not get optimized */
45962 +       reiser4_stat_inc(file.find_file_item_via_cbk);
45963 +
45964 +       coord_init_zero(coord);
45965 +       cbk_flags = (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE : (CBK_UNIQUE | CBK_FOR_INSERT);
45966 +       if (inode != NULL) {
45967 +               result = object_lookup(inode,
45968 +                                      key,
45969 +                                      coord,
45970 +                                      lh,
45971 +                                      lock_mode,
45972 +                                      FIND_MAX_NOT_MORE_THAN,
45973 +                                      TWIG_LEVEL,
45974 +                                      LEAF_LEVEL,
45975 +                                      cbk_flags,
45976 +                                      ra_info);
45977 +       } else {
45978 +               result = coord_by_key(current_tree,
45979 +                                     key,
45980 +                                     coord,
45981 +                                     lh,
45982 +                                     lock_mode,
45983 +                                     FIND_MAX_NOT_MORE_THAN,
45984 +                                     TWIG_LEVEL,
45985 +                                     LEAF_LEVEL,
45986 +                                     cbk_flags,
45987 +                                     ra_info);
45988 +       }
45989 +
45990 +       if (inode != NULL) /* without inode there is no file state to update */
45991 +               set_file_state(inode, result, znode_get_level(coord->node));
45992 +
45993 +       /* FIXME: we might already have coord extension initialized */
45994 +       hint->coord.valid = 0;
45995 +       return result;
45996 +}
45997 +
45998 +reiser4_internal int /* result of object_lookup(); file state of @inode is updated via set_file_state() */
45999 +find_file_item_nohint(coord_t *coord, lock_handle *lh, const reiser4_key *key,
46000 +                     znode_lock_mode lock_mode, struct inode *inode)
46001 +{
46002 +       int result;
46003 +       /* NOTE(review): coord->node is read below even if lookup failed - confirm object_lookup() always sets it */
46004 +       result = object_lookup(inode, key, coord, lh, lock_mode,
46005 +                              FIND_MAX_NOT_MORE_THAN,
46006 +                              TWIG_LEVEL, LEAF_LEVEL,
46007 +                              (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE : (CBK_UNIQUE | CBK_FOR_INSERT),
46008 +                              NULL /* ra_info */);
46009 +       set_file_state(inode, result, znode_get_level(coord->node));
46010 +       return result;
46011 +}
46012 +
46013 +/* plugin->u.file.write_flow = NULL
46014 +   plugin->u.file.read_flow = NULL */
46015 +
46016 +reiser4_internal void
46017 +hint_init_zero(hint_t *hint, lock_handle *lh) /* zero @hint and attach @lh to it; @lh itself is not touched */
46018 +{
46019 +       xmemset(hint, 0, sizeof (*hint));
46020 +       hint->coord.lh = lh;
46021 +}
46022 +
46023 +/* find position of last byte of last item of the file plus 1. This is used by truncate and mmap to find real file
46024 +   size. Returns 0 and sets *@file_size (0 if file has no items), or error code. Lookup also sets file state */
46025 +static int
46026 +find_file_size(struct inode *inode, loff_t *file_size)
46027 +{
46028 +       int result;
46029 +       reiser4_key key;
46030 +       coord_t coord;
46031 +       lock_handle lh;
46032 +       item_plugin *iplug;
46033 +
46034 +       assert("vs-1247", inode_file_plugin(inode)->key_by_inode == key_by_inode_unix_file);
46035 +       key_by_inode_unix_file(inode, get_key_offset(max_key()), &key); /* key of the file with max possible offset */
46036 +
46037 +       init_lh(&lh);
46038 +       result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
46039 +       if (cbk_errored(result)) {
46040 +               /* error happened */
46041 +               done_lh(&lh);
46042 +               return result;
46043 +       }
46044 +
46045 +       if (result == CBK_COORD_NOTFOUND) {
46046 +               /* empty file */
46047 +               done_lh(&lh);
46048 +               *file_size = 0;
46049 +               return 0;
46050 +       }
46051 +
46052 +       /* there are items of this file (at least one) */
46053 +       /*coord_clear_iplug(&coord);*/
46054 +       result = zload(coord.node);
46055 +       if (unlikely(result)) {
46056 +               done_lh(&lh);
46057 +               return result;
46058 +       }
46059 +       iplug = item_plugin_by_coord(&coord);
46060 +
46061 +       assert("vs-853", iplug->s.file.append_key);
46062 +       iplug->s.file.append_key(&coord, &key);
46063 +
46064 +       *file_size = get_key_offset(&key); /* offset of the key at which the found item would be appended */
46065 +
46066 +       zrelse(coord.node);
46067 +       done_lh(&lh);
46068 +
46069 +       return 0;
46070 +}
46071 +
46072 +static int
46073 +find_file_state(unix_file_info_t *uf_info) /* make sure uf_info->container is known; returns 0 or error code */
46074 +{
46075 +       int result;
46076 +
46077 +       assert("vs-1628", ea_obtained(uf_info));
46078 +
46079 +       result = 0;
46080 +       if (uf_info->container == UF_CONTAINER_UNKNOWN) {
46081 +               loff_t file_size; /* not used: find_file_size() is called because its lookup sets file state */
46082 +
46083 +               result = find_file_size(unix_file_info_to_inode(uf_info), &file_size);
46084 +       }
46085 +       assert("vs-1074", ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
46086 +       return result;
46087 +}
46088 +
46089 +/* estimate and reserve space needed to truncate page which gets partially truncated: one block for page itself, stat
46090 +   data update (estimate_one_insert_into_item) and one item insertion (estimate_one_insert_into_item) which may happen
46091 +   if page corresponds to hole extent and unallocated one will have to be created. Returns reiser4_grab_reserved() */
46092 +static int reserve_partial_page(reiser4_tree *tree)
46093 +{
46094 +       grab_space_enable();
46095 +       return reiser4_grab_reserved(reiser4_get_current_sb(),
46096 +                                    1 +
46097 +                                    2 * estimate_one_insert_into_item(tree),
46098 +                                    BA_CAN_COMMIT);
46099 +}
46100 +
46101 +/* estimate and reserve space needed to cut one item and update one stat data; twice the estimate is reserved */
46102 +reiser4_internal int reserve_cut_iteration(reiser4_tree *tree)
46103 +{
46104 +       __u64 estimate = estimate_one_item_removal(tree)
46105 +               + estimate_one_insert_into_item(tree);
46106 +
46107 +       assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
46108 +
46109 +       grab_space_enable();
46110 +       /* We need to double our estimate now that we can delete more than one
46111 +          node. */
46112 +       return reiser4_grab_reserved(reiser4_get_current_sb(), estimate*2,
46113 +                                    BA_CAN_COMMIT);
46114 +}
46115 +
46116 +/* cut file items one by one starting from the last one until new file size (@new_size) is reached. Reserve space
46117 +   and update file stat data (only if @update_sd) on every single cut from the tree. @mode is for cut_tree_object() */
46118 +reiser4_internal int
46119 +cut_file_items(struct inode *inode, loff_t new_size, int update_sd, loff_t cur_size, int mode)
46120 +{
46121 +       reiser4_key from_key, to_key;
46122 +       reiser4_key smallest_removed;
46123 +       int result;
46124 +
46125 +       assert("vs-1248", inode_file_plugin(inode)->key_by_inode == key_by_inode_unix_file);
46126 +       key_by_inode_unix_file(inode, new_size, &from_key);
46127 +       to_key = from_key;
46128 +       set_key_offset(&to_key, cur_size - 1/*get_key_offset(max_key())*/);
46129 +       /* this loop normally runs just once */
46130 +       while (1) {
46131 +               result = reserve_cut_iteration(tree_by_inode(inode));
46132 +               if (result)
46133 +                       break;
46134 +
46135 +               result = cut_tree_object(current_tree, &from_key, &to_key,
46136 +                                        &smallest_removed, inode, mode);
46137 +               if (result == -E_REPEAT) {
46138 +                       /* -E_REPEAT is a signal to interrupt a long file truncation process */
46139 +                       INODE_SET_FIELD(inode, i_size, get_key_offset(&smallest_removed));
46140 +                       if (update_sd) {
46141 +                               inode->i_ctime = inode->i_mtime = CURRENT_TIME;
46142 +                               result = reiser4_update_sd(inode);
46143 +                               if (result)
46144 +                                       break;
46145 +                       }
46146 +
46147 +                       all_grabbed2free();
46148 +                       reiser4_release_reserved(inode->i_sb);
46149 +
46150 +                       /* cut_tree_object() was interrupted probably because
46151 +                        * current atom requires commit, we have to release
46152 +                        * transaction handle to allow atom commit. */
46153 +                       txn_restart_current();
46154 +                       continue;
46155 +               }
46156 +               if (result && !(result == CBK_COORD_NOTFOUND && new_size == 0 && inode->i_size == 0))
46157 +                       break; /* CBK_COORD_NOTFOUND is not an error when an empty file is cut down to 0 */
46158 +
46159 +               INODE_SET_FIELD(inode, i_size, new_size);
46160 +               if (update_sd) {
46161 +                       /* Final sd update after the file gets its correct size */
46162 +                       inode->i_ctime = inode->i_mtime = CURRENT_TIME;
46163 +                       result = reiser4_update_sd(inode);
46164 +               }
46165 +               break;
46166 +       }
46167 +
46168 +       all_grabbed2free(); /* every exit from the loop gets here, so the reservation is always dropped */
46169 +       reiser4_release_reserved(inode->i_sb);
46170 +
46171 +       return result;
46172 +}
46173 +
46174 +int find_or_create_extent(struct page *page);
46175 +
46176 +/* part of unix_file_truncate: called when truncate makes file shorter. Cuts items and zeroes rest of last page */
46177 +static int
46178 +shorten_file(struct inode *inode, loff_t new_size)
46179 +{
46180 +       int result;
46181 +       struct page *page;
46182 +       int padd_from;
46183 +       unsigned long index;
46184 +       char *kaddr;
46185 +
46186 +       /* all items of ordinary reiser4 file are grouped together. That is why we can use cut_tree. Plan B files (for
46187 +          instance) can not be truncated that simply */
46188 +       result = cut_file_items(inode, new_size, 1/*update_sd*/, get_key_offset(max_key()), 1);
46189 +       if (result)
46190 +               return result;
46191 +
46192 +       assert("vs-1105", new_size == inode->i_size);
46193 +       if (new_size == 0) {
46194 +               set_file_state_empty(inode);
46195 +               return 0;
46196 +       }
46197 +
46198 +       if (file_is_built_of_tails(inode))
46199 +               /* No need to worry about zeroing last page after new file end */
46200 +               return 0;
46201 +
46202 +       padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1); /* offset of new end of file within its page */
46203 +       if (!padd_from)
46204 +               /* file is truncated to page boundary */
46205 +               return 0;
46206 +
46207 +       result = reserve_partial_page(tree_by_inode(inode));
46208 +       if (result) {
46209 +               reiser4_release_reserved(inode->i_sb);
46210 +               return result;
46211 +       }
46212 +
46213 +       /* last page is partially truncated - zero its content */
46214 +       index = (inode->i_size >> PAGE_CACHE_SHIFT);
46215 +       page = read_cache_page(inode->i_mapping, index, readpage_unix_file/*filler*/, 0);
46216 +       if (IS_ERR(page)) {
46217 +               all_grabbed2free();
46218 +               reiser4_release_reserved(inode->i_sb);
46219 +               if (likely(PTR_ERR(page) == -EINVAL)) {
46220 +                       /* looks like file is built of tail items */
46221 +                       return 0;
46222 +               }
46223 +               return PTR_ERR(page);
46224 +       }
46225 +       wait_on_page_locked(page);
46226 +       if (!PageUptodate(page)) {
46227 +               all_grabbed2free();
46228 +               page_cache_release(page);
46229 +               reiser4_release_reserved(inode->i_sb);
46230 +               return RETERR(-EIO);
46231 +       }
46232 +
46233 +       /* if page corresponds to hole extent unit - unallocated one will be created here. This is not necessary */
46234 +       result = find_or_create_extent(page);
46235 +
46236 +       /* FIXME: cut_file_items has already updated inode. Probably it would be better to update it here when file is
46237 +          really truncated */
46238 +       all_grabbed2free();
46239 +       if (result) {
46240 +               page_cache_release(page);
46241 +               reiser4_release_reserved(inode->i_sb);
46242 +               return result;
46243 +       }
46244 +
46245 +       lock_page(page);
46246 +       assert("vs-1066", PageLocked(page));
46247 +       kaddr = kmap_atomic(page, KM_USER0);
46248 +       memset(kaddr + padd_from, 0, PAGE_CACHE_SIZE - padd_from); /* zero from new end of file to end of page */
46249 +       flush_dcache_page(page);
46250 +       kunmap_atomic(kaddr, KM_USER0);
46251 +       unlock_page(page);
46252 +       page_cache_release(page);
46253 +       reiser4_release_reserved(inode->i_sb);
46254 +       return 0;
46255 +}
46256 +
46257 +static loff_t
46258 +write_flow(struct file *, struct inode *, const char *buf, loff_t count, loff_t pos);
46259 +
45260 +/* it is called when truncate is used to make file longer and when write position is set past real end of file. It
45261 +   appends a hole to file @inode (write_flow() with NULL buffer) so that file size grows from inode->i_size to
45262 +   @new_size. It returns 0 on success, error code otherwise (-ENOSPC if write_flow() wrote less than required) */
45263 +static int
45264 +append_hole(struct inode *inode, loff_t new_size)
45265 +{
45266 +       int result;
45267 +       loff_t written;
45268 +       loff_t hole_size;
45269 +
45270 +       assert("vs-1107", inode->i_size < new_size);
45271 +
45272 +       result = 0;
45273 +       hole_size = new_size - inode->i_size;
45274 +       written = write_flow(NULL, inode, NULL/*buf*/, hole_size,
45275 +                            inode->i_size);
45276 +       if (written != hole_size) {
45277 +               /* file is not expanded as required: this is an error even if 0 bytes were written (hole_size > 0) */
45278 +               if (written >= 0)
45279 +                       result = RETERR(-ENOSPC);
45280 +               else
45281 +                       result = written;
45282 +       } else {
45283 +               assert("vs-1081", inode->i_size == new_size);
45284 +       }
45285 +       return result;
45286 +}
46287 +
46288 +/* make items of the file match @new_size: append hole (append_hole) if file is shorter than @new_size, otherwise cut
46289 +   file items (shorten_file). Returns result of the function called. It is used in unix_file_setattr when it is used
46290 +   to truncate (VS-FIXME-HANS: explain that)
46291 +   and in unix_file_delete */
46292 +static int
46293 +truncate_file_body(struct inode *inode, loff_t new_size)
46294 +{
46295 +       int result;
46296 +
46297 +       if (inode->i_size < new_size)
46298 +               result = append_hole(inode, new_size);
46299 +       else
46300 +               result = shorten_file(inode, new_size); /* also taken when size is already equal to @new_size */
46301 +
46302 +       return result;
46303 +}
46304 +
46305 +/* plugin->u.file.truncate
46306 +   does nothing and always returns 0 (@inode and @new_size are not used): all the work is done on
46307 +   reiser4_setattr->unix_file_setattr->truncate_file_body */
46308 +reiser4_internal int
46309 +truncate_unix_file(struct inode *inode, loff_t new_size)
46310 +{
46311 +       return 0;
46312 +}
46313 +
46314 +/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
46315 +
46316 +/* get access hint (seal, coord, key, level) stored in reiser4 private part of struct file if it was stored in a
46317 +   previous access to the file, otherwise set @hint up with hint_init_zero(). Returns 0 or error of fsdata lookup */
46318 +reiser4_internal int
46319 +load_file_hint(struct file *file, hint_t *hint, lock_handle *lh)
46320 +{
46321 +       reiser4_file_fsdata *fsdata;
46322 +
46323 +       if (file) {
46324 +               fsdata = reiser4_get_file_fsdata(file);
46325 +               if (IS_ERR(fsdata))
46326 +                       return PTR_ERR(fsdata);
46327 +
46328 +               if (seal_is_set(&fsdata->reg.hint.seal)) {
46329 +                       *hint = fsdata->reg.hint;
46330 +                       hint->coord.lh = lh; /* copied hint is attached to caller's lock handle */
46331 +                       /* force re-validation of the coord on the first
46332 +                        * iteration of the read/write loop. */
46333 +                       hint->coord.valid = 0;
46334 +                       return 0;
46335 +               }
46336 +               xmemset(&fsdata->reg.hint, 0, sizeof(hint_t)); /* seal is not set: clear whole stored hint */
46337 +       }
46338 +       hint_init_zero(hint, lh); /* @file is NULL or it has no sealed hint */
46339 +       return 0;
46340 +}
46341 +
46342 +
46343 +/* this copies hint for future tree accesses back to reiser4 private part of
46344 +   struct file. Nothing is done if @file is NULL or if seal of @hint is not set */
46345 +reiser4_internal void
46346 +save_file_hint(struct file *file, const hint_t *hint)
46347 +{
46348 +       reiser4_file_fsdata *fsdata;
46349 +
46350 +       if (!file || !seal_is_set(&hint->seal))
46351 +               return;
46352 +
46353 +       fsdata = reiser4_get_file_fsdata(file);
46354 +       assert("vs-965", !IS_ERR(fsdata)); /* NOTE(review): lookup failure is checked by assertion only */
46355 +       fsdata->reg.hint = *hint;
46356 +       return;
46357 +}
46358 +/* finish with seal of @hint (seal_done); presumably hint_is_set() returns 0 after this - TODO confirm */
46359 +reiser4_internal void
46360 +unset_hint(hint_t *hint)
46361 +{
46362 +       assert("vs-1315", hint);
46363 +       seal_done(&hint->seal);
46364 +}
46365 +
46366 +/* seal @hint for @key and record key offset, level of coord's node and lock @mode. Coord must be set properly already */
46367 +reiser4_internal void
46368 +set_hint(hint_t *hint, const reiser4_key *key, znode_lock_mode mode)
46369 +{
46370 +       ON_DEBUG(coord_t *coord = &hint->coord.base_coord);
46371 +       assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
46372 +
46373 +       seal_init(&hint->seal, &hint->coord.base_coord, key);
46374 +       hint->offset = get_key_offset(key);
46375 +       hint->level = znode_get_level(hint->coord.base_coord.node);
46376 +       hint->mode = mode; /* hint_validate() rejects hint when it is asked for with different lock mode */
46377 +}
46378 +/* returns non-zero if seal of @hint is set */
46379 +reiser4_internal int
46380 +hint_is_set(const hint_t *hint)
46381 +{
46382 +       return seal_is_set(&hint->seal);
46383 +}
46384 +/* debugging helper: non-zero if @k1 and @k2 have equal locality, type, band, ordering and objectid (offset is ignored) */
46385 +#if REISER4_DEBUG
46386 +static int all_but_offset_key_eq(const reiser4_key *k1, const reiser4_key *k2)
46387 +{
46388 +       return (get_key_locality(k1) == get_key_locality(k2) &&
46389 +               get_key_type(k1) == get_key_type(k2) &&
46390 +               get_key_band(k1) == get_key_band(k2) &&
46391 +               get_key_ordering(k1) == get_key_ordering(k2) &&
46392 +               get_key_objectid(k1) == get_key_objectid(k2));
46393 +}
46394 +#endif
46395 +/* try to reuse @hint for @key: returns -E_REPEAT if hint cannot be used, otherwise result of seal_validate() */
46396 +reiser4_internal int
46397 +hint_validate(hint_t *hint, const reiser4_key *key, int check_key, znode_lock_mode lock_mode)
46398 +{
46399 +       if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
46400 +               /* there is no hint, or hint is not set, or it was set with different lock mode */
46401 +               return RETERR(-E_REPEAT);
46402 +
46403 +       assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
46404 +
46405 +       if (check_key && get_key_offset(key) != hint->offset)
46406 +               /* hint is set for different offset; offsets are compared only when @check_key is not 0 */
46407 +               return RETERR(-E_REPEAT);
46408 +
46409 +       return seal_validate(&hint->seal, &hint->coord.base_coord, key,
46410 +                            hint->level, hint->coord.lh,
46411 +                            FIND_MAX_NOT_MORE_THAN,
46412 +                            lock_mode, ZNODE_LOCK_LOPRI);
46413 +}
46414 +
46415 +/* look for place at twig level for extent corresponding to page, call extent's capture method (s.file.capture) to
46416 +   create unallocated extent if it does not exist yet, initialize jnode, capture page. Returns 0 or error code */
46417 +reiser4_internal int
46418 +find_or_create_extent(struct page *page)
46419 +{
46420 +       int result;
46421 +       uf_coord_t uf_coord;
46422 +       coord_t *coord;
46423 +       lock_handle lh;
46424 +       reiser4_key key;
46425 +       item_plugin *iplug;
46426 +       znode *loaded;
46427 +       struct inode *inode;
46428 +
46429 +       reiser4_stat_inc(file.page_ops.writepage_calls);
46430 +
46431 +       assert("vs-1065", page->mapping && page->mapping->host);
46432 +       inode = page->mapping->host;
46433 +
46434 +       /* get key of first byte of the page */
46435 +       key_by_inode_unix_file(inode, (loff_t) page->index << PAGE_CACHE_SHIFT, &key);
46436 +
46437 +       init_uf_coord(&uf_coord, &lh);
46438 +       coord = &uf_coord.base_coord;
46439 +
46440 +       result = find_file_item_nohint(coord, &lh, &key, ZNODE_WRITE_LOCK, inode);
46441 +       if (IS_CBKERR(result)) {
46442 +               done_lh(&lh);
46443 +               return result;
46444 +       }
46445 +
46446 +       /*coord_clear_iplug(coord);*/
46447 +       result = zload(coord->node);
46448 +       if (result) {
46449 +               done_lh(&lh);
46450 +               return result;
46451 +       }
46452 +       loaded = coord->node; /* remembered for zrelse: presumably capture may move coord to another node - TODO confirm */
46453 +
46454 +       /* get plugin of extent item */
46455 +       iplug = item_plugin_by_id(EXTENT_POINTER_ID);
46456 +       result = iplug->s.file.capture(&key, &uf_coord, page, how_to_write(&uf_coord, &key));
46457 +       assert("vs-429378", result != -E_REPEAT);
46458 +       zrelse(loaded);
46459 +       done_lh(&lh); /* lock handle is released on every return path of this function */
46460 +       return result;
46461 +}
46462 +/* non-zero if jnode tree of @inode has jnodes tagged EFLUSH_TAG_ANONYMOUS or EFLUSH_TAG_CAPTURED; 0 without eflush */
46463 +#if REISER4_USE_EFLUSH
46464 +static int inode_has_eflushed_jnodes(struct inode * inode)
46465 +{
46466 +       reiser4_tree * tree = &get_super_private(inode->i_sb)->tree;
46467 +       int ret;
46468 +
46469 +       RLOCK_TREE(tree); /* tags of jnode tree are checked under tree read lock */
46470 +       ret = (radix_tree_tagged(jnode_tree_by_inode(inode), EFLUSH_TAG_ANONYMOUS) ||
46471 +              radix_tree_tagged(jnode_tree_by_inode(inode), EFLUSH_TAG_CAPTURED));
46472 +       RUNLOCK_TREE(tree);
46473 +       return ret;
46474 +}
46475 +# else
46476 +#define inode_has_eflushed_jnodes(inode) (0)
46477 +#endif
46478 +
46479 +/* Check mapping for existence of not captured dirty pages. This returns !0 if either page tree contains pages tagged
46480 +   PAGECACHE_TAG_REISER4_MOVED or if inode has eflushed jnodes (see inode_has_eflushed_jnodes) */
46481 +static int
46482 +inode_has_anonymous_pages(struct inode *inode)
46483 +{
46484 +       return (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_REISER4_MOVED) ||
46485 +               inode_has_eflushed_jnodes(inode));
46486 +}
46487 +/* grab disk space and call find_or_create_extent(@page); on failure page is marked with SetPageError. Returns 0 or error */
46488 +static int
46489 +capture_page_and_create_extent(struct page *page)
46490 +{
46491 +       int result;
46492 +       struct inode *inode;
46493 +
46494 +       assert("vs-1084", page->mapping && page->mapping->host);
46495 +       inode = page->mapping->host;
46496 +       assert("vs-1139", file_is_built_of_extents(inode));
46497 +       /* page belongs to file */
46498 +       assert("vs-1393", inode->i_size > ((loff_t) page->index << PAGE_CACHE_SHIFT));
46499 +
46500 +       /* page capture may require extent creation (if it does not exist yet) and stat data's update (number of blocks
46501 +          changes on extent creation) */
46502 +       grab_space_enable ();
46503 +       result = reiser4_grab_space(2 * estimate_one_insert_into_item(tree_by_inode(inode)), BA_CAN_COMMIT);
46504 +       if (likely(!result))
46505 +               result = find_or_create_extent(page);
46506 +
46507 +       all_grabbed2free(); /* called whether or not space was grabbed and extent was created */
46508 +       if (result != 0)
46509 +               SetPageError(page);
46510 +       return result;
46511 +}
46512 +
46513 +/* plugin->u.file.capturepage handler: @page is unlocked while it is captured and is locked again before return */
46514 +reiser4_internal int
46515 +capturepage_unix_file(struct page * page) {
46516 +       int result;
46517 +
46518 +       page_cache_get(page); /* reference is held for the time page is not locked */
46519 +       unlock_page(page);
46520 +       result = capture_page_and_create_extent(page);
46521 +       lock_page(page);
46522 +       page_cache_release(page);
46523 +       return result;
46524 +}
46525 +/* set I_DIRTY bits in @inode->i_state under inode_lock */
46526 +static void
46527 +redirty_inode(struct inode *inode)
46528 +{
46529 +       spin_lock(&inode_lock);
46530 +       inode->i_state |= I_DIRTY;
46531 +       spin_unlock(&inode_lock);
46532 +}
46533 +
46534 +/*
46535 + * Support for "anonymous" pages and jnodes.
46536 + *
46537 + * When file is write-accessed through mmap pages can be dirtied from the user
46538 + * level. In this case kernel is not notified until one of following happens:
46539 + *
46540 + *     (1) msync()
46541 + *
46542 + *     (2) truncate() (either explicit or through unlink)
46543 + *
46544 + *     (3) VM scanner starts reclaiming mapped pages, dirtying them before
46545 + *     starting write-back.
46546 + *
46547 + * As a result of (3) ->writepage may be called on a dirty page without
46548 + * jnode. Such page is called "anonymous" in reiser4. Certain work-loads
46549 + * (iozone) generate huge number of anonymous pages. Emergency flush handles
46550 + * this situation by creating jnode for anonymous page, starting IO on the
46551 + * page, and marking jnode with JNODE_KEEPME bit so that it's not thrown out of
46552 + * memory. Such jnode is also called anonymous.
46553 + *
46554 + * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into
46555 + * tree. This is done by capture_anonymous_*() functions below.
46556 + *
46557 + */
46558 +
46559 +/* this returns 1 if it captured page, 0 if page is under writeback (it is skipped), error code otherwise; @keepme is unused */
46560 +static int
46561 +capture_anonymous_page(struct page *pg, int keepme)
46562 +{
46563 +       struct address_space *mapping;
46564 +       jnode *node;
46565 +       int result;
46566 +
46567 +       if (PageWriteback(pg))
46568 +               /* FIXME: do nothing? */
46569 +               return 0;
46570 +
46571 +       mapping = pg->mapping;
46572 +
46573 +       lock_page(pg);
46574 +       /* page is guaranteed to be in the mapping, because we are operating under rw-semaphore. */
46575 +       assert("nikita-3336", pg->mapping == mapping);
46576 +       node = jnode_of_page(pg);
46577 +       unlock_page(pg);
46578 +       if (!IS_ERR(node)) {
46579 +               result = jload(node);
46580 +               assert("nikita-3334", result == 0); /* NOTE(review): jload failure is checked by assertion only */
46581 +               assert("nikita-3335", jnode_page(node) == pg);
46582 +               result = capture_page_and_create_extent(pg);
46583 +               if (result == 0) {
46584 +                       /*
46585 +                        * node will be captured into atom by
46586 +                        * capture_page_and_create_extent(). Atom
46587 +                        * cannot commit (because we have open
46588 +                        * transaction handle), and node cannot be
46589 +                        * truncated, because we have non-exclusive
46590 +                        * access to the file.
46591 +                        */
46592 +                       assert("nikita-3327", node->atom != NULL);
46593 +                       JF_CLR(node, JNODE_KEEPME);
46594 +                       result = 1; /* tell caller that page was captured */
46595 +               } else
46596 +                       warning("nikita-3329",
46597 +                               "Cannot capture anon page: %i", result);
46598 +               jrelse(node);
46599 +               jput(node); /* drop reference obtained from jnode_of_page() */
46600 +       } else
46601 +               result = PTR_ERR(node);
46602 +
46603 +       return result;
46604 +}
46605 +
46606 +/* upper limit on number of pages captured by one call to capture_anonymous_pages() */
46607 +#define CAPTURE_APAGE_BURST      (1024)
46608 +/* capture up to CAPTURE_APAGE_BURST anonymous pages/jnodes of @mapping starting at *@index; returns 0 or error code */
46609 +static int
46610 +capture_anonymous_pages(struct address_space *mapping, pgoff_t *index, long *captured)
46611 +{
46612 +       int result;
46613 +       unsigned to_capture;
46614 +       struct pagevec pvec;
46615 +       unsigned found_pages;
46616 +       jnode *jvec[PAGEVEC_SIZE];
46617 +       unsigned found_jnodes;
46618 +       pgoff_t cur, end;
46619 +       unsigned count;
46620 +       reiser4_tree * tree;
46621 +       unsigned i;
46622 +
46623 +       result = 0;
46624 +
46625 +       ON_TRACE(TRACE_CAPTURE_ANONYMOUS,
46626 +                "capture anonymous: oid %llu: start index %lu\n",
46627 +                get_inode_oid(mapping->host), *index);
46628 +
46629 +       to_capture = CAPTURE_APAGE_BURST;
46630 +       found_jnodes = 0;
46631 +       tree = &get_super_private(mapping->host->i_sb)->tree;
46632 +
46633 +       do {
46634 +               pagevec_init(&pvec, 0);
46635 +
46636 +               cur = *index;
46637 +               count = min(pagevec_space(&pvec), to_capture);
46638 +
46639 +               /* find and capture "anonymous" pages */
46640 +               found_pages = pagevec_lookup_tag(&pvec, mapping, index, PAGECACHE_TAG_REISER4_MOVED, count);
46641 +               if (found_pages != 0) {
46642 +                       ON_TRACE(TRACE_CAPTURE_ANONYMOUS,
46643 +                                "oid %llu: found %u moved pages in range starting from (%lu)\n",
46644 +                                get_inode_oid(mapping->host), found_pages, cur);
46645 +
46646 +                       for (i = 0; i < pagevec_count(&pvec); i ++) {
46647 +                               /* tag PAGECACHE_TAG_REISER4_MOVED will be cleared by set_page_dirty_internal which is
46648 +                                  called when jnode is captured */
46649 +                               result = capture_anonymous_page(pvec.pages[i], 0);
46650 +                               if (result == 1) {
46651 +                                       (*captured) ++;
46652 +                                       result = 0;
46653 +                                       to_capture --;
46654 +                               } else if (result < 0) {
46655 +                                       warning("vs-1454", "failed for moved page: result=%i (captured=%u)\n",
46656 +                                               result, CAPTURE_APAGE_BURST - to_capture);
46657 +                                       break;
46658 +                               } else {
46659 +                                       /* result == 0. capture_anonymous_page returns 0 for Writeback-ed page */
46660 +                                       ;
46661 +                               }
46662 +                       }
46663 +                       pagevec_release(&pvec);
46664 +                       if (result)
46665 +                               return result;
46666 +
46667 +                       end = *index;
46668 +               } else
46669 +                       /* there are no more anonymous pages, continue with anonymous jnodes only */
46670 +                       end = (pgoff_t)-1;
46671 +
46672 +#if REISER4_USE_EFLUSH
46673 +
46674 +               /* capture anonymous jnodes between cur and end */
46675 +               while (cur < end && to_capture > 0) {
46676 +                       pgoff_t nr_jnodes;
46677 +
46678 +                       nr_jnodes = min(to_capture, (unsigned)PAGEVEC_SIZE);
46679 +
46680 +                       /* spin_lock_eflush(mapping->host->i_sb); */
46681 +                       RLOCK_TREE(tree);
46682 +
46683 +                       found_jnodes = radix_tree_gang_lookup_tag(jnode_tree_by_inode(mapping->host),
46684 +                                                                 (void **)&jvec, cur, nr_jnodes,
46685 +                                                                 EFLUSH_TAG_ANONYMOUS);
46686 +                       if (found_jnodes != 0) {
46687 +                               for (i = 0; i < found_jnodes; i ++) {
46688 +                                       if (index_jnode(jvec[i]) < end) {
46689 +                                               jref(jvec[i]);
46690 +                                               cur = index_jnode(jvec[i]) + 1;
46691 +                                       } else {
46692 +                                               found_jnodes = i;
46693 +                                               break;
46694 +                                       }
46695 +                               }
46696 +
46697 +                               if (found_jnodes != 0) {
46698 +                                       /* there are anonymous jnodes from given range */
46699 +                                       /* spin_unlock_eflush(mapping->host->i_sb); */
46700 +                                       RUNLOCK_TREE(tree);
46701 +
46702 +                                       ON_TRACE(TRACE_CAPTURE_ANONYMOUS,
46703 +                                                "oid %llu: found %u anonymous jnodes in range (%lu %lu)\n",
46704 +                                                get_inode_oid(mapping->host), found_jnodes, cur, end - 1);
46705 +
46706 +                                       /* start i/o for eflushed nodes */
46707 +                                       for (i = 0; i < found_jnodes; i ++)
46708 +                                               jstartio(jvec[i]);
46709 +
46710 +                                       for (i = 0; i < found_jnodes; i ++) {
46711 +                                               result = jload(jvec[i]);
46712 +                                               if (result == 0) {
46713 +                                                       result = capture_anonymous_page(jnode_page(jvec[i]), 0);
46714 +                                                       if (result == 1) {
46715 +                                                               (*captured) ++;
46716 +                                                               result = 0;
46717 +                                                               to_capture --;
46718 +                                                       } else if (result < 0) {
46719 +                                                               jrelse(jvec[i]);
46720 +                                                               warning("nikita-3328",
46721 +                                                                       "failed for anonymous jnode: result=%i (captured=%u)\n",
46722 +                                                                       result, CAPTURE_APAGE_BURST - to_capture);
46723 +                                                               break;
46724 +                                                       } else {
46725 +                                                               /* result == 0. capture_anonymous_page returns 0 for Writeback-ed page */
46726 +                                                               ;
46727 +                                                       }
46728 +                                                       jrelse(jvec[i]);
46729 +                                               } else {
46730 +                                                       warning("vs-1454", "jload for anonymous jnode failed: captured %u, result=%i\n",
46731 +                                                               CAPTURE_APAGE_BURST - to_capture, result); /* order matches format */
46732 +                                                       break;
46733 +                                               }
46734 +                                       }
46735 +                                       for (i = 0; i < found_jnodes; i ++)
46736 +                                               jput(jvec[i]);
46737 +                                       if (result)
46738 +                                               return result;
46739 +                                       continue;
46740 +                               }
46741 +                       }
46742 +                       RUNLOCK_TREE(tree);
46743 +                       /* spin_unlock_eflush(mapping->host->i_sb);*/
46744 +                       ON_TRACE(TRACE_CAPTURE_ANONYMOUS,
46745 +                                "oid %llu: no anonymous jnodes are found\n", get_inode_oid(mapping->host));
46746 +                       break;
46747 +               }
46748 +#endif /* REISER4_USE_EFLUSH */
46749 +       } while (to_capture && (found_pages || found_jnodes) && result == 0);
46750 +
46751 +       if (result) {
46752 +               warning("vs-1454", "Cannot capture anon pages: result=%i (captured=%u)\n",
46753 +                       result, CAPTURE_APAGE_BURST - to_capture);
46754 +               return result;
46755 +       }
46756 +
46757 +       assert("vs-1678", to_capture <= CAPTURE_APAGE_BURST);
46758 +       if (to_capture == 0)
46759 +               /* there may be left more pages */
46760 +               redirty_inode(mapping->host);
46761 +
46762 +       ON_TRACE(TRACE_CAPTURE_ANONYMOUS,
46763 +                "capture anonymous: oid %llu: end index %lu, captured %u\n",
46764 +                get_inode_oid(mapping->host), *index, CAPTURE_APAGE_BURST - to_capture);
46765 +
46766 +       return 0;
46767 +}
46768 +
46769 +/*
46770 + * Commit atom of the jnode of a page; repeated while sync_atom() returns -E_REPEAT.
46771 + */
46772 +static int
46773 +sync_page(struct page *page)
46774 +{
46775 +       int result;
46776 +       do {
46777 +               jnode *node;
46778 +               txn_atom *atom;
46779 +
46780 +               lock_page(page); /* jnode of the page is looked up under page lock */
46781 +               node = jprivate(page);
46782 +               if (node != NULL)
46783 +                       atom = UNDER_SPIN(jnode, node, jnode_get_atom(node));
46784 +               else
46785 +                       atom = NULL; /* page has no jnode: sync_atom() is called with NULL atom */
46786 +               unlock_page(page);
46787 +               result = sync_atom(atom);
46788 +       } while (result == -E_REPEAT);
46789 +       assert("nikita-3485", ergo(result == 0,
46790 +                                  get_current_context()->trans->atom == NULL));
46791 +       return result;
46792 +}
46793 +
46794 +/*
46795 + * Commit atoms of all pages of @inode: walk page tree of inode's mapping in index order and
46796 + * call sync_page for each page found. Walk stops on first non-zero result, which is returned.
46797 + */
46798 +static int
46799 +sync_page_list(struct inode *inode)
46800 +{
46801 +       int result;
46802 +       struct address_space *mapping;
46803 +       unsigned long from; /* start index for radix_tree_gang_lookup */
46804 +       unsigned int found; /* return value for radix_tree_gang_lookup */
46805 +
46806 +       mapping = inode->i_mapping;
46807 +       from = 0;
46808 +       result = 0;
46809 +       read_lock_irq(&mapping->tree_lock);
46810 +       while (result == 0) {
46811 +               struct page *page;
46812 +
46813 +               found = radix_tree_gang_lookup(&mapping->page_tree, (void **)&page, from, 1);
46814 +               assert("", found < 2); /* only one page is asked for at a time */
46815 +               if (found == 0)
46816 +                       break;
46817 +
46818 +               /* page may not leave radix tree because it is protected from truncating by inode->i_sem downed by
46819 +                  sys_fsync */
46820 +               page_cache_get(page);
46821 +               read_unlock_irq(&mapping->tree_lock); /* tree lock is not held across sync_page() */
46822 +
46823 +               from = page->index + 1; /* next lookup starts after this page */
46824 +
46825 +               result = sync_page(page);
46826 +
46827 +               page_cache_release(page);
46828 +               read_lock_irq(&mapping->tree_lock);
46829 +       }
46830 +
46831 +       read_unlock_irq(&mapping->tree_lock);
46832 +       return result;
46833 +}
46834 +/* commit atoms holding data of @inode; what is committed depends on file container. Returns 0 or error code */
46835 +static int
46836 +commit_file_atoms(struct inode *inode)
46837 +{
46838 +       int               result;
46839 +       unix_file_info_t *uf_info;
46840 +       reiser4_context  *ctx;
46841 +
46842 +       /*
46843 +        * close current transaction
46844 +        */
46845 +
46846 +       ctx = get_current_context();
46847 +       txn_restart(ctx);
46848 +
46849 +       uf_info = unix_file_inode_data(inode);
46850 +
46851 +       /*
46852 +        * finish extent<->tail conversion if necessary
46853 +        */
46854 +
46855 +       get_exclusive_access(uf_info);
46856 +       if (inode_get_flag(inode, REISER4_PART_CONV)) {
46857 +               result = finish_conversion(inode);
46858 +               if (result != 0) {
46859 +                       drop_exclusive_access(uf_info);
46860 +                       return result;
46861 +               }
46862 +       }
46863 +
46864 +       /*
46865 +        * find what items file is made from
46866 +        */
46867 +
46868 +       result = find_file_state(uf_info);
46869 +       drop_exclusive_access(uf_info);
46870 +       if (result != 0)
46871 +               return result;
46872 +
46873 +       /*
46874 +        * file state cannot change because we are under ->i_sem
46875 +        */
46876 +
46877 +       switch(uf_info->container) {
46878 +       case UF_CONTAINER_EXTENTS:
46879 +               result =
46880 +                       /*
46881 +                        * when we are called by
46882 +                        * filemap_fdatawrite->
46883 +                        *    do_writepages()->
46884 +                        *       reiser4_writepages()
46885 +                        *
46886 +                        * inode->i_mapping->dirty_pages are spliced into
46887 +                        * ->io_pages, leaving ->dirty_pages dirty.
46888 +                        *
46889 +                        * When we are called from
46890 +                        * reiser4_fsync()->sync_unix_file(), we have to
46891 +                        * commit atoms of all pages on the ->dirty_list.
46892 +                        *
46893 +                        * So for simplicity we just commit atoms of all pages
46894 +                        * found in the page tree of the mapping.
46895 +                        */
46896 +                       sync_page_list(inode);
46897 +               break;
46898 +       case UF_CONTAINER_TAILS:
46899 +               /*
46900 +                * NOTE-NIKITA probably we can be smarter for tails. For now
46901 +                * just commit all existing atoms.
46902 +                */
46903 +               result = txnmgr_force_commit_all(inode->i_sb, 0);
46904 +               break;
46905 +       case UF_CONTAINER_EMPTY:
46906 +               result = 0;
46907 +               break;
46908 +       case UF_CONTAINER_UNKNOWN:
46909 +       default:
46910 +               result = -EIO; /* container is unknown or has unexpected value */
46911 +               break;
46912 +       }
46913 +
46914 +       /*
46915 +        * commit current transaction: there can be captured nodes from
46916 +        * find_file_state() and finish_conversion().
46917 +        */
46918 +       txn_restart(ctx);
46919 +       return result;
46920 +}
46921 +
46922 +/*
46923 + * this file plugin method is called to capture into current atom all
46924 + * "anonymous pages", that is, pages modified through mmap(2). For each such
46925 + * page this function creates jnode, captures this jnode, and creates (or
46926 + * modifies) extent. Some anonymous pages can be emergency flushed; to cope
46927 + * with this, eflushed jnodes of this inode are scanned too. Returns 0 or error
46928 + * code (-EBUSY if file latch cannot be taken from within reiser4 context).
46929 + */
46930 +reiser4_internal int
46931 +capture_unix_file(struct inode *inode, const struct writeback_control *wbc, long *captured)
46932 +{
46933 +       int               result;
46934 +       unix_file_info_t *uf_info;
46935 +       pgoff_t index;
46936 +
46937 +       if (!inode_has_anonymous_pages(inode))
46938 +               return 0;
46939 +
46940 +       result = 0;
46941 +       index = 0;
46942 +       do {
46943 +               reiser4_context ctx;
46944 +
46945 +               uf_info = unix_file_inode_data(inode);
46946 +               /*
46947 +                * locking: creation of extent requires read-semaphore on
46948 +                * file. _But_, this function can also be called in the
46949 +                * context of write system call from
46950 +                * balance_dirty_pages(). So, write keeps semaphore (possible
46951 +                * in write mode) on file A, and this function tries to
46952 +                * acquire semaphore on (possibly) different file B. A/B
46953 +                * deadlock is on a way. To avoid this try-lock is used
46954 +                * here. When invoked from sys_fsync() and sys_fdatasync(),
46955 +                * this function is out of reiser4 context and may safely
46956 +                * sleep on semaphore.
46957 +                */
46958 +               if (is_in_reiser4_context()) {
46959 +                       if (down_read_trylock(&uf_info->latch) == 0) {
46960 +                               result = RETERR(-EBUSY);
46961 +                               break; /* context is not initialized yet, so there is nothing to exit */
46962 +                       }
46963 +               } else
46964 +                       down_read(&uf_info->latch);
46965 +
46966 +               init_context(&ctx, inode->i_sb);
46967 +               /* avoid recursive calls to ->sync_inodes */
46968 +               ctx.nobalance = 1;
46969 +               assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
46970 +
46971 +               LOCK_CNT_INC(inode_sem_r);
46972 +
46973 +               result = capture_anonymous_pages(inode->i_mapping, &index, captured);
46974 +               up_read(&uf_info->latch);
46975 +               LOCK_CNT_DEC(inode_sem_r);
46976 +               if (result != 0 || wbc->sync_mode != WB_SYNC_ALL) {
46977 +                       reiser4_exit_context(&ctx);
46978 +                       break; /* error, or atoms are not to be committed: one pass only */
46979 +               }
46980 +               result = commit_file_atoms(inode);
46981 +               reiser4_exit_context(&ctx);
46982 +       } while (result == 0 && inode_has_anonymous_pages(inode) /* FIXME: it should be: there are anonymous pages with
46983 +                                                                   page->index >= index */);
46984 +
46985 +       return result;
46986 +}
46987 +
46988 +/*
46989 + * ->sync() method for unix file.
46990 + * Unless @datasync is set, atom of file's stat data is committed too. Returns 0 or error code.
46991 + * We are trying to be smart here. Instead of committing all atoms (original
46992 + * solution), we scan dirty pages of this file and commit all atoms they are
46993 + * part of.
46994 + *
46995 + * Situation is complicated by anonymous pages: i.e., extent-less pages
46996 + * dirtied through mmap. Fortunately sys_fsync() first calls
46997 + * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert
46998 + * all missing extents and capture anonymous pages.
46999 + */
47000 +reiser4_internal int
47001 +sync_unix_file(struct inode *inode, int datasync)
47002 +{
47003 +       int result;
47004 +       reiser4_context *ctx;
47005 +
47006 +       ctx = get_current_context();
47007 +       assert("nikita-3486", ctx->trans->atom == NULL);
47008 +       result = commit_file_atoms(inode);
47009 +       assert("nikita-3484", ergo(result == 0, ctx->trans->atom == NULL));
47010 +       if (result == 0 && !datasync) {
47011 +               do {
47012 +                       /* commit "meta-data"---stat data in our case */
47013 +                       lock_handle lh;
47014 +                       coord_t coord;
47015 +                       reiser4_key key;
47016 +
47017 +                       coord_init_zero(&coord);
47018 +                       init_lh(&lh);
47019 +                       /* locate stat-data in a tree and return with znode
47020 +                        * locked */
47021 +                       result = locate_inode_sd(inode, &key, &coord, &lh);
47022 +                       if (result == 0) {
47023 +                               jnode    *node;
47024 +                               txn_atom *atom;
47025 +
47026 +                               node = jref(ZJNODE(coord.node)); /* reference keeps node while lock is released */
47027 +                               done_lh(&lh);
47028 +                               txn_restart(ctx);
47029 +                               LOCK_JNODE(node);
47030 +                               atom = jnode_get_atom(node);
47031 +                               UNLOCK_JNODE(node);
47032 +                               result = sync_atom(atom);
47033 +                               jput(node);
47034 +                       } else
47035 +                               done_lh(&lh);
47036 +               } while (result == -E_REPEAT); /* whole lookup is redone when -E_REPEAT is returned */
47037 +       }
47038 +       return result;
47039 +}
47040 +
47041 +/* plugin->u.file.readpage
47042 +   @page must be locked and must not lie past the end of file. This is called either via page fault, and in that case
47043 +   @vp is struct file *file, or on truncate, when last page of a file is read for its partial truncate (@vp is 0)
47044 +*/
47045 +reiser4_internal int
47046 +readpage_unix_file(void *vp, struct page *page)
47047 +{
47048 +       int result;
47049 +       struct inode *inode;
47050 +       lock_handle lh;
47051 +       reiser4_key key;
47052 +       item_plugin *iplug;
47053 +       hint_t hint;
47054 +       coord_t *coord;         /* points into @hint */
47055 +       struct file *file;      /* @vp; may be 0, see above */
47056 +
47057 +
47058 +       reiser4_stat_inc(file.page_ops.readpage_calls);
47059 +
47060 +       assert("vs-1062", PageLocked(page));
47061 +       assert("vs-1061", page->mapping && page->mapping->host);
47062 +       assert("vs-1078", (page->mapping->host->i_size > ((loff_t) page->index << PAGE_CACHE_SHIFT)));
47063 +
47064 +       inode = page->mapping->host;
47065 +
47066 +       file = vp;
47067 +       result = load_file_hint(file, &hint, &lh);
47068 +       if (result)
47069 +               return result;
47070 +
47071 +       /* get key of first byte of the page */
47072 +       key_by_inode_unix_file(inode, (loff_t) page->index << PAGE_CACHE_SHIFT, &key);
47073 +
47074 +       /* look for file metadata corresponding to first byte of page; page lock is dropped around the tree lookup */
47075 +       unlock_page(page);
47076 +       result = find_file_item(&hint, &key, ZNODE_READ_LOCK, 0/* ra_info */, inode);
47077 +       lock_page(page);
47078 +       if (result != CBK_COORD_FOUND) {
47079 +               /* this indicates file corruption. NOTE(review): page stays locked here - presumably caller unlocks */
47080 +               done_lh(&lh);
47081 +               return result;
47082 +       }
47083 +       /* presumably the page could have been read in by somebody else while it was unlocked above */
47084 +       if (PageUptodate(page)) {
47085 +               done_lh(&lh);
47086 +               unlock_page(page);
47087 +               return 0;
47088 +       }
47089 +
47090 +       coord = &hint.coord.base_coord;
47091 +       result = zload(coord->node);
47092 +       if (result) {
47093 +               done_lh(&lh);
47094 +               return result;
47095 +       }
47096 +       if (!hint.coord.valid)
47097 +               validate_extended_coord(&hint.coord, (loff_t) page->index << PAGE_CACHE_SHIFT);
47098 +
47099 +       if (!coord_is_existing_unit(coord)) {
47100 +               /* this indicates corruption; @result printed below is the (zero) result of zload() */
47101 +               warning("vs-280",
47102 +                       "Looking for page %lu of file %llu (size %lli). "
47103 +                       "No file items found (%d). "
47104 +                       "File is corrupted?\n",
47105 +                       page->index, get_inode_oid(inode), inode->i_size, result);
47106 +               zrelse(coord->node);
47107 +               done_lh(&lh);
47108 +               return RETERR(-EIO);
47109 +       }
47110 +
47111 +       /* read the page with the readpage method of the plugin of the found
47112 +          item; -EINVAL if that plugin has no such method */
47113 +       iplug = item_plugin_by_coord(coord);
47114 +       if (iplug->s.file.readpage)
47115 +               result = iplug->s.file.readpage(coord, page);
47116 +       else
47117 +               result = RETERR(-EINVAL);
47118 +
47119 +       if (!result) {
47120 +               set_key_offset(&key, (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT);
47121 +               /* FIXME should call set_hint(); until then @key set above is unused and both branches drop the hint */
47122 +               unset_hint(&hint);
47123 +       } else
47124 +               unset_hint(&hint);
47125 +       zrelse(coord->node);
47126 +       done_lh(&lh);
47127 +
47128 +       save_file_hint(file, &hint);
47129 +
47130 +       assert("vs-979", ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
47131 +
47132 +       return result;
47133 +}
47134 +
47135 +/* non-zero when a file of @new_size bytes has to be stored in unformatted
47136 +   nodes: no formatting plugin is set, or the plugin wants no tail at that size */
47137 +/* Audited by: green(2002.06.15) */
47138 +static int
47139 +should_have_notail(const unix_file_info_t *uf_info, loff_t new_size)
47140 +{
47141 +       /* without a formatting plugin the answer is always "no tail" */
47142 +       if (uf_info->tplug == NULL)
47143 +               return 1;
47144 +       /* otherwise the plugin decides for this size */
47145 +       return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info), new_size);
47146 +}
47147 +
47148 +/* blocks to reserve for a read: one block, because the stat data item gets updated */
47149 +static reiser4_block_nr
47150 +unix_file_estimate_read(struct inode *inode, loff_t count UNUSED_ARG)
47151 +{
47152 +       /* the file plugin is expected to use the common update estimate, so call it directly */
47153 +       assert("vs-1249", inode_file_plugin(inode)->estimate.update == estimate_update_common);
47154 +       return estimate_update_common(inode);
47155 +}
47156 +
47157 +/* plugin->u.file.read
47158 +
47159 +   the read method for the unix_file plugin: copies up to @read_amount bytes starting at *@off into user buffer @buf
47160 +   and advances *@off. Returns number of bytes read or, if nothing was read, 0 or an error code
47161 +*/
47162 +reiser4_internal ssize_t
47163 +read_unix_file(struct file *file, char *buf, size_t read_amount, loff_t *off)
47164 +{
47165 +       int result;
47166 +       struct inode *inode;
47167 +       flow_t f;               /* what is left to read */
47168 +       lock_handle lh;
47169 +       hint_t hint;
47170 +       coord_t *coord;         /* points into @hint */
47171 +       size_t read;            /* number of bytes actually read */
47172 +       reiser4_block_nr needed;
47173 +       int (*read_f) (struct file *, flow_t *, hint_t *);     /* item read method; 0 - not chosen yet */
47174 +       unix_file_info_t *uf_info;
47175 +       loff_t size;
47176 +
47177 +       if (unlikely(!read_amount))
47178 +               return 0;
47179 +
47180 +       inode = file->f_dentry->d_inode;
47181 +       assert("vs-972", !inode_get_flag(inode, REISER4_NO_SD));
47182 +
47183 +       uf_info = unix_file_inode_data(inode);
47184 +       get_nonexclusive_access(uf_info);
47185 +
47186 +       size = i_size_read(inode);
47187 +       if (*off >= size) {
47188 +               /* position to read from is past the end of file */
47189 +               drop_access(uf_info);
47190 +               return 0;
47191 +       }
47192 +
47193 +       if (*off + read_amount > size)  /* do not read past the end of file */
47194 +               read_amount = size - *off;
47195 +
47196 +       /* we have nonexclusive access (NA) obtained. File's container may not change until we drop NA. If possible -
47197 +          calculate read function beforehand */
47198 +       switch(uf_info->container) {
47199 +       case UF_CONTAINER_EXTENTS:
47200 +               read_f = item_plugin_by_id(EXTENT_POINTER_ID)->s.file.read;
47201 +               break;
47202 +
47203 +       case UF_CONTAINER_TAILS:
47204 +               /* this is read-ahead for tails-only files */
47205 +               result = reiser4_file_readahead(file, *off, read_amount);
47206 +               if (result) {
47207 +                       drop_access(uf_info);
47208 +                       return result;
47209 +               }
47210 +
47211 +               read_f = item_plugin_by_id(FORMATTING_ID)->s.file.read;
47212 +               break;
47213 +
47214 +       case UF_CONTAINER_UNKNOWN:
47215 +               read_f = 0;     /* taken from the plugin of the first found item in the loop below */
47216 +               break;
47217 +
47218 +       case UF_CONTAINER_EMPTY:
47219 +       default:
47220 +               warning("vs-1297", "File (ino %llu) has unexpected state: %d\n", get_inode_oid(inode), uf_info->container);
47221 +               drop_access(uf_info);
47222 +               return RETERR(-EIO);
47223 +       }
47224 +
47225 +       needed = unix_file_estimate_read(inode, read_amount); /* FIXME: tree_by_inode(inode)->estimate_one_insert */
47226 +       result = reiser4_grab_space(needed, BA_CAN_COMMIT);
47227 +       if (result != 0) {
47228 +               drop_access(uf_info);
47229 +               return result;
47230 +       }
47231 +
47232 +       /* build flow */
47233 +       assert("vs-1250", inode_file_plugin(inode)->flow_by_inode == flow_by_inode_unix_file);
47234 +       result = flow_by_inode_unix_file(inode, buf, 1 /* user space */ , read_amount, *off, READ_OP, &f);
47235 +       if (unlikely(result)) {
47236 +               drop_access(uf_info);
47237 +               return result;
47238 +       }
47239 +
47240 +       /* get seal and coord sealed with it from reiser4 private data of struct file.  The coord will tell us where our
47241 +          last read of this file finished, and the seal will help to determine if that location is still valid.
47242 +       */
47243 +       coord = &hint.coord.base_coord;
47244 +       result = load_file_hint(file, &hint, &lh);
47245 +       /* one iteration per found item: loop until the flow is exhausted, no item is found or an error happens */
47246 +       while (f.length && result == 0) {
47247 +               result = find_file_item(&hint, &f.key, ZNODE_READ_LOCK, NULL, inode);
47248 +               if (cbk_errored(result))
47249 +                       /* error happened */
47250 +                       break;
47251 +
47252 +               if (coord->between != AT_UNIT)
47253 +                       /* there were no items corresponding to given offset */
47254 +                       break;
47255 +
47256 +               /*coord_clear_iplug(coord);*/
47257 +               hint.coord.valid = 0;
47258 +               result = zload(coord->node);
47259 +               if (unlikely(result))
47260 +                       break;
47261 +
47262 +               validate_extended_coord(&hint.coord, get_key_offset(&f.key));
47263 +
47264 +               /* call item's read method */
47265 +               if (!read_f)
47266 +                       read_f = item_plugin_by_coord(coord)->s.file.read;
47267 +               result = read_f(file, &f, &hint);
47268 +               zrelse(coord->node);
47269 +               done_lh(&lh);
47270 +       }
47271 +       /* the loop may have been left by break with @lh still in use */
47272 +       done_lh(&lh);
47273 +       save_file_hint(file, &hint);
47274 +
47275 +       read = read_amount - f.length;
47276 +       if (read)
47277 +               /* something was read. Update stat data */
47278 +               update_atime(inode);
47279 +
47280 +       drop_access(uf_info);
47281 +
47282 +       /* update position in a file */
47283 +       *off += read;
47284 +
47285 +       /* return number of read bytes or error code if nothing is read */
47286 +       return read ?: result;
47287 +}
47288 +
47289 +typedef int (*write_f_t)(struct inode *, flow_t *, hint_t *, int grabbed, write_mode_t); /* item's s.file.write */
47290 +
47291 +/* This searches for write position in the tree and calls write method of
47292 +   appropriate item to actually copy user data into filesystem. This loops
47293 +   until all the data from @flow are written to a file or an error occurs. */
47294 +static loff_t
47295 +append_and_or_overwrite(struct file *file, struct inode *inode, flow_t *flow)
47296 +{
47297 +       int result;
47298 +       lock_handle lh;
47299 +       hint_t hint;
47300 +       loff_t to_write;        /* length of @flow on entry */
47301 +       write_f_t write_f;
47302 +       file_container_t cur_container, new_container;
47303 +       znode *loaded;
47304 +       unix_file_info_t *uf_info;
47305 +
47306 +       assert("nikita-3031", schedulable());
47307 +       assert("vs-1109", get_current_context()->grabbed_blocks == 0);
47308 +
47309 +       /* get seal and coord sealed with it from reiser4 private data of
47310 +          struct file */
47311 +       result = load_file_hint(file, &hint, &lh);
47312 +       if (result)
47313 +               return result;
47314 +
47315 +       uf_info = unix_file_inode_data(inode);
47316 +
47317 +       to_write = flow->length;
47318 +       while (flow->length) {
47319 +               assert("vs-1123", get_current_context()->grabbed_blocks == 0);
47320 +
47321 +               {
47322 +                       size_t count;
47323 +
47324 +                       count = PAGE_CACHE_SIZE;
47325 +                       /* touch at most one page of the user buffer ahead of the write */
47326 +                       if (count > flow->length)
47327 +                               count = flow->length;
47328 +                       fault_in_pages_readable(flow->data, count);
47329 +               }
47330 +
47331 +               if (to_write == flow->length) {
47332 +                       /* nothing is written yet. It may happen that find_next_item will have to insert empty node to
47333 +                          the tree (empty leaf node between two extent items) */
47334 +                       result = reiser4_grab_space_force(1 + estimate_one_insert_item(tree_by_inode(inode)), 0);
47335 +                       if (result)
47336 +                               return result;
47337 +               }
47338 +               /* look for file's metadata (extent or tail item) corresponding to position we write to */
47339 +               result = find_file_item(&hint, &flow->key, ZNODE_WRITE_LOCK, NULL/* ra_info */, inode);
47340 +               all_grabbed2free();
47341 +               if (IS_CBKERR(result)) {
47342 +                       /* error occurred */
47343 +                       done_lh(&lh);
47344 +                       return result;
47345 +               }
47346 +               /* choose item write method by what the file is built of now and what it has to be built of */
47347 +               cur_container = uf_info->container;
47348 +               switch (cur_container) {
47349 +               case UF_CONTAINER_EMPTY:
47350 +                       assert("vs-1196", get_key_offset(&flow->key) == 0);
47351 +                       if (should_have_notail(uf_info, get_key_offset(&flow->key) + flow->length)) {
47352 +                               new_container = UF_CONTAINER_EXTENTS;
47353 +                               write_f = item_plugin_by_id(EXTENT_POINTER_ID)->s.file.write;
47354 +                       } else {
47355 +                               new_container = UF_CONTAINER_TAILS;
47356 +                               write_f = item_plugin_by_id(FORMATTING_ID)->s.file.write;
47357 +                       }
47358 +                       break;
47359 +
47360 +               case UF_CONTAINER_EXTENTS:
47361 +                       write_f = item_plugin_by_id(EXTENT_POINTER_ID)->s.file.write;
47362 +                       new_container = cur_container;
47363 +                       break;
47364 +
47365 +               case UF_CONTAINER_TAILS:
47366 +                       if (should_have_notail(uf_info, get_key_offset(&flow->key) + flow->length)) {
47367 +                               longterm_unlock_znode(&lh);
47368 +                               if (!ea_obtained(uf_info))      /* conversion is done under exclusive access only */
47369 +                                       return RETERR(-E_REPEAT);
47370 +                               result = tail2extent(uf_info);
47371 +                               if (result)
47372 +                                       return result;
47373 +                               unset_hint(&hint);
47374 +                               continue;       /* search again, file is built of extents now */
47375 +                       }
47376 +                       write_f = item_plugin_by_id(FORMATTING_ID)->s.file.write;
47377 +                       new_container = cur_container;
47378 +                       break;
47379 +
47380 +               default:
47381 +                       longterm_unlock_znode(&lh);
47382 +                       return RETERR(-EIO);
47383 +               }
47384 +
47385 +               result = zload(lh.node);
47386 +               if (result) {
47387 +                       longterm_unlock_znode(&lh);
47388 +                       return result;
47389 +               }
47390 +               loaded = lh.node;       /* remembered so that zrelse() below gets the node that was zload()-ed */
47391 +
47392 +               result = write_f(inode,
47393 +                                flow,
47394 +                                &hint,
47395 +                                0/* not grabbed */,
47396 +                                how_to_write(&hint.coord, &flow->key));
47397 +
47398 +               assert("nikita-3142", get_current_context()->grabbed_blocks == 0);
47399 +               if (cur_container == UF_CONTAINER_EMPTY && to_write != flow->length) {
47400 +                       /* file was empty and we have written something and we are having exclusive access to the file -
47401 +                          change file state */
47402 +                       assert("vs-1195", (new_container == UF_CONTAINER_TAILS ||
47403 +                                          new_container == UF_CONTAINER_EXTENTS));
47404 +                       uf_info->container = new_container;
47405 +               }
47406 +               zrelse(loaded);
47407 +               done_lh(&lh);
47408 +               if (result && result != -E_REPEAT)      /* -E_REPEAT from the item write method: just iterate again */
47409 +                       break;
47410 +               preempt_point();
47411 +       }
47412 +       if (result == -EEXIST)
47413 +               printk("write returns EEXIST!\n");
47414 +       save_file_hint(file, &hint);
47415 +
47416 +       /* if nothing were written - there must be an error */
47417 +       assert("vs-951", ergo((to_write == flow->length), result < 0));
47418 +       assert("vs-1110", get_current_context()->grabbed_blocks == 0);
47419 +       /* number of bytes written, or the result code if nothing was written */
47420 +       return (to_write - flow->length) ? (to_write - flow->length) : result;
47421 +}
47422 +
47423 +/* Build a write flow for @count bytes of @buf at offset @pos and write it to the file. If @buf == 0 - hole of size
47424 +   @count will be created. This is called with uf_info->latch either read- or write-locked */
47425 +static loff_t
47426 +write_flow(struct file *file, struct inode *inode, const char *buf, loff_t count, loff_t pos)
47427 +{
47428 +       flow_t wflow;
47429 +       int ret;
47430 +
47431 +       assert("vs-1251", inode_file_plugin(inode)->flow_by_inode == flow_by_inode_unix_file);
47432 +
47433 +       /* describe the user-space buffer as a flow positioned at @pos */
47434 +       ret = flow_by_inode_unix_file(inode, (char *)buf, 1 /* user space */, count, pos, WRITE_OP, &wflow);
47435 +       if (ret == 0)
47436 +               /* flow is ready: write it out */
47437 +               return append_and_or_overwrite(file, inode, &wflow);
47438 +       return ret;
47439 +}
47440 +
47441 +reiser4_internal void
47442 +drop_access(unix_file_info_t *uf_info)
47443 +{
47444 +       if (!uf_info->exclusive_use)    /* access is held nonexclusively */
47445 +               drop_nonexclusive_access(uf_info);
47446 +       else                            /* access is held exclusively */
47447 +               drop_exclusive_access(uf_info);
47448 +}
47449 +
47450 +/* ->nopage() of a mapped unix file: filemap_nopage() run under coc_sem and nonexclusive access to the file.
47451 +   @type is the fault-type slot supplied by the VM (may be 0). Returns whatever filemap_nopage() returns. */
47452 +reiser4_internal struct page *
47453 +unix_file_filemap_nopage(struct vm_area_struct *area, unsigned long address, int *type)
47454 +{
47455 +       struct page *page;
47456 +       struct inode *inode;
47457 +
47458 +       inode = area->vm_file->f_dentry->d_inode;
47459 +       /* block filemap_nopage if copy on capture is processing with a node of this file */
47460 +       down_read(&reiser4_inode_data(inode)->coc_sem);
47461 +       get_nonexclusive_access(unix_file_inode_data(inode));
47462 +       /* forward @type rather than 0: otherwise filemap_nopage() has nowhere to report a major fault */
47463 +       page = filemap_nopage(area, address, type);
47464 +       drop_nonexclusive_access(unix_file_inode_data(inode));
47465 +       up_read(&reiser4_inode_data(inode)->coc_sem);
47466 +       return page;
47467 +}
47468 +
47469 +static struct vm_operations_struct unix_file_vm_ops = {
47470 +       .nopage = unix_file_filemap_nopage,     /* page fault handler; installed by mmap_unix_file() */
47471 +};
47472 +
47473 +/* Drop the cached pages of @inode and convert the file from tails to extents
47474 +   (unpack()), so that data of a file that is mapped or about to be written
47475 +   are not managed in two copies: one in the page cache and one in the tails
47476 +   themselves.
47477 +
47478 +   reiser4_invalidate_pages() is given page index 0 and the number of pages
47479 +   needed to cover i_size. The conversion is "not forever": unpack() is asked
47480 +   not to change the formatting plugin. Called from write_unix_file() and
47481 +   mmap_unix_file() under exclusive access. Returns what unpack() returns. */
47482 +static int
47483 +check_pages_unix_file(struct inode *inode)
47484 +{
47485 +       struct address_space *mapping = inode->i_mapping;
47486 +
47487 +       reiser4_invalidate_pages(mapping, 0, (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT);
47488 +       return unpack(inode, 0 /* not forever */);
47489 +}
47490 +
47491 +/* plugin->u.file.mmap
47492 +
47493 +   Under exclusive access: if the inode is not read-only and the mapping has VM_MAYWRITE or VM_SHARED set, finish a
47494 +   pending conversion and, when the file is still built of tails, invalidate its pages and convert it to extents (an
47495 +   estimation is in tail2extent). Then call generic_file_mmap(); on success set REISER4_HAS_MMAP on the inode and
47496 +   install unix_file_vm_ops. Returns 0 or the error of the step that failed. */
47497 +reiser4_internal int
47498 +mmap_unix_file(struct file *file, struct vm_area_struct *vma)
47499 +{
47500 +       unix_file_info_t *uf_info;
47501 +       struct inode *host;
47502 +       int ret;
47503 +
47504 +       host = file->f_dentry->d_inode;
47505 +       uf_info = unix_file_inode_data(host);
47506 +
47507 +       get_exclusive_access(uf_info);
47508 +
47509 +       if (IS_RDONLY(host) || !(vma->vm_flags & (VM_MAYWRITE | VM_SHARED)))
47510 +               /* read-only inode, or mapping is neither may-write nor shared: no conversion */
47511 +               goto map;
47512 +
47513 +       /* we need file built of extent items. If it is still built of tail items we have to convert it */
47514 +       ret = finish_conversion(host);
47515 +       if (ret)
47516 +               goto out;
47517 +
47518 +       /* find what items the file is built of */
47519 +       ret = find_file_state(uf_info);
47520 +       if (ret != 0)
47521 +               goto out;
47522 +
47523 +       assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
47524 +                          uf_info->container == UF_CONTAINER_EXTENTS ||
47525 +                          uf_info->container == UF_CONTAINER_EMPTY));
47526 +       if (uf_info->container == UF_CONTAINER_TAILS) {
47527 +               /* invalidate all pages and convert file from tails to extents */
47528 +               ret = check_pages_unix_file(host);
47529 +               if (ret)
47530 +                       goto out;
47531 +       }
47532 +
47533 +map:
47534 +       ret = generic_file_mmap(file, vma);
47535 +       if (ret == 0) {
47536 +               /* mark file as having mapping. */
47537 +               inode_set_flag(host, REISER4_HAS_MMAP);
47538 +               vma->vm_ops = &unix_file_vm_ops;
47539 +       }
47540 +
47541 +out:
47542 +       drop_exclusive_access(uf_info);
47543 +       return ret;
47544 +}
47545 +
47546 +/* Write @count bytes of @buf at position *@off, first appending a hole when *@off lies beyond i_size. *@off is
47547 +   advanced only if something was written. Returns number of written bytes, or error code. */
47548 +static ssize_t
47549 +write_file(struct file *file, /* file to write to */
47550 +          const char *buf, /* address of user-space buffer */
47551 +          size_t count, /* number of bytes to write */
47552 +          loff_t *off /* position in file to write to */)
47553 +{
47554 +       struct inode *inode = file->f_dentry->d_inode;
47555 +       loff_t pos = *off;      /* location in the file the data go to */
47556 +       ssize_t nr;             /* amount written, or error code */
47557 +
47558 +       /* estimation for write is entrusted to write item plugins */
47559 +
47560 +       if (pos > inode->i_size) {
47561 +               /* writing past real end of file: fill the gap with a hole first */
47562 +               nr = append_hole(inode, pos);
47563 +               if (nr != 0)
47564 +                       return nr;
47565 +               assert("vs-1081", pos == inode->i_size);
47566 +       }
47567 +
47568 +       /* write user data to the file */
47569 +       nr = write_flow(file, inode, buf, count, pos);
47570 +       if (nr <= 0)
47571 +               /* nothing written: position in the file is left alone */
47572 +               return nr;
47573 +
47574 +       /* update position in a file */
47575 +       *off = pos + nr;
47576 +       return nr;
47577 +}
47578 +
47579 +/* plugin->u.file.write: write @count bytes of @buf at *@off under inode->i_sem; returns bytes written or error */
47580 +reiser4_internal ssize_t
47581 +write_unix_file(struct file *file, /* file to write to */
47582 +               const char *buf, /* address of user-space buffer */
47583 +               size_t count, /* number of bytes to write */
47584 +               loff_t *off /* position in file to write to */)
47585 +{
47586 +       struct inode *inode;
47587 +       ssize_t written;        /* amount actually written so far */
47588 +       int result;             /* result of sync, used for the warning only */
47589 +
47590 +       if (unlikely(count == 0))
47591 +               return 0;
47592 +
47593 +       inode = file->f_dentry->d_inode;
47594 +       assert("vs-947", !inode_get_flag(inode, REISER4_NO_SD));
47595 +
47596 +       /* linux's VM requires this. See mm/vmscan.c:shrink_list() */
47597 +       current->backing_dev_info = inode->i_mapping->backing_dev_info;
47598 +
47599 +       down(&inode->i_sem);
47600 +       written = generic_write_checks(file, off, &count, 0);
47601 +       if (written == 0) {
47602 +               unix_file_info_t *uf_info;
47603 +
47604 +               uf_info = unix_file_inode_data(inode);
47605 +
47606 +               if (inode_get_flag(inode, REISER4_HAS_MMAP)) {
47607 +                       /* file has been mmaped before. If it is built of
47608 +                          tails - invalidate pages created so far and convert
47609 +                          to extents */
47610 +                       get_exclusive_access(uf_info);
47611 +                       written = finish_conversion(inode);
47612 +                       if (written == 0)
47613 +                               if (uf_info->container == UF_CONTAINER_TAILS)
47614 +                                       written = check_pages_unix_file(inode);
47615 +
47616 +                       drop_exclusive_access(uf_info);
47617 +               }
47618 +               if (written == 0) {
47619 +                       int rep;                        /* number of the attempt, counted from 0 */
47620 +                       int try_free_space = 1;         /* cleared after the one forced commit on -ENOSPC */
47621 +
47622 +                       for (rep = 0; ; ++ rep) {
47623 +                               if (inode_get_flag(inode,
47624 +                                                  REISER4_PART_CONV)) {
47625 +                                       get_exclusive_access(uf_info);
47626 +                                       written = finish_conversion(inode);
47627 +                                       if (written != 0) {
47628 +                                               drop_access(uf_info);
47629 +                                               break;
47630 +                                       }
47631 +                               } else if (inode->i_size == 0 || rep)   /* empty file and every retry: exclusive */
47632 +                                       get_exclusive_access(uf_info);
47633 +                               else
47634 +                                       get_nonexclusive_access(uf_info);
47635 +
47636 +                               if (rep == 0) {
47637 +                                       /* UNIX behavior: clear suid bit on
47638 +                                        * file modification. This cannot be
47639 +                                        * done earlier, because removing suid
47640 +                                        * bit captures blocks into
47641 +                                        * transaction, which should be done
47642 +                                        * after taking exclusive access on
47643 +                                        * the file. */
47644 +                                       written = remove_suid(file->f_dentry);
47645 +                                       if (written != 0) {
47646 +                                               drop_access(uf_info);
47647 +                                               break;
47648 +                                       }
47649 +                                       grab_space_enable();
47650 +                               }
47651 +
47652 +                               all_grabbed2free();
47653 +                               written = write_file(file, buf, count, off);
47654 +                               drop_access(uf_info);
47655 +
47656 +                               /* With no locks held we can commit atoms in
47657 +                                * attempt to recover free space. */
47658 +                               if (written == -ENOSPC && try_free_space) {
47659 +                                       txnmgr_force_commit_all(inode->i_sb, 0);
47660 +                                       try_free_space = 0;
47661 +                                       continue;
47662 +                               }
47663 +
47664 +                               if (written == -E_REPEAT)
47665 +                                       /* write_file required exclusive
47666 +                                        * access (for tail2extent). It
47667 +                                        * returned E_REPEAT so that we
47668 +                                        * restart it with exclusive access */
47669 +                                       txn_restart_current();
47670 +                               else
47671 +                                       break;
47672 +                       }
47673 +               }
47674 +       }
47675 +       /* NOTE(review): sync is attempted even when the write failed, and a sync failure is only logged */
47676 +       if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
47677 +               txn_restart_current();
47678 +               result = sync_unix_file(inode, 0/* data and stat data */);
47679 +               if (result)
47680 +                       warning("reiser4-7", "failed to sync file %llu",
47681 +                               get_inode_oid(inode));
47682 +       }
47683 +       up(&inode->i_sem);
47684 +       current->backing_dev_info = 0;
47685 +       return written;
47686 +}
47687 +
47688 +/* plugin->u.file.release(): when d_count of the file's dentry is 1, convert a file built of extent items into
47689 +   tail items if its formatting policy asks for that. A failed conversion is only reported; always returns 0 */
47690 +reiser4_internal int
47691 +release_unix_file(struct inode *object, struct file *file)
47692 +{
47693 +       unix_file_info_t *info = unix_file_inode_data(object);
47694 +       int convert;
47695 +
47696 +       get_exclusive_access(info);
47697 +
47698 +       /* same conditions, evaluated in the same order and with the same short-circuiting, as one && chain */
47699 +       convert = atomic_read(&file->f_dentry->d_count) == 1;
47700 +       convert = convert && info->container == UF_CONTAINER_EXTENTS;
47701 +       convert = convert && !should_have_notail(info, object->i_size);
47702 +       convert = convert && !rofs_inode(object);
47703 +
47704 +       /* result of the conversion is not propagated to the caller */
47705 +       if (convert && extent2tail(info) != 0) {
47706 +               warning("nikita-3233", "Failed to convert in %s (%llu)",
47707 +                       __FUNCTION__, get_inode_oid(object));
47708 +               print_inode("inode", object);
47709 +       }
47710 +
47711 +       drop_exclusive_access(info);
47712 +       return 0;
47713 +}
47714 +
47715 +/* make @inode use the formatting plugin with id NEVER_TAILS_FORMATTING_ID */
47716 +static void set_file_notail(struct inode *inode)
47717 +{
47718 +       reiser4_inode *info;
47719 +       formatting_plugin *never_tails;
47720 +
47721 +       info = reiser4_inode_data(inode);
47722 +       never_tails = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
47723 +       plugin_set_formatting(&info->pset, never_tails);
47724 +       inode_set_plugin(inode, formatting_plugin_to_plugin(never_tails),
47725 +                        PSET_FORMATTING);
47726 +}
47727 +
47728 +/* if file is built of tails - convert it to extents; with @forever set, set_file_notail() is applied as well */
47729 +static int
47730 +unpack(struct inode *inode, int forever)
47731 +{
47732 +       unix_file_info_t *uf_info;
47733 +       __u64 tograb;
47734 +       int ret;
47735 +
47736 +       uf_info = unix_file_inode_data(inode);
47737 +       assert("vs-1628", ea_obtained(uf_info));
47738 +
47739 +       ret = find_file_state(uf_info);
47740 +       assert("vs-1074", ergo(ret == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
47741 +       if (ret != 0)
47742 +               return ret;
47743 +       if (uf_info->container == UF_CONTAINER_TAILS) {
47744 +               ret = tail2extent(uf_info);
47745 +               if (ret != 0)
47746 +                       return ret;
47747 +       }
47748 +       if (forever)
47749 +               set_file_notail(inode);
47750 +
47751 +       /* grab what the file plugin's estimate.update() asks for, then update atime */
47752 +       grab_space_enable();
47753 +       tograb = inode_file_plugin(inode)->estimate.update(inode);
47754 +       ret = reiser4_grab_space(tograb, BA_CAN_COMMIT);
47755 +       if (ret == 0)
47756 +               update_atime(inode);
47757 +       return ret;
47758 +}
47759 +
47760 +/* plugin->u.file.ioctl: REISER4_IOC_UNPACK is the only command handled, anything else gets -ENOSYS */
47761 +reiser4_internal int
47762 +ioctl_unix_file(struct inode *inode, struct file *filp UNUSED_ARG, unsigned int cmd, unsigned long arg UNUSED_ARG)
47763 +{
47764 +       unix_file_info_t *uf_info;
47765 +       int ret;
47766 +
47767 +       if (cmd != REISER4_IOC_UNPACK)
47768 +               /* unknown command */
47769 +               return RETERR(-ENOSYS);
47770 +
47771 +       uf_info = unix_file_inode_data(inode);
47772 +
47773 +       /* unpack() asserts that exclusive access is held */
47774 +       get_exclusive_access(uf_info);
47775 +       ret = unpack(inode, 1 /* forever */);
47776 +       drop_exclusive_access(uf_info);
47777 +       return ret;
47778 +}
47779 +
47780 +/* plugin->u.file.get_block: look up the item at the key of @block and pass @block and @bh_result to the
47781 +   get_block method of its item plugin; -EINVAL if that plugin has no such method */
47782 +reiser4_internal int
47783 +get_block_unix_file(struct inode *inode,
47784 +                   sector_t block, struct buffer_head *bh_result, int create UNUSED_ARG)
47785 +{
47786 +       reiser4_key key;
47787 +       coord_t coord;
47788 +       lock_handle lh;
47789 +       item_plugin *iplug;
47790 +       int ret;
47791 +
47792 +       assert("vs-1091", create == 0);
47793 +
47794 +       key_by_inode_unix_file(inode, (loff_t) block * current_blocksize, &key);
47795 +
47796 +       init_lh(&lh);
47797 +       ret = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
47798 +       if (cbk_errored(ret))
47799 +               goto out;
47800 +
47801 +       /*coord_clear_iplug(&coord);*/
47802 +       ret = zload(coord.node);
47803 +       if (ret != 0)
47804 +               goto out;
47805 +
47806 +       iplug = item_plugin_by_coord(&coord);
47807 +       if (iplug->s.file.get_block != NULL)
47808 +               ret = iplug->s.file.get_block(&coord, block, bh_result);
47809 +       else
47810 +               ret = RETERR(-EINVAL);
47811 +       zrelse(coord.node);
47812 +
47813 + out:
47814 +       /* the lock handle is released on every path */
47815 +       done_lh(&lh);
47816 +       return ret;
47817 +}
47818 +
47819 +/* plugin->u.file.flow_by_inode
47820 +   fill @flow: buffer, its length and origin, the operation, and the key of offset @off in @inode */
47821 +reiser4_internal int
47822 +flow_by_inode_unix_file(struct inode *inode /* file to build flow for */ ,
47823 +                       char *buf /* data buffer, see @user */ ,
47824 +                       int user  /* 1 if @buf is of user space, 0 - if it is kernel space */ ,
47825 +                       loff_t size /* buffer size */ ,
47826 +                       loff_t off /* offset to start operation(read/write) from */ ,
47827 +                       rw_op op /* READ or WRITE */ ,
47828 +                       flow_t *flow /* resulting flow */ )
47829 +{
47830 +       assert("nikita-1100", inode != NULL);
47831 +       assert("nikita-1931", inode_file_plugin(inode) != NULL);
47832 +       assert("nikita-1932", inode_file_plugin(inode)->key_by_inode == key_by_inode_unix_file);
47833 +
47834 +       flow->op = op;
47835 +       flow->user = user;
47836 +       flow->data = buf;
47837 +       flow->length = size;
47838 +       /* the key of position @off is stored into flow->key; the result of building it is returned */
47839 +       return key_by_inode_unix_file(inode, off, &flow->key);
47840 +}
47841 +
47842 +/* plugin->u.file.key_by_inode: thin wrapper, @key for offset @off is built by key_by_inode_and_offset_common() */
47843 +reiser4_internal int
47844 +key_by_inode_unix_file(struct inode *inode, loff_t off, reiser4_key *key)
47845 +{
47846 +       return key_by_inode_and_offset_common(inode, off, key);
47847 +}
47848 +
47849 +/* plugin->u.file.set_plug_in_sd = NULL
47850 +   plugin->u.file.set_plug_in_inode = NULL
47851 +   plugin->u.file.create_blank_sd = NULL */
47852 +/* plugin->u.file.delete */
47853 +/*
47854 +   plugin->u.file.add_link = add_link_common
47855 +   plugin->u.file.rem_link = NULL */
47856 +
47857 +/* plugin->u.file.owns_item
47858 +   this is owns_item_common plus an item type check and an assertion; returns 1 if the item is owned, else 0 */
47859 +/* Audited by: green(2002.06.15) */
47860 +reiser4_internal int
47861 +owns_item_unix_file(const struct inode *inode  /* object to check against */ ,
47862 +                   const coord_t *coord /* coord to check */ )
47863 +{
47864 +       /* both the common ownership check and the item type check must pass */
47865 +       if (!owns_item_common(inode, coord) ||
47866 +           item_type_by_coord(coord) != UNIX_FILE_METADATA_ITEM_TYPE)
47867 +               return 0;
47868 +
47869 +       /* an owned item of that type is expected to carry one of these two item ids */
47870 +       assert("vs-547",
47871 +              item_id_by_coord(coord) == EXTENT_POINTER_ID ||
47872 +              item_id_by_coord(coord) == FORMATTING_ID);
47873 +
47874 +       return 1;
47875 +}
47876 +/* truncate body of @inode to attr->ia_size between safe_link_add() and safe_link_del() of SAFE_TRUNCATE */
47877 +static int
47878 +setattr_truncate(struct inode *inode, struct iattr *attr)
47879 +{
47880 +       int result; /* outcome of the add-safe-link / truncate step, this is what gets returned */
47881 +       int s_result; /* outcome of the safe-link removal, only logged */
47882 +       loff_t old_size; /* size before truncate, used in the warning only */
47883 +       reiser4_tree *tree;
47884 +
47885 +       inode_check_scale(inode, inode->i_size, attr->ia_size);
47886 +
47887 +       old_size = inode->i_size;
47888 +       tree = tree_by_inode(inode);
47889 +       /* the body is truncated only if the SAFE_TRUNCATE safe-link was added successfully */
47890 +       result = safe_link_grab(tree, BA_CAN_COMMIT);
47891 +       if (result == 0)
47892 +               result = safe_link_add(inode, SAFE_TRUNCATE);
47893 +       all_grabbed2free();
47894 +       if (result == 0)
47895 +               result = truncate_file_body(inode, attr->ia_size);
47896 +       if (result)
47897 +               warning("vs-1588", "truncate_file failed: oid %lli, old size %lld, new size %lld, retval %d",
47898 +                       get_inode_oid(inode), old_size, attr->ia_size, result);
47899 +       /* NOTE(review): runs even if safe_link_add() above failed - confirm safe_link_del() copes with a missing link */
47900 +       s_result = safe_link_grab(tree, BA_CAN_COMMIT);
47901 +       if (s_result == 0)
47902 +               s_result = safe_link_del(inode, SAFE_TRUNCATE);
47903 +       if (s_result != 0) {
47904 +               warning("nikita-3417", "Cannot kill safelink %lli: %i",
47905 +                       get_inode_oid(inode), s_result);
47906 +       }
47907 +       safe_link_release(tree);
47908 +       all_grabbed2free();
47909 +       return result; /* s_result does not influence the return value */
47910 +}
47911 +
47912 +/* plugin->u.file.setattr method */
47913 +/* A size change is done by setattr_truncate() under exclusive access to the
47914 +   file to avoid races; any other change is left to setattr_common() */
47915 +reiser4_internal int
47916 +setattr_unix_file(struct inode *inode, /* Object to change attributes */
47917 +                 struct iattr *attr /* change description */ )
47918 +{
47919 +       unix_file_info_t *uf_info;
47920 +       int ret;
47921 +
47922 +       if (!(attr->ia_valid & ATTR_SIZE))
47923 +               return setattr_common(inode, attr);
47924 +
47925 +       uf_info = unix_file_inode_data(inode);
47926 +
47927 +       /* truncate does reservation itself and requires exclusive
47928 +        * access obtained */
47929 +       get_exclusive_access(uf_info);
47930 +       ret = setattr_truncate(inode, attr);
47931 +       drop_exclusive_access(uf_info);
47932 +
47933 +       return ret;
47934 +}
47935 +
47936 +/* plugin->u.file.can_add_link = common_file_can_add_link */
47937 +/* VS-FIXME-HANS: why does this always resolve to extent pointer?  this wrapper serves what purpose?  get rid of it. */
47938 +/* plugin->u.file.readpages method: hand @pages over to the readpages method of the extent item plugin */
47939 +reiser4_internal void
47940 +readpages_unix_file(struct file *file, struct address_space *mapping,
47941 +                   struct list_head *pages)
47942 +{
47943 +       reiser4_file_fsdata *fsdata;
47944 +       item_plugin *iplug;
47945 +
47946 +       /* FIXME: readpages_unix_file() only supports files built of extents; for any other container nothing is done */
47947 +       if (unix_file_inode_data(mapping->host)->container != UF_CONTAINER_EXTENTS)
47948 +               return;
47949 +       /* NOTE(review): fsdata is dereferenced unchecked below - confirm reiser4_get_file_fsdata() cannot fail here */
47950 +       fsdata = reiser4_get_file_fsdata(file);
47951 +       iplug = item_plugin_by_id(EXTENT_POINTER_ID);
47952 +       iplug->s.file.readpages(fsdata->reg.coord, mapping, pages);
47953 +       return;
47954 +}
47955 +
47956 +/* plugin->u.file.init_inode_data */
47957 +reiser4_internal void
47958 +init_inode_data_unix_file(struct inode *inode,
47959 +                         reiser4_object_create_data *crd, int create)
47960 +{
47961 +       unix_file_info_t *data;
47962 +
47963 +       data = unix_file_inode_data(inode);
47964 +       data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
47965 +       init_rwsem(&data->latch);
47966 +       data->tplug = inode_formatting_plugin(inode);
47967 +       data->exclusive_use = 0;
47968 +
47969 +#if REISER4_DEBUG
47970 +       data->ea_owner = 0;
47971 +#endif
47972 +       init_inode_ordering(inode, crd, create);
47973 +}
47974 +
47975 +/* VS-FIXME-HANS: what is pre deleting all about? */
47976 +/* plugin->u.file.pre_delete: truncate the file body to size 0; returns the result of truncate_file_body() */
47977 +reiser4_internal int
47978 +pre_delete_unix_file(struct inode *inode)
47979 +{
47980 +       /* the shortcut for files of size 0 below is disabled: the body is truncated unconditionally */
47981 +       /*if (inode->i_size == 0)
47982 +         return 0;*/
47983 +       return truncate_file_body(inode, 0/* size */);
47984 +}
47985 +
47986 +/* Reads up to @count bytes from @file starting at *@ppos and calls @actor for every page read; *@ppos is advanced by
47987 +   what @actor returned. Returns desc.written if nonzero, else desc.error. Needed for loop back devices support. */
47988 +reiser4_internal ssize_t sendfile_common (
47989 +       struct file *file, loff_t *ppos, size_t count, read_actor_t actor, void __user *target)
47990 +{
47991 +       file_plugin *fplug;
47992 +       struct inode *inode;
47993 +       read_descriptor_t desc;
47994 +       struct page *page = NULL;
47995 +       int ret = 0;
47996 +
47997 +       assert("umka-3108", file != NULL);
47998 +
47999 +       inode = file->f_dentry->d_inode;
48000 +
48001 +       desc.error = 0;
48002 +       desc.written = 0;
48003 +       desc.arg.data = target;
48004 +       desc.count = count;
48005 +
48006 +       fplug = inode_file_plugin(inode);
48007 +       if (fplug->readpage == NULL)
48008 +               return RETERR(-EINVAL);
48009 +
48010 +       while (desc.count != 0) {
48011 +               unsigned long read_request_size;
48012 +               unsigned long index; /* number of the page *ppos falls into */
48013 +               unsigned long offset; /* byte offset of *ppos within that page */
48014 +               loff_t file_size = i_size_read(inode);
48015 +
48016 +               if (*ppos >= file_size)
48017 +                       break;
48018 +
48019 +               index = *ppos >> PAGE_CACHE_SHIFT;
48020 +               offset = *ppos & ~PAGE_CACHE_MASK;
48021 +               /* readahead works in units of pages: it wants the page index, not the in-page byte offset */
48022 +               page_cache_readahead(inode->i_mapping, &file->f_ra, file, index);
48023 +
48024 +               /* determine valid read request size. */
48025 +               read_request_size = PAGE_CACHE_SIZE - offset;
48026 +               if (read_request_size > desc.count)
48027 +                       read_request_size = desc.count;
48028 +               if (*ppos + read_request_size >= file_size) {
48029 +                       read_request_size = file_size - *ppos;
48030 +                       if (read_request_size == 0)
48031 +                               break;
48032 +               }
48033 +               page = grab_cache_page(inode->i_mapping, index);
48034 +               if (unlikely(page == NULL)) {
48035 +                       desc.error = RETERR(-ENOMEM);
48036 +                       break;
48037 +               }
48038 +
48039 +               if (PageUptodate(page))
48040 +                       /* process locked, up-to-date  page by read actor */
48041 +                       goto actor;
48042 +
48043 +               ret = fplug->readpage(file, page);
48044 +               if (ret != 0) {
48045 +                       SetPageError(page);
48046 +                       ClearPageUptodate(page);
48047 +                       desc.error = ret;
48048 +                       goto fail_locked_page;
48049 +               }
48050 +
48051 +               lock_page(page);
48052 +               if (!PageUptodate(page)) {
48053 +                       desc.error = RETERR(-EIO);
48054 +                       goto fail_locked_page;
48055 +               }
48056 +
48057 +       actor:
48058 +               ret = actor(&desc, page, offset, read_request_size);
48059 +               unlock_page(page);
48060 +               page_cache_release(page);
48061 +
48062 +               (*ppos) += ret;
48063 +
48064 +               if (ret != read_request_size)
48065 +                       break;
48066 +       }
48067 +
48068 +       if (0) {
48069 +       fail_locked_page:
48070 +               unlock_page(page);
48071 +               page_cache_release(page);
48072 +       }
48073 +
48074 +       update_atime(inode);
48075 +
48076 +       if (desc.written)
48077 +               return desc.written;
48078 +       return desc.error;
48079 +}
48080 +
48081 +/* flag the inode of @file REISER4_HAS_MMAP (under i_sem), then run sendfile_common() and return its result */
48082 +reiser4_internal ssize_t sendfile_unix_file(struct file *file, loff_t *ppos, size_t count,
48083 +                                           read_actor_t actor, void __user *target)
48084 +{
48085 +       struct inode *inode = file->f_dentry->d_inode;
48086 +       unix_file_info_t *uf_info = unix_file_inode_data(inode);
48087 +       ssize_t nr_sent;
48088 +
48089 +       down(&inode->i_sem);
48090 +       inode_set_flag(inode, REISER4_HAS_MMAP);
48091 +       up(&inode->i_sem);
48092 +
48093 +       /* sendfile_common() runs with nonexclusive access to the file held */
48094 +       get_nonexclusive_access(uf_info);
48095 +       nr_sent = sendfile_common(file, ppos, count, actor, target);
48096 +       drop_nonexclusive_access(uf_info);
48097 +
48098 +       return nr_sent;
48099 +}
48100 +
48101 +/* done under exclusive access: -EINVAL for a file built of tails, otherwise prepare_write_common() */
48102 +reiser4_internal int prepare_write_unix_file(struct file *file, struct page *page,
48103 +                                            unsigned from, unsigned to)
48104 +{
48105 +       unix_file_info_t *uf_info = unix_file_inode_data(file->f_dentry->d_inode);
48106 +       int ret;
48107 +
48108 +       get_exclusive_access(uf_info);
48109 +       ret = find_file_state(uf_info);
48110 +       if (ret == 0) {
48111 +               if (uf_info->container == UF_CONTAINER_TAILS)
48112 +                       ret = RETERR(-EINVAL); /* error originates here: wrapped like elsewhere in this file */
48113 +               else
48114 +                       ret = prepare_write_common(file, page, from, to);
48115 +       }
48116 +       drop_exclusive_access(uf_info);
48117 +       return ret;
48118 +}
48119 +
48120 +/*
48121 +   Local variables:
48122 +   c-indentation-style: "K&R"
48123 +   mode-name: "LC"
48124 +   c-basic-offset: 8
48125 +   tab-width: 8
48126 +   fill-column: 120
48127 +   scroll-step: 1
48128 +   End:
48129 +*/
48130 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/file/file.h linux-2.6.8-rc3-a/fs/reiser4/plugin/file/file.h
48131 --- linux-2.6.8-rc3/fs/reiser4/plugin/file/file.h       1970-01-01 03:00:00.000000000 +0300
48132 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/file/file.h     2004-08-05 21:20:52.761728332 +0400
48133 @@ -0,0 +1,138 @@
48134 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
48135 + * reiser4/README */
48136 +
48137 +#if !defined( __REISER4_FILE_H__ )
48138 +#define __REISER4_FILE_H__
48139 +
48140 +/* declarations of functions implementing file plugin for unix file plugin */
48141 +int truncate_unix_file(struct inode *, loff_t size);
48142 +int readpage_unix_file(void *, struct page *);
48143 +int capturepage_unix_file(struct page *);
48144 +int capture_unix_file(struct inode *, const struct writeback_control *, long *);
48145 +ssize_t read_unix_file(struct file *, char *buf, size_t size, loff_t *off);
48146 +ssize_t write_unix_file(struct file *, const char *buf, size_t size, loff_t *off);
48147 +int release_unix_file(struct inode *inode, struct file *);
48148 +int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd, unsigned long arg);
48149 +int mmap_unix_file(struct file *, struct vm_area_struct *vma);
48150 +int get_block_unix_file(struct inode *, sector_t block, struct buffer_head *bh_result, int create);
48151 +int flow_by_inode_unix_file(struct inode *, char *buf, int user, loff_t, loff_t, rw_op, flow_t *);
48152 +int key_by_inode_unix_file(struct inode *, loff_t off, reiser4_key *);
48153 +int owns_item_unix_file(const struct inode *, const coord_t *);
48154 +int setattr_unix_file(struct inode *, struct iattr *);
48155 +void readpages_unix_file(struct file *, struct address_space *, struct list_head *pages);
48156 +void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *, int create);
48157 +int pre_delete_unix_file(struct inode *);
48158 +
48159 +extern ssize_t sendfile_common (
48160 +       struct file *file, loff_t *ppos, size_t count, read_actor_t actor, void __user *target);
48161 +extern ssize_t sendfile_unix_file (
48162 +       struct file *file, loff_t *ppos, size_t count, read_actor_t actor, void __user *target);
48163 +extern int prepare_write_unix_file (struct file *, struct page *, unsigned, unsigned);
48164 +
48165 +int sync_unix_file(struct inode *, int datasync);
48166 +
48167 +
48168 +/* all the write into unix file is performed by item write method. Write method of unix file plugin only decides which
48169 +   item plugin (extent or tail) and in which mode (one from the enum below) to call */
48170 +typedef enum {
48171 +       FIRST_ITEM = 1,
48172 +       APPEND_ITEM = 2,
48173 +       OVERWRITE_ITEM = 3
48174 +} write_mode_t;
48175 +
48176 +
48177 +/* unix file may be in one of the following states */
48178 +typedef enum {
48179 +       UF_CONTAINER_UNKNOWN = 0, /* not determined yet (initial state of an inode that is not being created) */
48180 +       UF_CONTAINER_TAILS = 1, /* file is built of tail items */
48181 +       UF_CONTAINER_EXTENTS = 2, /* file is built of extent items */
48182 +       UF_CONTAINER_EMPTY = 3 /* initial state of an inode that is being created */
48183 +} file_container_t;
48184 +
48185 +struct formatting_plugin;
48186 +struct inode;
48187 +
48188 +/* unix file plugin specific part of reiser4 inode */
48189 +typedef struct unix_file_info {
48190 +       struct rw_semaphore latch; /* this read-write lock protects file containerization change. Accesses which do not change
48191 +                            file containerization (see file_container_t) (read, readpage, writepage, write (until tail
48192 +                            conversion is involved)) take read-lock. Accesses which modify file containerization
48193 +                            (truncate, conversion from tail to extent and back) take write-lock. */
48194 +       file_container_t container; /* this enum specifies which items are used to build the file */
48195 +       struct formatting_plugin *tplug; /* plugin which controls when file is to be converted to extents and back to
48196 +                                           tail */
48197 +       /* if this is set, file is in exclusive use */
48198 +       int exclusive_use;
48199 +#if REISER4_DEBUG
48200 +       void *ea_owner; /* pointer to task struct of thread owning exclusive
48201 +                        * access to file */
48202 +#endif
48203 +} unix_file_info_t;
48204 +
48205 +struct unix_file_info *unix_file_inode_data(const struct inode * inode);
48206 +
48207 +#include "../item/extent.h"
48208 +#include "../item/tail.h"
48209 +
48210 +struct uf_coord {
48211 +       coord_t base_coord;
48212 +       lock_handle *lh;
48213 +       int valid;
48214 +       union {
48215 +               extent_coord_extension_t extent;
48216 +               tail_coord_extension_t tail;
48217 +       } extension;
48218 +};
48219 +
48220 +#include "../../seal.h"
48221 +
48222 +/* structure used to speed up file operations (reads and writes). It contains
48223 + * a seal over last file item accessed. */
48224 +struct hint {
48225 +       seal_t seal;
48226 +       uf_coord_t coord;
48227 +       loff_t offset;
48228 +       tree_level level;
48229 +       znode_lock_mode mode;
48230 +};
48231 +
48232 +void set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
48233 +void unset_hint(hint_t *);
48234 +int hint_validate(hint_t *, const reiser4_key *, int check_key, znode_lock_mode);
48235 +
48236 +
48237 +#if REISER4_DEBUG
48238 +static inline struct task_struct *
48239 +inode_ea_owner(const unix_file_info_t *uf_info)
48240 +{
48241 +       return uf_info->ea_owner;
48242 +}
48243 +
48244 +static inline void ea_set(unix_file_info_t *uf_info, void *value)
48245 +{
48246 +       uf_info->ea_owner = value;
48247 +}
48248 +#else
48249 +#define ea_set(inode, value) noop
48250 +#endif
48251 +/* returns uf_info->exclusive_use; asserts that an exclusive access owner, if one is recorded, is the current task */
48252 +static inline int ea_obtained(const unix_file_info_t *uf_info)
48253 +{
48254 +       assert("vs-1167", ergo (inode_ea_owner(uf_info) != NULL,
48255 +                               inode_ea_owner(uf_info) == current));
48256 +       return uf_info->exclusive_use;
48257 +}
48258 +
48259 +/* __REISER4_FILE_H__ */
48260 +#endif
48261 +
48262 +/*
48263 +   Local variables:
48264 +   c-indentation-style: "K&R"
48265 +   mode-name: "LC"
48266 +   c-basic-offset: 8
48267 +   tab-width: 8
48268 +   fill-column: 120
48269 +   scroll-step: 1
48270 +   End:
48271 +*/
48272 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/file/funcs.h linux-2.6.8-rc3-a/fs/reiser4/plugin/file/funcs.h
48273 --- linux-2.6.8-rc3/fs/reiser4/plugin/file/funcs.h      1970-01-01 03:00:00.000000000 +0300
48274 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/file/funcs.h    2004-08-05 21:20:53.091658741 +0400
48275 @@ -0,0 +1,28 @@
48276 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */
48277 +
48278 +/* prototypes of functions used by both file.c and tail_conversion.c */
48279 +void get_exclusive_access(unix_file_info_t *);
48280 +void drop_exclusive_access(unix_file_info_t *);
48281 +void get_nonexclusive_access(unix_file_info_t *);
48282 +void drop_nonexclusive_access(unix_file_info_t *);
48283 +void drop_access(unix_file_info_t *uf_info);
48284 +
48285 +int tail2extent(unix_file_info_t *);
48286 +int extent2tail(unix_file_info_t *);
48287 +int finish_conversion(struct inode *inode);
48288 +
48289 +void hint_init_zero(hint_t *, lock_handle *);
48290 +int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
48291 +                  ra_info_t *, struct inode *);
48292 +int find_file_item_nohint(coord_t *, lock_handle *, const reiser4_key *,
48293 +                         znode_lock_mode, struct inode *);
48294 +
48295 +int goto_right_neighbor(coord_t *, lock_handle *);
48296 +int find_or_create_extent(struct page *);
48297 +write_mode_t how_to_write(uf_coord_t *, const reiser4_key *);
48298 +/* nonzero if @cbk_result is neither CBK_COORD_FOUND nor CBK_COORD_NOTFOUND; static so that each user gets a body */
48299 +static inline int
48300 +cbk_errored(int cbk_result)
48301 +{
48302 +       return (cbk_result != CBK_COORD_NOTFOUND && cbk_result != CBK_COORD_FOUND);
48303 +}
48304 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/file/invert.c linux-2.6.8-rc3-a/fs/reiser4/plugin/file/invert.c
48305 --- linux-2.6.8-rc3/fs/reiser4/plugin/file/invert.c     1970-01-01 03:00:00.000000000 +0300
48306 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/file/invert.c   2004-08-05 21:20:53.493573968 +0400
48307 @@ -0,0 +1,511 @@
48308 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48309 +
48310 +/* Suppose you want to conveniently read and write a large variety of small files within a single emacs
48311 +   buffer, without having a separate buffer for each 8 byte or so file.  Inverts are the way to do that.  An invert
48312 +   provides you with the contents of a set of subfiles plus its own contents.  It is a file which inherits other files
48313 +   when you read it, and allows you to write to it and through it to the files that it inherits from.  In order for it
48314 +   to know which subfiles each part of your write should go into, there must be delimiters indicating that.  It tries to
48315 +   make that easy for you by providing those delimiters in what you read from it.
48316 +
48317 +  When you read it, an invert performs an inverted assignment.  Instead of taking an assignment command and writing a
48318 +  bunch of files, it takes a bunch of files and composes an assignment command for you to read from it that if executed
48319 +  would create those files.  But which files?  Well, that must be specified in the body of the invert using a special
48320 +  syntax, and that specification is called the invert of the assignment.
48321 +
48322 +  When written to, an invert performs the assignment command that is written
48323 +  to it, and modifies its own body to contain the invert of that
48324 +  assignment.
48325 +
48326 +  In other words, writing to an invert file what you have read from it
48327 +  is the identity operation.
48328 +
48329 +  Malformed assignments cause write errors.  Partial writes are not
48330 +  supported in v4.0, but will be.
48331 +
48332 +  Example:
48333 +
48334 +    If an invert contains:
48335 +
48336 +    /filenameA/<>+"(some text stored in the invert)+/filenameB/<>
48337 +
48338 +======================
48339 +Each element in this definition should be an invert, and all files
48340 +should be called recursively, too.  This is bad. If one of the
48341 +included files is not a regular or invert file, then we can't read
48342 +the main file.
48343 +
48344 +I think it is possible to make this easier:
48345 +
48346 +the internal structure of an invert file should be like that of a symlink file. But
48347 +the read and write method should be explicitly indicated in the i/o operation.
48348 +
48349 +By default we read and write (if possible) as a symlink, and if we
48350 +specify ..invert at read time, we can specify it at write time, too.
48351 +
48352 +example:
48353 +/my_invert_file/..invert<- ( (/filenameA<-"(The contents of filenameA))+"(some text stored in the invert)+(/filenameB<-"(The contents of filenameB) ) )
48354 +will create  /my_invert_file as invert, and will create /filenameA and /filenameB with specified body.
48355 +
48356 +read of /my_invert_file/..invert will be
48357 +/filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
48358 +
48359 +but read of /my_invert_file/ will be
48360 +The contents of filenameAsome text stored in the invertThe contents of filenameB
48361 +
48362 +we can also create this file as
48363 +/my_invert_file/<-/filenameA+"(some text stored in the invert)+/filenameB
48364 +which will create /my_invert_file and use the existing files /filenameA and /filenameB.
48365 +
48366 +and when we read it, it will read like the previous invert file.
48367 +
48368 +Is this correct?
48369 +
48370 + vv
48371 +DEMIDOV-FIXME-HANS:
48372 +
48373 +Maybe you are right, but then you must disable writes to /my_invert_file/ and only allow writes to /my_invert_file/..invert
48374 +
48375 +Do you agree?  Discuss it on reiserfs-list....
48376 +
48377 +-Hans
48378 +=======================
48379 +
48380 +  Then a read will return:
48381 +
48382 +    /filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
48383 +
48384 +    and a write of the line above to the invert will set the contents of
48385 +    the invert and filenameA and filenameB to their original values.
48386 +
48387 +  Note that the contents of an invert have no influence on the effect
48388 +  of a write unless the write is a partial write (and a write of a
48389 +  shorter file without using truncate first is a partial write).
48390 +
48391 +  truncate() has no effect on filenameA and filenameB, it merely
48392 +  resets the value of the invert.
48393 +
48394 +  Writes to subfiles via the invert are implemented by preceding them
48395 +  with truncates.
48396 +
48397 +  Parse failures cause write failures.
48398 +
48399 +  Questions to ponder: should the invert be acted on prior to file
48400 +  close when writing to an open filedescriptor?
48401 +
48402 + Example:
48403 +
48404 + If an invert contains:
48405 +
48406 +   "(This text and a pair of quotes are all that is here.)
48407 +
48408 +Then a read will return:
48409 +
48410 +   "(This text and a pair of quotes are all that is here.)
48411 +
48412 +*/
48413 +
48414 +/* OPEN method places a struct file in memory associated with invert body
48415 +  and returns something like file descriptor to the user for the future access
48416 +  to the invert file.
48417 +  During opening we parse the body of invert and get a list of the 'entries'
48418 +  (that describes all its subfiles) and place pointer on the first struct in
48419 +  reiserfs-specific part of invert inode (arbitrary decision).
48420 +
48421 +  Each subfile is described by the struct inv_entry that has a pointer @sd on
48422 +  in-core based stat-data and  a pointer on struct file @f (if we find that the
48423 +  subfile uses more than one unformatted node (arbitrary decision), we load
48424 +  struct file in memory, otherwise we load base stat-data (and maybe 1-2 bytes
48425 +  of some other information we need)
48426 +
48427 +  Since READ and WRITE methods for inverts were formulated in assignment
48428 +  language, they don't contain arguments 'size' and 'offset' that make sense
48429 +  only in ordinary read/write methods.
48430 +
48431 +  READ method is a combination of two methods:
48432 +  1) ordinary read method (with offset=0, length = @f->...->i_size) for entries
48433 +  with @f != 0, this method uses pointer on struct file as an argument
48434 +  2) read method for inode-less files with @sd != 0, this method uses
48435 +  in-core based stat-data instead struct file as an argument.
48436 +  in the first case we don't use pagecache, just copy data that we got after
48437 +  cbk() into userspace.
48438 +
48439 +  WRITE method for invert files is more complex.
48440 +  Besides the WRITE-interface declared in the assignment language above we need
48441 +  to have an opportunity to edit unwrapped body of invert file with some
48442 +  text editor, it means we need GENERIC WRITE METHOD for invert file:
48443 +
48444 +  my_invert_file/..invert <- "string"
48445 +
48446 +  this method parses "string" and looks for correct subfile signatures, also
48447 +  the parsing process splits this "string" into a set of flows in accordance
48448 +  with the set of subfiles specified by this signature.
48449 +  The found list of signatures #S is compared with the opened one #I of invert
48450 +  file. If it doesn't have this one (#I==0, it will be so for instance if we
48451 +  have just created this invert file) the write method assigns found signature
48452 +  (#I=#S;) to the invert file. Then if #I==#S, generic write method splits
48453 +  itself into several write methods for ordinary or light-weight files, or calls itself
48454 +  recursively for invert files with corresponding flows.
48455 +  I am not sure, but the list of signatures looks like what mr.Demidov means
48456 +  by 'delimiters'.
48457 +
48458 +  The cases when #S<#I (#I<#S) (in the sense of set-theory) are also available
48459 +  and cause delete (create new) subfiles (arbitrary decision - it may look
48460 +  too complex, but this interface will be the most complete). The order of entries
48461 +  of list #S (#I) and inherited order on #I (#S) must coincide.
48462 +  The other parsing results give malformed signature that aborts READ method
48463 +  and releases all resources.
48464 +
48465 +
48466 +  Format of subfile (entry) signature:
48467 +
48468 +  "START_MAGIC"<>(TYPE="...",LOOKUP_ARG="...")SUBFILE_BODY"END_MAGIC"
48469 +
48470 +  Legend:
48471 +
48472 +    START_MAGIC - keyword indicates the start of subfile signature;
48473 +
48474 +    <> indicates the start of 'subfile metadata', that is the pair
48475 +  (TYPE="...",LOOKUP_ARG="...") in parenthesis separated by comma.
48476 +
48477 +    TYPE - the string "type" indicates the start of one of the three words:
48478 +  - ORDINARY_FILE,
48479 +  - LIGHT_WEIGHT_FILE,
48480 +  - INVERT_FILE;
48481 +
48482 +    LOOKUP_ARG - lookup argument depends on previous type:
48483 +  */
48484 +
48485 + /************************************************************/
48486 + /*       TYPE        *          LOOKUP ARGUMENT             */
48487 + /************************************************************/
48488 + /* LIGHT_WEIGHT_FILE *           stat-data key              */
48489 + /************************************************************/
48490 + /*   ORDINARY_FILE   *             filename                 */
48491 + /************************************************************/
48492 + /*   INVERT_FILE     *             filename                 */
48493 + /************************************************************/
48494 +
48495 + /* where:
48496 +  *stat-data key - the string contains stat data key of this subfile, it will be
48497 +  passed to fast-access lookup method for light-weight files;
48498 +  *filename - pathname of this subfile, it will be passed to VFS lookup methods
48499 +  for ordinary and invert files;
48500 +
48501 +  SUBFILE_BODY - data of this subfile (it will go to the flow)
48502 +  END_MAGIC - the keyword indicates the end of subfile signature.
48503 +
48504 +  The other symbols inside the signature are interpreted as 'unformatted content',
48505 +  which is available with VFS's read_link() (arbitrary decision).
48506 +
48507 +  NOTE: Parse method for a body of invert file uses mentioned signatures _without_
48508 +  subfile bodies.
48509 +
48510 +  Now the only unclear thing is WRITE in regular light-weight subfile A that we
48511 +  can describe only in  assignment language:
48512 +
48513 +  A <- "some_string"
48514 +
48515 +  I guess we don't want to change stat-data and body items of file A
48516 +  if this file exists and size(A) != size("some_string"), because this operation is
48517 +  expensive, so we only do a partial write if size(A) > size("some_string"),
48518 +  and truncate "some_string" and then do A <- "truncated string" if
48519 +  size(A) < size("some_string"). This decision is also arbitrary.
48520 +  */
48521 +
48522 +/* here is infrastructure for formatted flows */
48523 +
48524 +#define SUBFILE_HEADER_MAGIC 0x19196605
48525 +#define FLOW_HEADER_MAGIC 0x01194304
48526 +
48527 +#include "../plugin.h"
48528 +#include "../../debug.h"
48529 +#include "../../forward.h"
48530 +#include "../object.h"
48531 +#include "../item/item.h"
48532 +#include "../item/static_stat.h"
48533 +#include "../../dformat.h"
48534 +#include "../znode.h"
48535 +#include "../inode.h"
48536 +
48537 +#include <linux/types.h>
48538 +#include <linux/fs.h>          /* for struct file  */
48539 +#include <linux/list.h>                /* for struct list_head */
48540 +
48541 +typedef enum {
48542 +       LIGHT_WEIGHT_FILE,      /* get_inv_entry() looks it up by stat-data key */
48543 +       ORDINARY_FILE,          /* get_inv_entry() opens it by filename */
48544 +       INVERT_FILE             /* get_inv_entry() opens it by filename */
48545 +} inv_entry_type;
48546 +/* header of a formatted flow; fields are accessed through fl_{set,get}_*() */
48547 +typedef struct flow_header {
48548 +       d32 fh_magic;           /* flow magic (presumably FLOW_HEADER_MAGIC) */
48549 +       d16 fh_nr;              /* number of subfiles in the flow */
48550 +} flow_header;
48551 +/* header of one subfile in a formatted flow; accessed via sh_{set,get}_*() */
48552 +typedef struct subfile_header {
48553 +       d32 sh_magic;           /* subfile magic */
48554 +       d16 sh_type;            /* type of subfile: light-weight, ordinary, invert */
48555 +       d16 sh_arg_len;         /* length of lookup argument (filename, key) */
48556 +       d32 sh_body_len;        /* length of subfile body */
48557 +} subfile_header;
48558 +
48559 +/* functions to get/set fields of flow header */
48560 +
48561 +/* store @value into the ->fh_magic field of @fh via cputod32() */
48562 +static void fl_set_magic(flow_header * fh, __u32 value)
48563 +{
48564 +       cputod32(value, &fh->fh_magic);
48565 +}
48566 +
48567 +/* read the ->fh_magic field of @fh via d32tocpu() */
48568 +static __u32 fl_get_magic(flow_header * fh)
48569 +{
48570 +       return d32tocpu(&fh->fh_magic);
48571 +}
48572 +/* store @value into the ->fh_nr field of @fh via cputod16() */
48573 +static void fl_set_number(flow_header * fh, __u16 value)
48574 +{
48575 +       cputod16(value, &fh->fh_nr);
48576 +}
48577 +/* read the ->fh_nr field of @fh via d16tocpu() */
48578 +static unsigned fl_get_number(flow_header * fh)
48579 +{
48580 +       return d16tocpu(&fh->fh_nr);
48581 +}
48582 +
48583 +/* functions to get/set fields of subfile header */
48584 +
48585 +/* store @value into the ->sh_magic field of @sh via cputod32() */
48586 +static void sh_set_magic(subfile_header * sh, __u32 value)
48587 +{
48588 +       cputod32(value, &sh->sh_magic);
48589 +}
48590 +
48591 +/* read the ->sh_magic field of @sh via d32tocpu() */
48592 +static __u32 sh_get_magic(subfile_header * sh)
48593 +{
48594 +       return d32tocpu(&sh->sh_magic);
48595 +}
48596 +/* store subfile type @value into ->sh_type of @sh (not into the magic) */
48597 +static void sh_set_type(subfile_header * sh, __u16 value)
48598 +{
48599 +       cputod16(value, &sh->sh_type);
48600 +}
48601 +/* read subfile type from ->sh_type of @sh (not from the magic) */
48602 +static unsigned sh_get_type(subfile_header * sh)
48603 +{
48604 +       return d16tocpu(&sh->sh_type);
48605 +}
48606 +/* store @value into the ->sh_arg_len field of @sh via cputod16() */
48607 +static void sh_set_arg_len(subfile_header * sh, __u16 value)
48608 +{
48609 +       cputod16(value, &sh->sh_arg_len);
48610 +}
48611 +/* read the ->sh_arg_len field of @sh via d16tocpu() */
48612 +static unsigned sh_get_arg_len(subfile_header * sh)
48613 +{
48614 +       return d16tocpu(&sh->sh_arg_len);
48615 +}
48616 +/* store @value into the ->sh_body_len field of @sh via cputod32() */
48617 +static void sh_set_body_len(subfile_header * sh, __u32 value)
48618 +{
48619 +       cputod32(value, &sh->sh_body_len);
48620 +}
48621 +
48622 +/* read the ->sh_body_len field of @sh via d32tocpu() */
48623 +static __u32 sh_get_body_len(subfile_header * sh)
48624 +{
48625 +       return d32tocpu(&sh->sh_body_len);
48626 +}
48627 +
48628 +/* in-core minimal stat-data, light-weight analog of inode */
48629 +
48630 +struct incore_sd_base {
48631 +       umode_t isd_mode;       /* sd_base_load() fills it from on-disk ->mode */
48632 +       nlink_t isd_nlink;      /* sd_base_load() fills it from on-disk ->nlink */
48633 +       loff_t isd_size;        /* sd_base_load() fills it from on-disk ->size */
48634 +       char *isd_data;         /* 'subflow' to write */
48635 +};
48636 +
48637 +/* open invert creates a list of invert entries,
48638 +   every entry is represented by structure inv_entry */
48639 +
48640 +struct inv_entry {
48641 +       struct list_head ie_list;       /* linkage into the list of entries */
48642 +       struct file *ie_file;   /* this is NULL if the file doesn't
48643 +                                  have unformatted nodes */
48644 +       struct incore_sd_base *ie_sd;   /* inode-less analog of struct file */
48645 +};
48646 +
48647 +/* allocate an invert entry with no file, no in-core stat-data and an empty
48648 +   list linkage. Returns an ERR_PTR() encoding -ENOMEM if allocation fails. */
48649 +static struct inv_entry *
48650 +allocate_inv_entry(void)
48651 +{
48652 +       struct inv_entry *inv_entry;
48653 +
48654 +       inv_entry = reiser4_kmalloc(sizeof *inv_entry, GFP_KERNEL);
48655 +       if (!inv_entry)
48656 +               return ERR_PTR(RETERR(-ENOMEM));
48657 +       inv_entry->ie_file = NULL;
48658 +       inv_entry->ie_sd = NULL;
48659 +       INIT_LIST_HEAD(&inv_entry->ie_list);
48660 +       return inv_entry;
48661 +}
48662 +
48663 +/* unlink @ientry from its list, close its file (if any) and free the entry
48664 +   together with its in-core stat-data. Returns filp_close() result or 0. */
48665 +static int put_inv_entry(struct inv_entry *ientry)
48666 +{
48667 +       int result = 0;
48668 +
48669 +       assert("edward-96", ientry != NULL);
48670 +
48671 +       list_del(&ientry->ie_list);
48672 +       /* close the file before the entry that points to it is freed */
48673 +       if (ientry->ie_file != NULL)
48674 +               result = filp_close(ientry->ie_file, NULL);
48675 +       if (ientry->ie_sd != NULL)
48676 +               kfree(ientry->ie_sd);
48677 +       kfree(ientry);
48678 +       return result;
48679 +}
48680 +/* allocate in-core stat-data and attach it to @inv_entry; 0 or -ENOMEM */
48681 +static int allocate_incore_sd_base(struct inv_entry *inv_entry)
48682 +{
48683 +       struct incore_sd_base *isd_base;
48684 +
48685 +       assert("edward-98", inv_entry != NULL);
48686 +       assert("edward-99", inv_entry->ie_file == NULL);
48687 +       assert("edward-100", inv_entry->ie_sd == NULL);
48688 +       isd_base = reiser4_kmalloc(sizeof (struct incore_sd_base), GFP_KERNEL);
48689 +       if (!isd_base)
48690 +               return RETERR(-ENOMEM);
48691 +       inv_entry->ie_sd = isd_base;
48692 +       return 0;
48693 +}
48694 +
48695 +/* this can be installed as ->init_inv_entry () method of
48696 +   item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
48697 +   Copies mode, nlink and size from the on-disk stat-data @sd into
48698 +   @inv_entry->ie_sd, which must already be allocated, and resets ->isd_data.
48699 +   Doesn't handle stat-data extensions. */
48700 +static void
48701 +sd_base_load(struct inv_entry *inv_entry, char *sd)
48702 +{
48703 +       reiser4_stat_data_base *sd_base;
48704 +
48705 +       assert("edward-101", inv_entry != NULL);
48706 +       assert("edward-101", inv_entry->ie_sd != NULL);
48707 +       assert("edward-102", sd != NULL);
48708 +
48709 +       sd_base = (reiser4_stat_data_base *) sd;
48710 +       inv_entry->ie_sd->isd_mode = d16tocpu(&sd_base->mode);
48711 +       inv_entry->ie_sd->isd_nlink = d32tocpu(&sd_base->nlink);
48712 +       inv_entry->ie_sd->isd_size = d64tocpu(&sd_base->size);
48713 +       inv_entry->ie_sd->isd_data = NULL;
48714 +}
48715 +
48716 +/* fill in-core stat-data of @inv_entry from the item found at @coord: the
48717 +   item body is handed to sd_base_load() */
48718 +static void init_incore_sd_base(struct inv_entry *inv_entry, coord_t * coord)
48719 +{
48720 +       reiser4_plugin *plugin;
48721 +       void *body;
48722 +
48723 +       plugin = item_plugin_by_coord(coord);
48724 +       body = item_body_by_coord(coord);
48725 +       assert("edward-103", inv_entry != NULL);
48726 +       assert("edward-104", plugin != NULL);
48727 +       assert("edward-105", body != NULL);
48728 +       sd_base_load(inv_entry, body);
48729 +}
48730 +
48731 +/* takes a key or filename, allocates a new invert_entry and initialises it:
48732 +   lookup_sd_by_key() is used for light-weight files, VFS lookup by filename for
48733 +   the rest. Returns 0 or -errno. FIXME: entry is not added to any list yet. */
48734 +int
48735 +get_inv_entry(struct inode *invert_inode,      /* inode of invert's body */
48736 +             inv_entry_type type,      /* LIGHT-WEIGHT or ORDINARY */
48737 +             const reiser4_key * key,  /* key of invert entry stat-data */
48738 +             char *filename,   /* filename of the file to be opened */
48739 +             int flags, int mode)
48740 +{
48741 +       int result;
48742 +       struct inv_entry *ientry;
48743 +
48744 +       assert("edward-107", invert_inode != NULL);
48745 +       ientry = allocate_inv_entry();
48746 +       if (IS_ERR(ientry))
48747 +               return (PTR_ERR(ientry));
48748 +
48749 +       if (type == LIGHT_WEIGHT_FILE) {
48750 +               coord_t coord;
48751 +               lock_handle lh;
48752 +
48753 +               assert("edward-108", key != NULL);
48754 +               init_coord(&coord);
48755 +               init_lh(&lh);
48756 +               result = allocate_incore_sd_base(ientry); /* sd_base_load() needs ->ie_sd */
48757 +               if (result == 0)
48758 +                       result = lookup_sd_by_key(tree_by_inode(invert_inode), ZNODE_READ_LOCK, &coord, &lh, key);
48759 +               if (result == 0)
48760 +                       init_incore_sd_base(ientry, &coord);
48761 +               done_lh(&lh);
48762 +               done_coord(&coord);
48763 +       } else {
48764 +               struct file *file;
48765 +
48766 +               assert("edward-108", filename != NULL);
48767 +               /* FIXME_EDWARD here we need to check if we didn't follow to any mount point */
48768 +               file = filp_open(filename, flags, mode);
48769 +               result = IS_ERR(file) ? PTR_ERR(file) : 0;
48770 +               if (result == 0)
48771 +                       ientry->ie_file = file;
48772 +       }
48773 +       if (result != 0) {
48774 +               kfree(ientry->ie_sd);   /* may be NULL, which kfree() accepts */
48775 +               kfree(ientry);          /* nobody else references the entry on failure */
48776 +       }
48777 +       return result;
48778 +}
48779 +
48780 +/* takes file of invert; is meant to read the body of this invert, parse it,
48781 +   open all invert entries and return pointer on the first inv_entry.
48782 +   Not implemented yet: always returns an ERR_PTR() encoding -ENOSYS. */
48783 +struct inv_entry *
48784 +open_invert(struct file *invert_file)
48785 +{
48786 +       return ERR_PTR(RETERR(-ENOSYS));
48787 +}
48788 +/* subfile read stub: not implemented yet, always fails with -ENOSYS */
48789 +ssize_t subfile_read(struct inv_entry *invert_entry, flow * f)
48790 +{
48791 +       return RETERR(-ENOSYS);
48792 +}
48793 +/* subfile write stub: not implemented yet, always fails with -ENOSYS */
48794 +ssize_t subfile_write(struct inv_entry *invert_entry, flow * f)
48795 +{
48796 +       return RETERR(-ENOSYS);
48797 +}
48798 +/* invert read stub: not implemented yet, always fails with -ENOSYS */
48799 +ssize_t invert_read(struct file *file, flow * f)
48800 +{
48801 +       return RETERR(-ENOSYS);
48802 +}
48803 +/* invert write stub: not implemented yet, always fails with -ENOSYS */
48804 +ssize_t invert_write(struct file *file, flow * f)
48805 +{
48806 +       return RETERR(-ENOSYS);
48807 +}
48808 +
48809 +/* Make Linus happy.
48810 +   Local variables:
48811 +   c-indentation-style: "K&R"
48812 +   mode-name: "LC"
48813 +   c-basic-offset: 8
48814 +   tab-width: 8
48815 +   fill-column: 120
48816 +   scroll-step: 1
48817 +   End:
48818 +*/
48819 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/file/pseudo.c linux-2.6.8-rc3-a/fs/reiser4/plugin/file/pseudo.c
48820 --- linux-2.6.8-rc3/fs/reiser4/plugin/file/pseudo.c     1970-01-01 03:00:00.000000000 +0300
48821 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/file/pseudo.c   2004-08-05 21:20:52.861707244 +0400
48822 @@ -0,0 +1,182 @@
48823 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
48824 +
48825 +/*
48826 + * Pseudo file plugin. This contains helper functions used by pseudo files.
48827 + */
48828 +
48829 +#include "pseudo.h"
48830 +#include "../plugin.h"
48831 +
48832 +#include "../../inode.h"
48833 +
48834 +#include <linux/seq_file.h>
48835 +#include <linux/fs.h>
48836 +
48837 +struct seq_operations pseudo_seq_op;
48838 +
48839 +/* return the pseudo file plugin kept in the reiser4 data of the inode that
48840 + * @file's dentry points to */
48841 +static pseudo_plugin *get_pplug(struct file * file)
48842 +{
48843 +       struct dentry *dentry = file->f_dentry;
48844 +
48845 +       return reiser4_inode_data(dentry->d_inode)->
48846 +               file_plugin_data.pseudo_info.plugin;
48847 +}
48848 +
48849 +/* common routine to open pseudo file. seq_file based read types set up their
48850 + * seq_file state here (PSEUDO_READ_SEQ also stores @file in ->private of the
48851 + * seq_file); any other read type needs no preparation and 0 is returned */
48852 +reiser4_internal int open_pseudo(struct inode * inode, struct file * file)
48853 +{
48854 +       pseudo_plugin *pplug = get_pplug(file);
48855 +       struct seq_file *m;
48856 +       int result;
48857 +
48858 +       switch (pplug->read_type) {
48859 +       case PSEUDO_READ_SEQ:           /* based on seq_file interface */
48860 +               result = seq_open(file, &pplug->read.ops);
48861 +               if (result != 0)
48862 +                       break;
48863 +               m = file->private_data;
48864 +               m->private = file;
48865 +               break;
48866 +       case PSEUDO_READ_SINGLE:        /* containing one record */
48867 +               result = single_open(file, pplug->read.single_show, file);
48868 +               break;
48869 +       default:
48870 +               result = 0;
48871 +       }
48872 +       return result;
48873 +}
48874 +
48875 +/* common read method for pseudo files: seq_file based plugins go through
48876 + * seq_read(), PSEUDO_READ_FORWARD ones call the plugin, others read nothing */
48877 +reiser4_internal ssize_t read_pseudo(struct file *file,
48878 +                                    char __user *buf, size_t size, loff_t *ppos)
48879 +{
48880 +       pseudo_plugin *pplug = get_pplug(file);
48881 +
48882 +       if (pplug->read_type == PSEUDO_READ_SEQ ||
48883 +           pplug->read_type == PSEUDO_READ_SINGLE)
48884 +               /* seq_file behaves like pipe, requiring @ppos to always be
48885 +                * address of file->f_pos */
48886 +               return seq_read(file, buf, size, &file->f_pos);
48887 +       if (pplug->read_type == PSEUDO_READ_FORWARD)
48888 +               return pplug->read.read(file, buf, size, ppos);
48889 +       return 0;
48890 +}
48891 +
48892 +/* common seek method for pseudo files: seq_file based ones are delegated to
48893 + * seq_lseek(), for every other read type 0 is returned */
48894 +reiser4_internal loff_t seek_pseudo(struct file *file, loff_t offset, int origin)
48895 +{
48896 +       pseudo_plugin *pplug = get_pplug(file);
48897 +
48898 +       if (pplug->read_type != PSEUDO_READ_SEQ &&
48899 +           pplug->read_type != PSEUDO_READ_SINGLE)
48900 +               return 0;
48901 +       return seq_lseek(file, offset, origin);
48902 +}
48903 +
48904 +/* common release method for pseudo files. Files opened by single_open() are
48905 + * released by single_release(), which also frees what single_open() allocated */
48906 +reiser4_internal int release_pseudo(struct inode *inode, struct file *file)
48907 +{
48908 +       pseudo_plugin *pplug = get_pplug(file);
48909 +       int result;
48910 +
48911 +       if (pplug->read_type == PSEUDO_READ_SINGLE)
48912 +               result = single_release(inode, file);
48913 +       else if (pplug->read_type == PSEUDO_READ_SEQ)
48914 +               result = seq_release(inode, file);
48915 +       else
48916 +               return 0;
48917 +       file->private_data = NULL;
48918 +       return result;
48919 +}
48920 +
48921 +/* pseudo files need special ->drop() method, because they don't have nlink
48922 + * and only exist while host object does. @object is the inode being dropped. */
48923 +reiser4_internal void drop_pseudo(struct inode * object)
48924 +{
48925 +       /* pseudo files are not protected from deletion by their ->i_nlink */
48926 +       generic_delete_inode(object);
48927 +}
48928 +
48929 +/* common write method for pseudo files. PSEUDO_WRITE_STRING plugins get the
48930 + * user buffer as an in-kernel string, PSEUDO_WRITE_FORWARD ones get the raw
48931 + * write arguments; for any other write type @size is returned.
48932 + * NOTE(review): getname() copies up to a NUL byte and does not look at @size -
48933 + * confirm that writers always pass NUL-terminated data. */
48934 +reiser4_internal ssize_t write_pseudo(struct file *file,
48935 +            const char __user *buf, size_t size, loff_t *ppos)
48936 +{
48937 +       pseudo_plugin *pplug = get_pplug(file);
48938 +       char *string;
48939 +       ssize_t written;
48940 +
48941 +       if (pplug->write_type == PSEUDO_WRITE_FORWARD)
48942 +               return pplug->write.write(file, buf, size, ppos);
48943 +       if (pplug->write_type != PSEUDO_WRITE_STRING)
48944 +               return size;
48945 +
48946 +       /* PSEUDO_WRITE_STRING: hand the user data to the plugin as a string */
48947 +       string = getname(buf);
48948 +       if (IS_ERR(string))
48949 +               return PTR_ERR(string);
48950 +       written = pplug->write.gets(file, string);
48951 +       putname(string);
48952 +
48953 +       /* 0 from the plugin means the whole buffer counts as written */
48954 +       if (written == 0)
48955 +               written = size;
48956 +       return written;
48957 +}
48958 +
48959 +/* on-wire serialization of pseudo files. */
48960 +
48961 +/* this is not implemented so far (and, hence, pseudo files are not accessible
48962 + * over NFS, closing remote exploits a fortiori) */
48963 +
48964 +/* on-wire size of a pseudo file: unsupported, fails with -ENOTSUPP */
48965 +reiser4_internal int wire_size_pseudo(struct inode *inode)
48966 +{
48967 +       return RETERR(-ENOTSUPP);
48968 +}
48969 +
48970 +/* on-wire encoding of a pseudo file: unsupported, ERR_PTR of -ENOTSUPP */
48971 +reiser4_internal char *wire_write_pseudo(struct inode *inode, char *start)
48972 +{
48973 +       return ERR_PTR(RETERR(-ENOTSUPP));
48974 +}
48975 +
48976 +/* on-wire decoding of a pseudo file: unsupported, ERR_PTR of -ENOTSUPP */
48977 +reiser4_internal char *wire_read_pseudo(char *addr, reiser4_object_on_wire *obj)
48978 +{
48979 +       return ERR_PTR(RETERR(-ENOTSUPP));
48980 +}
48981 +
48982 +/* release of on-wire object @obj of a pseudo file: empty on purpose */
48983 +reiser4_internal void wire_done_pseudo(reiser4_object_on_wire *obj)
48984 +{
48985 +       /* nothing to do */
48986 +}
48987 +
48988 +/* dentry for on-wire object @obj: unsupported, ERR_PTR of -ENOTSUPP */
48989 +reiser4_internal struct dentry *wire_get_pseudo(struct super_block *sb, reiser4_object_on_wire *obj)
48990 +{
48991 +       return ERR_PTR(RETERR(-ENOTSUPP));
48992 +}
48993 +
48994 +
48995 +/* Make Linus happy.
48996 +   Local variables:
48997 +   c-indentation-style: "K&R"
48998 +   mode-name: "LC"
48999 +   c-basic-offset: 8
49000 +   tab-width: 8
49001 +   fill-column: 120
49002 +   scroll-step: 1
49003 +   End:
49004 +*/
49005 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/file/pseudo.h linux-2.6.8-rc3-a/fs/reiser4/plugin/file/pseudo.h
49006 --- linux-2.6.8-rc3/fs/reiser4/plugin/file/pseudo.h     1970-01-01 03:00:00.000000000 +0300
49007 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/file/pseudo.h   2004-08-05 21:20:52.782723904 +0400
49008 @@ -0,0 +1,39 @@
49009 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
49010 +
49011 +#if !defined(__REISER4_PSEUDO_FILE_H__)
49012 +#define __REISER4_PSEUDO_FILE_H__
49013 +
49014 +#include "../plugin.h"
49015 +
49016 +#include <linux/fs.h>
49017 +/* common file methods of pseudo files, implemented in pseudo.c */
49018 +extern int open_pseudo(struct inode * inode, struct file * file);
49019 +extern ssize_t read_pseudo(struct file *file,
49020 +                          char __user *buf, size_t size, loff_t *ppos);
49021 +extern ssize_t write_pseudo(struct file *file,
49022 +                           const char __user *buf, size_t size, loff_t *ppos);
49023 +extern loff_t seek_pseudo(struct file *file, loff_t offset, int origin);
49024 +extern int release_pseudo(struct inode *inode, struct file *file);
49025 +extern void drop_pseudo(struct inode * object);
49026 +/* on-wire serialization of pseudo files, implemented in pseudo.c */
49027 +extern int wire_size_pseudo(struct inode *inode);
49028 +extern char *wire_write_pseudo(struct inode *inode, char *start);
49029 +extern char *wire_read_pseudo(char *addr, reiser4_object_on_wire *obj);
49030 +extern void wire_done_pseudo(reiser4_object_on_wire *obj);
49031 +extern struct dentry *wire_get_pseudo(struct super_block *sb,
49032 +                                     reiser4_object_on_wire *obj);
49033 +
49034 +/* __REISER4_PSEUDO_FILE_H__ */
49035 +#endif
49036 +
49037 +/*
49038 +   Local variables:
49039 +   c-indentation-style: "K&R"
49040 +   mode-name: "LC"
49041 +   c-basic-offset: 8
49042 +   tab-width: 8
49043 +   fill-column: 120
49044 +   scroll-step: 1
49045 +   End:
49046 +*/
49047 +
49048 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/file/symfile.c linux-2.6.8-rc3-a/fs/reiser4/plugin/file/symfile.c
49049 --- linux-2.6.8-rc3/fs/reiser4/plugin/file/symfile.c    1970-01-01 03:00:00.000000000 +0300
49050 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/file/symfile.c  2004-08-05 21:20:52.785723271 +0400
49051 @@ -0,0 +1,98 @@
49052 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49053 +
49054 +/* Symfiles are a generalization of Unix symlinks.
49055 +
49056 +   A symfile when read behaves as though you took its contents and
49057 +   substituted them into the reiser4 naming system as the right hand side
49058 +   of an assignment, and then read that which you had assigned to it.
49059 +
49060 +   A key issue for symfiles is how to implement writes through to
49061 +   subfiles.  In general, one must have some method of determining what
49062 +   of that which is written to the symfile is written to what subfile.
49063 +   This can be done by use of custom plugin methods written by users, or
49064 +   by using a few general methods we provide for those willing to endure
49065 +   the insertion of delimiters into what is read.
49066 +
49067 +   Writing to symfiles without delimiters to denote what is written to
49068 +   what subfile is not supported by any plugins we provide in this
49069 +   release.  Our most sophisticated support for writes is that embodied
49070 +   by the invert plugin (see invert.c).
49071 +
49072 +   A read only version of the /etc/passwd file might be
49073 +   constructed as a symfile whose contents are as follows:
49074 +
49075 +   /etc/passwd/userlines/*
49076 +
49077 +   or
49078 +
49079 +   /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
49080 +
49081 +   or
49082 +
49083 +   /etc/passwd/userlines/(demidov+edward+reiser+root)
49084 +
49085 +   A symfile with contents
49086 +
49087 +   /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
49088 +
49089 +   will return when read
49090 +
49091 +   The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
49092 +
49093 +   and write of what has been read will not be possible to implement as
49094 +   an identity operation because there are no delimiters denoting the
49095 +   boundaries of what is to be written to what subfile.
49096 +
49097 +   Note that one could make this a read/write symfile if one specified
49098 +   delimiters, and the write method understood those delimiters delimited
49099 +   what was written to subfiles.
49100 +
49101 +   So, specifying the symfile in a manner that allows writes:
49102 +
49103 +   /etc/passwd/userlines/demidov+"(
49104 +   )+/etc/passwd/userlines/edward+"(
49105 +   )+/etc/passwd/userlines/reiser+"(
49106 +   )+/etc/passwd/userlines/root+"(
49107 +   )
49108 +
49109 +   or
49110 +
49111 +   /etc/passwd/userlines/(demidov+"(
49112 +   )+edward+"(
49113 +   )+reiser+"(
49114 +   )+root+"(
49115 +   ))
49116 +
49117 +   and the file demidov might be specified as:
49118 +
49119 +   /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
49120 +
49121 +   or
49122 +
49123 +   /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
49124 +
49125 +   Notice that if the file demidov has a carriage return in it, the
49126 +   parsing fails, but then if you put carriage returns in the wrong place
49127 +   in a normal /etc/passwd file it breaks things also.
49128 +
49129 +   Note that it is forbidden to have no text between two interpolations
49130 +   if one wants to be able to define what parts of a write go to what
49131 +   subfiles referenced in an interpolation.
49132 +
49133 +   If one wants to be able to add new lines by writing to the file, one
49134 +   must either write a custom plugin for /etc/passwd that knows how to
49135 +   name an added line, or one must use an invert, or one must use a more
49136 +   sophisticated symfile syntax that we are not planning to write for
49137 +   version 4.0.
49138 +*/
49139 +
49140 +
49141 +
49142 +
49143 +
49144 +
49145 +
49146 +
49147 +
49148 +
49149 +
49150 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/file/tail_conversion.c linux-2.6.8-rc3-a/fs/reiser4/plugin/file/tail_conversion.c
49151 --- linux-2.6.8-rc3/fs/reiser4/plugin/file/tail_conversion.c    1970-01-01 03:00:00.000000000 +0300
49152 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/file/tail_conversion.c  2004-08-05 21:20:53.004677088 +0400
49153 @@ -0,0 +1,719 @@
49154 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49155 +
49156 +#include "../../inode.h"
49157 +#include "../../super.h"
49158 +#include "../../page_cache.h"
49159 +#include "../../carry.h"
49160 +#include "../../lib.h"
49161 +#include "../../safe_link.h"
49162 +#include "funcs.h"
49163 +#include <linux/writeback.h>
49164 +
49165 +/* this file contains:
49166 +   tail2extent and extent2tail */
49167 +
49168 +
49169 +/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
49170 +reiser4_internal void
49171 +get_exclusive_access(unix_file_info_t *uf_info)
49172 +{
49173 +       assert("nikita-3028", schedulable());
49174 +       assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
49175 +       assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
49176 +       /*
49177 +        * "deadlock detection": sometimes we commit a transaction under
49178 +        * rw-semaphore on a file. Such commit can deadlock with another
49179 +        * thread that captured some block (hence preventing atom from being
49180 +        * committed) and waits on rw-semaphore.
49181 +        */
49182 +       assert("nikita-3361", get_current_context()->trans->atom == NULL);
49183 +       BUG_ON(get_current_context()->trans->atom != NULL);      /* same check as the assertion above; presumably kept for builds without assertions */
49184 +       LOCK_CNT_INC(inode_sem_w);      /* counted before down_write() is called */
49185 +       down_write(&uf_info->latch);    /* released by up_write() in drop_exclusive_access() */
49186 +       assert("nikita-3060", inode_ea_owner(uf_info) == NULL);
49187 +       assert("vs-1157", !ea_obtained(uf_info));
49188 +       ea_set(uf_info, current);       /* drop_exclusive_access() asserts that the owner is current */
49189 +       uf_info->exclusive_use = 1;
49190 +}
49191 +/* counterpart of get_exclusive_access(): resets owner and ->exclusive_use, then releases ->latch */
49192 +reiser4_internal void
49193 +drop_exclusive_access(unix_file_info_t *uf_info)
49194 +{
49195 +       assert("nikita-3060", inode_ea_owner(uf_info) == current);
49196 +       assert("vs-1158", ea_obtained(uf_info));
49197 +       ea_set(uf_info, 0);             /* reset before ->latch is released */
49198 +       uf_info->exclusive_use = 0;
49199 +       up_write(&uf_info->latch);
49200 +       assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
49201 +       assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
49202 +       LOCK_CNT_DEC(inode_sem_w);      /* pairs with LOCK_CNT_INC() in get_exclusive_access() */
49203 +}
49204 +
49205 +/* nonexclusive access to a file is acquired for read, write, readpage; it takes uf_info->latch for reading */
49206 +reiser4_internal void
49207 +get_nonexclusive_access(unix_file_info_t *uf_info)
49208 +{
49209 +       assert("nikita-3029", schedulable());
49210 +       down_read(&uf_info->latch);     /* released by up_read() in drop_nonexclusive_access() */
49211 +       LOCK_CNT_INC(inode_sem_r);
49212 +       assert("nikita-3060", inode_ea_owner(uf_info) == NULL);
49213 +       assert("vs-1159", !ea_obtained(uf_info));
49214 +}
49215 +/* counterpart of get_nonexclusive_access(): releases the read side of uf_info->latch */
49216 +reiser4_internal void
49217 +drop_nonexclusive_access(unix_file_info_t *uf_info)
49218 +{
49219 +       assert("nikita-3060", inode_ea_owner(uf_info) == NULL);
49220 +       assert("vs-1160", !ea_obtained(uf_info));
49221 +       up_read(&uf_info->latch);
49222 +       LOCK_CNT_DEC(inode_sem_r);      /* pairs with LOCK_CNT_INC() in get_nonexclusive_access() */
49223 +}
49224 +
49225 +/* part of tail2extent. Cuts from the tree everything of @inode between the
49226 +   key of byte @offset and the key of byte @offset + @count - 1 */
49227 +/* Audited by: green(2002.06.15) */
49228 +static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
49229 +{
49230 +       reiser4_key first_key;
49231 +       reiser4_key last_key;
49232 +       loff_t last_byte;
49233 +
49234 +       /* AUDIT: How about putting an assertion here, what would check
49235 +          all provided range is covered by tail items only? */
49236 +       last_byte = offset + count - 1;
49237 +
49238 +       /* the two keys differ only in their offset */
49239 +       key_by_inode_unix_file(inode, offset, &first_key);
49240 +       last_key = first_key;
49241 +       set_key_offset(&last_key, (__u64) last_byte);
49242 +
49243 +       return cut_tree(tree_by_inode(inode), &first_key, &last_key, inode, 1);
49244 +}
49245 +
49246 +/* drop page references held in @pages up to the first empty slot, clearing
49247 +   the slots on the way; slots after an empty one are asserted to be empty */
49248 +static void release_all_pages(struct page **pages, unsigned nr_pages)
49249 +{
49250 +       unsigned idx, j;
49251 +
49252 +       for (idx = 0; idx < nr_pages && pages[idx] != NULL; idx++) {
49253 +               page_cache_release(pages[idx]);
49254 +               pages[idx] = NULL;
49255 +       }
49256 +
49257 +       /* idx stopped at an empty slot (or at the end): check that nothing
49258 +          is left behind it */
49259 +       for (j = idx + 1; j < nr_pages; j ++)
49260 +               assert("vs-1620", pages[j] == NULL);
49261 +}
49262 +
49263 +/* part of tail2extent: tail items holding @count bytes, whose content is
49264 +   already copied into @pages, are cut and replaced with extents. Each page is
49265 +   added to the page cache of @inode, unlocked and passed to
49266 +   find_or_create_extent(); the first error stops the loop and is returned */
49267 +static int
49268 +replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
49269 +{
49270 +       int result;
49271 +       unsigned i;
49272 +       struct page *page;
49273 +       struct address_space *mapping;
49274 +       STORE_COUNTERS;
49275 +
49276 +       if (nr_pages == 0)
49277 +               return 0;
49278 +       assert("vs-596", pages[0]);
49279 +       mapping = inode->i_mapping;
49280 +       /* cut copied items */
49281 +       result = cut_formatting_items(inode, (loff_t) pages[0]->index << PAGE_CACHE_SHIFT, count);
49282 +       if (result)
49283 +               return result;
49284 +       CHECK_COUNTERS;
49285 +
49286 +       /* put into tree replacement for just removed items: extent item, namely */
49287 +       for (i = 0; i < nr_pages; i++) {
49288 +               page = pages[i];
49289 +               result = add_to_page_cache_lru(page, mapping, page->index, mapping_gfp_mask(mapping));
49290 +               if (result)
49291 +                       break;
49292 +               unlock_page(page);
49293 +               result = find_or_create_extent(page);
49294 +               if (result)
49295 +                       break;
49296 +               SetPageUptodate(page);
49297 +       }
49298 +       return result;
49299 +}
49300 +
49301 +#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
49302 +                                * items */
49303 +
49304 +static int
49305 +reserve_tail2extent_iteration(struct inode *inode)
49306 +{
49307 +       reiser4_block_nr unformatted_nodes;
49308 +       reiser4_tree *tree;
49309 +
49310 +       tree = tree_by_inode(inode);
49311 +
49312 +       /* number of unformatted nodes which will be created */
49313 +       unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
49314 +
49315 +       /*
49316 +        * space required for one iteration of extent->tail conversion:
49317 +        *
49318 +        *     1. kill N tail items
49319 +        *
49320 +        *     2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
49321 +        *
49322 +        *     3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
49323 +        *     extents) extent units.
49324 +        *
49325 +        *     4. drilling to the leaf level by coord_by_key()
49326 +        *
49327 +        *     5. possible update of stat-data
49328 +        *
49329 +        */
49330 +       grab_space_enable();
49331 +       return reiser4_grab_space
49332 +               (2 * tree->height +
49333 +                TAIL2EXTENT_PAGE_NUM +
49334 +                TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
49335 +                1 + estimate_one_insert_item(tree) +
49336 +                inode_file_plugin(inode)->estimate.update(inode),
49337 +                BA_CAN_COMMIT);
49338 +}
49339 +
49340 +/* this is used by tail2extent and extent2tail to detect where previous uncompleted conversion stopped */
49341 +static int
49342 +find_start(struct inode *object, reiser4_plugin_id id, __u64 *offset)
49343 +{
49344 +       int               result;
49345 +       lock_handle       lh;
49346 +       coord_t           coord;
49347 +       unix_file_info_t *ufo;
49348 +       int               found;
49349 +       reiser4_key       key;
49350 +
49351 +       ufo = unix_file_inode_data(object);
49352 +       init_lh(&lh);
49353 +       result = 0;
49354 +       found = 0;
49355 +       key_by_inode_unix_file(object, *offset, &key);
49356 +       do {
49357 +               init_lh(&lh);
49358 +               result = find_file_item_nohint(&coord, &lh, &key,
49359 +                                              ZNODE_READ_LOCK, object);
49360 +
49361 +               if (result == CBK_COORD_FOUND) {
49362 +                       if (coord.between == AT_UNIT) {
49363 +                               /*coord_clear_iplug(&coord);*/
49364 +                               result = zload(coord.node);
49365 +                               if (result == 0) {
49366 +                                       if (item_id_by_coord(&coord) == id)
49367 +                                               found = 1;
49368 +                                       else
49369 +                                               item_plugin_by_coord(&coord)->s.file.append_key(&coord, &key);
49370 +                                       zrelse(coord.node);
49371 +                               }
49372 +                       } else
49373 +                               result = RETERR(-ENOENT);
49374 +               }
49375 +               done_lh(&lh);
49376 +       } while (result == 0 && !found);
49377 +       *offset = get_key_offset(&key);
49378 +       return result;
49379 +}
49380 +
49381 +/* clear stat data's flag indicating that conversion is being converted */
49382 +static int
49383 +complete_conversion(struct inode *inode)
49384 +{
49385 +       int result;
49386 +
49387 +       all_grabbed2free();
49388 +       grab_space_enable();
49389 +       result = reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
49390 +                                   BA_CAN_COMMIT);
49391 +       if (result == 0) {
49392 +               inode_clr_flag(inode, REISER4_PART_CONV);
49393 +               result = reiser4_update_sd(inode);
49394 +       }
49395 +       if (result)
49396 +               warning("vs-1696", "Failed to clear converting bit of %llu: %i",
49397 +                       get_inode_oid(inode), result);
49398 +       return 0;
49399 +}
49400 +
49401 +reiser4_internal int
49402 +tail2extent(unix_file_info_t *uf_info)
49403 +{
49404 +       int result;
49405 +       reiser4_key key;        /* key of next byte to be moved to page */
49406 +       ON_DEBUG(reiser4_key tmp;)
49407 +       char *p_data;           /* data of page */
49408 +       unsigned page_off = 0,  /* offset within the page where to copy data */
49409 +        count;                 /* number of bytes of item which can be
49410 +                                * copied to page */
49411 +       struct page *pages[TAIL2EXTENT_PAGE_NUM];
49412 +       struct page *page;
49413 +       int done;               /* set to 1 when all file is read */
49414 +       char *item;
49415 +       int i;
49416 +       struct inode *inode;
49417 +       __u64 offset;
49418 +       int first_iteration;
49419 +       int bytes;
49420 +
49421 +       /* collect statistics on the number of tail2extent conversions */
49422 +       reiser4_stat_inc(file.tail2extent);
49423 +
49424 +       assert("nikita-3362", ea_obtained(uf_info));
49425 +       inode = unix_file_info_to_inode(uf_info);
49426 +       assert("nikita-3412", !IS_RDONLY(inode));
49427 +       assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
49428 +
49429 +       offset = 0;
49430 +       if (inode_get_flag(inode, REISER4_PART_CONV)) {
49431 +               /* find_start() doesn't need block reservation */
49432 +               result = find_start(inode, FORMATTING_ID, &offset);
49433 +               if (result == -ENOENT)
49434 +                       /* no extent found, everything is converted */
49435 +                       return 0;
49436 +               else if (result != 0)
49437 +                       /* some other error */
49438 +                       return result;
49439 +       }
49440 +
49441 +       /* get key of first byte of a file */
49442 +       key_by_inode_unix_file(inode, offset, &key);
49443 +
49444 +       done = 0;
49445 +       result = 0;
49446 +       first_iteration = 1;
49447 +       while (done == 0) {
49448 +               xmemset(pages, 0, sizeof (pages));
49449 +               all_grabbed2free();
49450 +               result = reserve_tail2extent_iteration(inode);
49451 +               if (result != 0)
49452 +                       goto out;
49453 +               if (first_iteration) {
49454 +                       inode_set_flag(inode, REISER4_PART_CONV);
49455 +                       reiser4_update_sd(inode);
49456 +                       first_iteration = 0;
49457 +               }
49458 +               bytes = 0;
49459 +               for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
49460 +                       assert("vs-598", (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
49461 +                       page = alloc_page(mapping_gfp_mask(inode->i_mapping));
49462 +                       if (!page) {
49463 +                               result = RETERR(-ENOMEM);
49464 +                               goto error;
49465 +                       }
49466 +
49467 +                       page->index = (unsigned long) (get_key_offset(&key) >> PAGE_CACHE_SHIFT);
49468 +                       /* usually when one is going to longterm lock znode (as
49469 +                          find_file_item does, for instance) he must not hold
49470 +                          locked pages. However, there is an exception for
49471 +                          case tail2extent. Pages appearing here are not
49472 +                          reachable to everyone else, they are clean, they do
49473 +                          not have jnodes attached so keeping them locked do
49474 +                          not risk deadlock appearance
49475 +                       */
49476 +                       assert("vs-983", !PagePrivate(page));
49477 +
49478 +                       for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
49479 +                               coord_t coord;
49480 +                               lock_handle lh;
49481 +
49482 +                               /* get next item */
49483 +                               /* FIXME: we might want to readahead here */
49484 +                               init_lh(&lh);
49485 +                               result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
49486 +                               if (cbk_errored(result) || result == CBK_COORD_NOTFOUND) {
49487 +                                       /* error happened of not items of file were found */
49488 +                                       done_lh(&lh);
49489 +                                       page_cache_release(page);
49490 +                                       goto error;
49491 +                               }
49492 +
49493 +                               if (coord.between == AFTER_UNIT) {
49494 +                                       /* this is used to detect end of file when inode->i_size can not be used */
49495 +                                       done_lh(&lh);
49496 +                                       done = 1;
49497 +                                       p_data = kmap_atomic(page, KM_USER0);
49498 +                                       xmemset(p_data + page_off, 0, PAGE_CACHE_SIZE - page_off);
49499 +                                       kunmap_atomic(p_data, KM_USER0);
49500 +                                       break;
49501 +                               }
49502 +
49503 +                               result = zload(coord.node);
49504 +                               if (result) {
49505 +                                       page_cache_release(page);
49506 +                                       done_lh(&lh);
49507 +                                       goto error;
49508 +                               }
49509 +                               assert("vs-562", owns_item_unix_file(inode, &coord));
49510 +                               assert("vs-856", coord.between == AT_UNIT);
49511 +                               assert("green-11", keyeq(&key, unit_key_by_coord(&coord, &tmp)));
49512 +                               item = ((char *)item_body_by_coord(&coord)) + coord.unit_pos;
49513 +
49514 +                               /* how many bytes to copy */
49515 +                               count = item_length_by_coord(&coord) - coord.unit_pos;
49516 +                               /* limit length of copy to end of page */
49517 +                               if (count > PAGE_CACHE_SIZE - page_off)
49518 +                                       count = PAGE_CACHE_SIZE - page_off;
49519 +
49520 +                               /* kmap/kunmap are necessary for pages which are not addressable by direct kernel
49521 +                                  virtual addresses */
49522 +                               p_data = kmap_atomic(page, KM_USER0);
49523 +                               /* copy item (as much as will fit starting from the beginning of the item) into the
49524 +                                  page */
49525 +                               memcpy(p_data + page_off, item, (unsigned) count);
49526 +                               kunmap_atomic(p_data, KM_USER0);
49527 +
49528 +                               page_off += count;
49529 +                               bytes += count;
49530 +                               set_key_offset(&key, get_key_offset(&key) + count);
49531 +
49532 +                               zrelse(coord.node);
49533 +                               done_lh(&lh);
49534 +                       } /* end of loop which fills one page by content of formatting items */
49535 +
49536 +                       if (page_off) {
49537 +                               /* something was copied into page */
49538 +                               pages[i] = page;
49539 +                       } else {
49540 +                               page_cache_release(page);
49541 +                               assert("vs-1648", done == 1);
49542 +                               break;
49543 +                       }
49544 +               } /* end of loop through pages of one conversion iteration */
49545 +
49546 +               if (i > 0) {
49547 +                       result = replace(inode, pages, i, bytes);
49548 +                       release_all_pages(pages, sizeof_array(pages));
49549 +                       if (result)
49550 +                               goto error;
49551 +                       /* throttle the conversion */
49552 +                       balance_dirty_pages_ratelimited(inode->i_mapping);
49553 +               }
49554 +       }
49555 +
49556 +       if (result == 0) {
49557 +               /* file is converted to extent items */
49558 +               assert("vs-1697", inode_get_flag(inode, REISER4_PART_CONV));
49559 +
49560 +               uf_info->container = UF_CONTAINER_EXTENTS;
49561 +               complete_conversion(inode);
49562 +       } else {
49563 +               /* conversion is not complete. Inode was already marked as
49564 +                * REISER4_PART_CONV and stat-data were updated at the first
49565 +                * iteration of the loop above. */
49566 + error:
49567 +               release_all_pages(pages, sizeof_array(pages));
49568 +               warning("nikita-2282", "Partial conversion of %llu: %i",
49569 +                       get_inode_oid(inode), result);
49570 +               print_inode("inode", inode);
49571 +       }
49572 +
49573 + out:
49574 +       all_grabbed2free();
49575 +       return result;
49576 +}
49577 +
49578 +
49579 +/* part of extent2tail. Page contains data which are to be put into tree by
49580 +   tail items. Use tail_write for this. flow is composed like in
49581 +   unix_file_write. The only difference is that data for writing are in
49582 +   kernel space */
49583 +/* Audited by: green(2002.06.15) */
49584 +static int
49585 +write_page_by_tail(struct inode *inode, struct page *page, unsigned count)
49586 +{
49587 +       flow_t f;
49588 +       hint_t hint;
49589 +       coord_t *coord;
49590 +       lock_handle lh;
49591 +       znode *loaded;
49592 +       item_plugin *iplug;
49593 +       int result;
49594 +
49595 +       result = 0;
49596 +
49597 +       assert("vs-1089", count);
49598 +       assert("vs-1647", inode_file_plugin(inode)->flow_by_inode == flow_by_inode_unix_file);
49599 +
49600 +       /* build flow */
49601 +       /* FIXME: do not kmap here */
49602 +       flow_by_inode_unix_file(inode, kmap(page), 0 /* not user space */ ,
49603 +                               count, (loff_t) (page->index << PAGE_CACHE_SHIFT), WRITE_OP, &f);
49604 +       iplug = item_plugin_by_id(FORMATTING_ID);
49605 +       init_lh(&lh);
49606 +       hint_init_zero(&hint, &lh);
49607 +       coord = &hint.coord.base_coord;
49608 +       while (f.length) {
49609 +               result = find_file_item_nohint(coord, &lh, &f.key, ZNODE_WRITE_LOCK, inode);
49610 +               if (IS_CBKERR(result))
49611 +                       break;
49612 +
49613 +               assert("vs-957", ergo(result == CBK_COORD_NOTFOUND, get_key_offset(&f.key) == 0));
49614 +               assert("vs-958", ergo(result == CBK_COORD_FOUND, get_key_offset(&f.key) != 0));
49615 +
49616 +               /*coord_clear_iplug(coord);*/
49617 +               result = zload(coord->node);
49618 +               if (result)
49619 +                       break;
49620 +               loaded = coord->node;
49621 +
49622 +               result = iplug->s.file.write(inode, &f, &hint, 1/*grabbed*/, how_to_write(&hint.coord, &f.key));
49623 +               zrelse(loaded);
49624 +               done_lh(&lh);
49625 +
49626 +               if (result == -E_REPEAT)
49627 +                       result = 0;
49628 +               else if (result)
49629 +                       break;
49630 +       }
49631 +
49632 +       done_lh(&lh);
49633 +       kunmap(page);
49634 +
49635 +       /* result of write is 0 or error */
49636 +       assert("vs-589", result <= 0);
49637 +       /* if result is 0 - all @count bytes is written completely */
49638 +       assert("vs-588", ergo(result == 0, f.length == 0));
49639 +       return result;
49640 +}
49641 +
49642 +/* flow insertion is limited by CARRY_FLOW_NEW_NODES_LIMIT of new nodes. Therefore, minimal number of bytes of flow
49643 +   which can be put into tree by one insert_flow is number of bytes contained in CARRY_FLOW_NEW_NODES_LIMIT nodes if
49644 +   they all are filled completely by one tail item. Fortunately, there is a one to one mapping between bytes of tail
49645 +   items and bytes of flow. If there were not, we would have to have special item plugin */
49646 +reiser4_internal int min_bytes_per_flow(void)
49647 +{
49648 +       assert("vs-1103", current_tree->nplug && current_tree->nplug->max_item_size);
49649 +       return CARRY_FLOW_NEW_NODES_LIMIT * current_tree->nplug->max_item_size();
49650 +}
49651 +
49652 +static int
49653 +reserve_extent2tail_iteration(struct inode *inode)
49654 +{
49655 +       reiser4_tree *tree;
49656 +
49657 +       tree = tree_by_inode(inode);
49658 +       /*
49659 +        * reserve blocks for (in this order):
49660 +        *
49661 +        *     1. removal of extent item
49662 +        *
49663 +        *     2. insertion of tail by insert_flow()
49664 +        *
49665 +        *     3. drilling to the leaf level by coord_by_key()
49666 +        *
49667 +        *     4. possible update of stat-data
49668 +        */
49669 +       grab_space_enable();
49670 +       return reiser4_grab_space
49671 +               (estimate_one_item_removal(tree) +
49672 +                estimate_insert_flow(tree->height) +
49673 +                1 + estimate_one_insert_item(tree) +
49674 +                inode_file_plugin(inode)->estimate.update(inode),
49675 +                BA_CAN_COMMIT);
49676 +}
49677 +
49678 +/* for every page of file: read page, cut part of extent pointing to this page,
49679 +   put data of page tree by tail item */
49680 +reiser4_internal int
49681 +extent2tail(unix_file_info_t *uf_info)
49682 +{
49683 +       int result;
49684 +       struct inode *inode;
49685 +       struct page *page;
49686 +       unsigned long num_pages, i;
49687 +       unsigned long start_page;
49688 +       reiser4_key from;
49689 +       reiser4_key to;
49690 +       unsigned count;
49691 +       __u64 offset;
49692 +
49693 +       /* collect statistics on the number of extent2tail conversions */
49694 +       reiser4_stat_inc(file.extent2tail);
49695 +
49696 +       assert("nikita-3362", ea_obtained(uf_info));
49697 +       inode = unix_file_info_to_inode(uf_info);
49698 +       assert("nikita-3412", !IS_RDONLY(inode));
49699 +       assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
49700 +
49701 +       offset = 0;
49702 +       if (inode_get_flag(inode, REISER4_PART_CONV)) {
49703 +               /* find_start() doesn't need block reservation */
49704 +               result = find_start(inode, EXTENT_POINTER_ID, &offset);
49705 +               if (result == -ENOENT)
49706 +                       /* no extent found, everything is converted */
49707 +                       return 0;
49708 +               else if (result != 0)
49709 +                       /* some other error */
49710 +                       return result;
49711 +       }
49712 +
49713 +       /* number of pages in the file */
49714 +       num_pages =
49715 +               (inode->i_size - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
49716 +       start_page = offset >> PAGE_CACHE_SHIFT;
49717 +
49718 +       key_by_inode_unix_file(inode, offset, &from);
49719 +       to = from;
49720 +
49721 +       result = 0;
49722 +       for (i = 0; i < num_pages; i++) {
49723 +               __u64 start_byte;
49724 +
49725 +               all_grabbed2free();
49726 +               result = reserve_extent2tail_iteration(inode);
49727 +               if (result != 0)
49728 +                       break;
49729 +               if (i == 0) {
49730 +                       inode_set_flag(inode, REISER4_PART_CONV);
49731 +                       reiser4_update_sd(inode);
49732 +               }
49733 +
49734 +               page = read_cache_page(inode->i_mapping,
49735 +                                      (unsigned) (i + start_page),
49736 +                                      readpage_unix_file/*filler*/, 0);
49737 +               if (IS_ERR(page)) {
49738 +                       result = PTR_ERR(page);
49739 +                       break;
49740 +               }
49741 +
49742 +               wait_on_page_locked(page);
49743 +
49744 +               if (!PageUptodate(page)) {
49745 +                       page_cache_release(page);
49746 +                       result = RETERR(-EIO);
49747 +                       break;
49748 +               }
49749 +
49750 +               /* cut part of file we have read */
49751 +               start_byte = (__u64) (i << PAGE_CACHE_SHIFT) + offset;
49752 +               set_key_offset(&from, start_byte);
49753 +               set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
49754 +               /*
49755 +                * cut_tree_object() returns -E_REPEAT to allow atom
49756 +                * commits during over-long truncates. But
49757 +                * extent->tail conversion should be performed in one
49758 +                * transaction.
49759 +                */
49760 +               result = cut_tree(tree_by_inode(inode), &from, &to, inode, 1);
49761 +
49762 +               if (result) {
49763 +                       page_cache_release(page);
49764 +                       break;
49765 +               }
49766 +
49767 +               /* put page data into tree via tail_write */
49768 +               count = PAGE_CACHE_SIZE;
49769 +               if (i == num_pages - 1)
49770 +                       count = (inode->i_size & ~PAGE_CACHE_MASK) ? : PAGE_CACHE_SIZE;
49771 +               result = write_page_by_tail(inode, page, count);
49772 +               if (result) {
49773 +                       page_cache_release(page);
49774 +                       break;
49775 +               }
49776 +
49777 +               /* release page */
49778 +               lock_page(page);
49779 +               /* page is already detached from jnode and mapping. */
49780 +               assert("vs-1086", page->mapping == NULL);
49781 +               assert("nikita-2690", (!PagePrivate(page) && page->private == 0));
49782 +               /* waiting for writeback completion with page lock held is
49783 +                * perfectly valid. */
49784 +               wait_on_page_writeback(page);
49785 +               drop_page(page);
49786 +               /* release reference taken by read_cache_page() above */
49787 +               page_cache_release(page);
49788 +       }
49789 +
49790 +       assert("vs-1260", (reiser4_inode_data(inode)->captured_eflushed == 0 &&
49791 +                          reiser4_inode_data(inode)->anonymous_eflushed == 0));
49792 +
49793 +       if (i == num_pages) {
49794 +               /* file is converted to formatted items */
49795 +               assert("vs-1698", inode_get_flag(inode, REISER4_PART_CONV));
49796 +
49797 +               uf_info->container = UF_CONTAINER_TAILS;
49798 +               complete_conversion(inode);
49799 +       } else {
49800 +               /* conversion is not complete. Inode was already marked as
49801 +                * REISER4_PART_CONV and stat-data were updated at the first
49802 +                * iteration of the loop above. */
49803 +               warning("nikita-2282",
49804 +                       "Partial conversion of %llu: %lu of %lu: %i",
49805 +                       get_inode_oid(inode), i, num_pages, result);
49806 +               print_inode("inode", inode);
49807 +       }
49808 +       all_grabbed2free();
49809 +       return result;
49810 +}
49811 +
49812 +/* this is used to find which conversion did not complete */
49813 +static int
49814 +find_first_item(struct inode *inode)
49815 +{
49816 +       coord_t coord;
49817 +       lock_handle lh;
49818 +       reiser4_key key;
49819 +       int result;
49820 +
49821 +       coord_init_zero(&coord);
49822 +       init_lh(&lh);
49823 +       key_by_inode_unix_file(inode, 0, &key);
49824 +       result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
49825 +       if (result == CBK_COORD_FOUND) {
49826 +               if (coord.between == AT_UNIT) {
49827 +                       /*coord_clear_iplug(&coord);*/
49828 +                       result = zload(coord.node);
49829 +                       if (result == 0) {
49830 +                               result = item_id_by_coord(&coord);
49831 +                               zrelse(coord.node);
49832 +                               if (result != EXTENT_POINTER_ID && result != FORMATTING_ID)
49833 +                                       result = RETERR(-EIO);
49834 +                       }
49835 +               } else
49836 +                       result = RETERR(-EIO);
49837 +       }
49838 +       done_lh(&lh);
49839 +       return result;
49840 +}
49841 +
49842 +/* exclusive access is obtained. File may be "partially converted" - that is file body may have both formatting and
49843 +   extent items. Find which conversion did not completed and complete */
49844 +reiser4_internal int
49845 +finish_conversion(struct inode *inode)
49846 +{
49847 +       int result;
49848 +
49849 +       if (inode_get_flag(inode, REISER4_PART_CONV)) {
49850 +               result = find_first_item(inode);
49851 +               if (result == EXTENT_POINTER_ID)
49852 +                       /* first item is extent, therefore there was incomplete tail2extent conversion. Complete it */
49853 +                       result = tail2extent(unix_file_inode_data(inode));
49854 +               else if (result == FORMATTING_ID)
49855 +                       /* first item is formatting item, therefore there was incomplete extent2tail
49856 +                          conversion. Complete it */
49857 +                       result = extent2tail(unix_file_inode_data(inode));
49858 +       } else
49859 +               result = 0;
49860 +       return result;
49861 +}
49862 +
49863 +/*
49864 +   Local variables:
49865 +   c-indentation-style: "K&R"
49866 +   mode-name: "LC"
49867 +   c-basic-offset: 8
49868 +   tab-width: 8
49869 +   fill-column: 120
49870 +   scroll-step: 1
49871 +   End:
49872 +*/
49873 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/flush/flush.alg linux-2.6.8-rc3-a/fs/reiser4/plugin/flush/flush.alg
49874 --- linux-2.6.8-rc3/fs/reiser4/plugin/flush/flush.alg   1970-01-01 03:00:00.000000000 +0300
49875 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/flush/flush.alg 2004-08-05 21:20:53.501572281 +0400
49876 @@ -0,0 +1,515 @@
49877 +
49878 +
49879 +
49880 +
49881 +
49882 +
49883 +
49884 +
49885 +
49886 +
49887 +/*
49888 +
49889 +The use of atomic commits dramatically impacts the use of LRU as the
49890 +basis for page cleaning (though using it for clean page discarding is
49891 +still effective.)
49892 +
49893 +The use of write clustering dramatically impacts the use of LRU as the
49894 +basis for page cleaning.
49895 +
49896 +ReiserFS v4 uses both.
49897 +
49898 +We will not use LRU in v4.0 of reiserfs, and then in later versions we
49899 +may gradually partially reintroduce it.
49900 +
49901 +Optimizations to make on flush:
49902 +
49903 +* block (re)allocation
49904 +
49905 +* tail conversion
49906 +
49907 +* extent formation
49908 +
49909 +* node repacking
49910 +
49911 +* wandering log definition
49912 +
49913 +
49914 +Memory Pressure:
49915 +
49916 +There are several kinds of memory pressure:
49917 +
49918 +* general lack of memory for processes requesting it
49919 +
49920 +* too much dirty memory
49921 +
49922 +* dirty memory is too old and should be more permanently preserved on disk
49923 +
49924 +* particular page needs freeing for DMA setup
49925 +
49926 +[All programmers should understand that I expect strict observance of the
49927 +following taboo: you will not add an unnecessary copying of data, while coding
49928 +this.  I cannot understand why there is resistance to this, but I keep seeing
49929 +code which ignores this.]
49930 +
49931 +
49932 +Unlike clean pages, dirty memory must be written to disk before being
49933 +freed for other use.  It also may require processing that will require
49934 +more memory before it can be cleaned by writing it to disk.  This
49935 +processing makes it vulnerable to deadlocks in extreme cases.
49936 +Precisely reserving enough memory to allow that extra processing
49937 +without deadlock is often difficult.
49938 +
49939 +reiser4 limits its usage of dirty pages to 75%, which is enough to
49940 +ensure that the extra processing will not cause the system to run out
49941 +of memory.  More safeguards are possible, including letting commit
49942 +choose to swap, but we will wait until this rather simple mechanism
49943 +has a problem in practice before elaborating it.
49944 +
49945 +reiser4 supports the reiserfs_flush(page) command for cleaning pages
49946 +
49947 +If the Linux VM system was properly designed, it would be based upon
49948 +memory sub-managers each reflecting some common principles of
49949 +responding to pressure that is in proportion to their size.  Linux
49950 +used to have multiple caches, these caches had no understanding of how
49951 +large they were, they made no attempt to proportionalize the pressure
49952 +upon each cache, and their management was generally badly designed
49953 +without any effective mechanism for ensuring that the caches did not
49954 +get out of balance with each other.  From this it was concluded that
49955 +there should be only one unified cache, rather than designing an
49956 +effective mechanism for expressing to each subcache a sense of
49957 +pressure in proportion to the size of the subcache, and requiring that
49958 +the subcache embody some effective mechanism for responding to that
49959 +sense of pressure.
49960 +
49961 +The unified cache is indeed better than badly designed multiple
49962 +caches.  It does however perform very poorly at handling caches of
49963 +objects that are not page sized.
49964 +
49965 +Linus says it already has a subcache manager design, we just need to
49966 +use writepage.  Ok, fine, we will be the first subcache.
49967 +
49968 +So, understand that in reiserfs, writepage does not write pages, it
49969 +pressures the reiserfs memory manager, and understand that the place a
49970 +page has on the various mm lists does not determine when it gets
49971 +written out, it merely determines when it triggers the next pressure
49972 +on the reiserfs memory manager.
49973 +
49974 +What reiser4 does is interpret pressure on a page as pressure on a
49975 +subcache within reiserfs.
49976 +
49977 +Write clustering, transaction commits, objects whose value to cache is
49978 +out of proportion to the number of bytes consumed by them, caches
49979 +whose working set size and pattern of access is known to the
49980 +application, and those occasions when other factors knowable to the
49981 +filesystem or application but not the OS generally are important to
49982 +deciding what to eject, and objects much smaller than a page with no
49983 +correlation of references for objects on the same page, or larger than
49984 +a page with a complete correlation between their pages, are good examples of when cache
49985 +submanagers should be employed.
49986 +
49987 + */
49988 +
49989 +/* You should read crypt.c and then return. */
49990 +/* You should read block_alloc.c and then return. */
49991 +
49992 +current_leaf = find_leftmost_leaf_in_slum();
49993 +/* current_leaf is locked */
49994 +parent = parent(current_leaf);
49995 +/* parent is locked */
49996 +
49997 +if (is_relocate_set(current_leaf))
49998 +{
49999 +  dirty(parent);
50000 +}
50001 +
50002 +if (is_dirty(parent))
50003 +{
50004 +  squeeze_level_left(parent);
50005 +  /* this can create an enormous recursive chain that could overflow
50006 +   the kernel stack, hmmmm..... */
50007 +  flush_all_other_child_slums(parent, min_key(current_leaf));
50008 +}
50009 +else
50010 +{
50011 +  unlock(parent);
50012 +}
50013 +/* parent is unlocked by squeeze_level_left, and squeezing may have
50014 +   changed the parent of current_leaf */
50015 +
50016 +parent = parent(current_leaf);
50017 +/* parent is locked */
50018 +if (leftmost_child(parent) == current_leaf)
50019 +     allocate(parent);
50020 +
50021 +/* ok, now we are ready to proceed through slum on the leaf level */
50022 +next_leaf = get_right_neighbor_in_slum_level(current_leaf);
50023 +/* next_leaf is locked or null */
50024 +
50025 +/* need to review locking in the below */
50026 +while(next_leaf)
50027 +{
50028 +  if (is_formatted(current_leaf) && is_formatted(next_leaf))
50029 +    {
50030 +      squeeze_left(current_leaf, next_leaf);
50031 +      if (is_empty(next_leaf))
50032 +       {
50033 +         delete_node(next_leaf);
50034 +         next_leaf = get_right_neighbor_in_slum_level(current_leaf);
50035 +         check_and_handle_parent_change();
50036 +         continue;
50037 +       }
50038 +    }
50039 +  if (is_unformatted(current_leaf))
50040 +    /* allocate or reallocate */
50041 +    allocate_extent_in_parent_if_needed(current_leaf);
50042 +  /* the above may change the parent */
50043 +  check_and_handle_parent_change();
50044 +  allocate(current_leaf);
50045 +  next_leaf = get_right_neighbor_in_slum_level(current_leaf);
50046 +  check_and_handle_parent_change();
50047 +
50048 +
50049 +}
50050 +
50051 +/* this means squeeze it as well as allocate it */
50052 +handle_non_leaf_end_of_slum();
50053 +
50054 +check_and_handle_parent_change()
50055 +{
50056 +if ( (new_parent = parent(current_leaf)) != parent)
50057 +  squeeze_left(parent, new_parent(parent));
50058 + else
50059 +   return;
50060 +
50061 +/* the line above can change who the parent is so retest... */
50062 +  if((new_parent = parent(current_leaf)) != parent)
50063 +    {
50064 +      parent = new_parent;
50065 +      if (leftmost_child(parent) != current_leaf)
50066 +       reiser_panic("reiser-2071: this needs recoding to handle this case");
50067 +      allocate_node(parent);
50068 +                               /* allocating other ancestors left for josh */
50069 +
50070 +      /* our new parent might not be well packed, and we want
50071 +        it to be well packed even if our slum never reaches its edge
50072 +        so we... */
50073 +      squeeze_left(parent, right_neighbor(parent));
50074 +    }
50075 +}
50076 +
50077 +
50078 +################################################################################
50079 +
50080 +
50081 +The Problem:
50082 +
50083 +We need to know the relocate set in order to perform a left-to-right
50084 +parent-first allocate-and-squeeze traversal over a dirty sub-tree.  We
50085 +could make this decision during the allocate-and-squeeze pass, but in
50086 +that case we would discover a node is dirty when we have already
50087 +passed over its position in the parent-first ordering.  In other words,
50088 +we would discover this information too late to be useful.
50089 +
50090 +The Several-Pass Solution:
50091 +
50092 +It is possible to construct this relocate information at flush time by
50093 +scanning the tree, but it means at least two passes over the tree.
50094 +Using several passes has an advantage: we can then choose overwrite
50095 +when the "optimal location" is part of the atom's own preserve set.
50096 +This requires knowing the (partial) set of blocks being flushed before
50097 +allocation begins.  This strategy was initially proposed by Hans,
50098 +before we realized it would require multiple passes.
50099 +
50100 +The Solution:
50101 +
50102 +Maintain an active count of dirty children for each node.  This allows
50103 +us to mark a node dirty whenever its dirty count becomes >= 2 because
50104 +at that point overwriting the parent reduces the total number of
50105 +blocks written to disk.  How much of a counter is needed?  In order to
50106 +keep track of additions and subtractions to this count, a counter at
50107 +the same size as our znode's c_count field is needed.  If this value
50108 +were only ever incremented, then we could use a single bit (0 = no
50109 +dirty children, 1 = single dirty child, otherwise mark dirty &
50110 +relocate).  But since a node may have dirty children added while
50111 +flushes are active (it can happen, right?) this requires more than just
50112 +a bit.  I worry about the complexity of maintaining this dirty count,
50113 +but I fear that the parent-first allocation policy will not succeed
50114 +without knowing before-hand all the dirty nodes it must consider.
50115 +
50116 +The Algorithm:
50117 +
50118 +Given the above assumption, that nodes are marked dirty whenever they
50119 +should be relocated (i.e., that the flush algorithm does not make this
50120 +decision as part of its pass over the tree).
50121 +
50122 +Starting from some leaf node, find the greatest dirty ancestor, defined as the
50123 +least (i.e., lowest-level) ancestor of a node without a dirty parent.  The
50124 +greatest dirty ancestor will be overwritten, therefore its preceding node in
50125 +the parent-first order should not be considered.
50126 +
50127 +    [Dead text: If the greatest dirty ancestor is NOT the leftmost child of
50128 +    its own parent (and not the root node), there may be a dirty
50129 +    parent-first-ordered node in a subtree to the left of this one.  In those
50130 +    cases, from the greatest dirty ancestor, find the leftmost in-memory
50131 +    descendant.  If the leftmost descendant is dirty, consider its left
50132 +    neighbor.  If the neighbor is also dirty, repeat the steps of this
50133 +    paragraph starting at that node (i.e., find the new greatest dirty
50134 +    ancestor).]
50135 +
50136 +Pseudo-code for this is:
50137 +
50138 +/* Starting from a node, find the greatest dirty ancestor. */
50139 +jnode* greatest_dirty_ancestor (jnode *node)
50140 +{
50141 +  while (! is_root_node (node)) {
50142 +     znode *parent = get_parent (node);
50143 +
50144 +     if (! znode_is_dirty (parent)) {
50145 +         break;
50146 +     }
50147 +
50148 +     node = parent;
50149 +  }
50150 +  return node;
50151 +}
50152 +
50153 +Now we have found the greatest dirty ancestor from which to begin allocating
50154 +and squeezing.  From this point we will traverse all descendants of the
50155 +greatest dirty ancestor, in parent-first order, allocating blocks and
50156 +squeezing nodes in the following order.  Squeezing must be performed in a
50157 +bottom-up, left-to-right order, whereas allocation occurs in parent-first
50158 +order.  The following pseudo-code accomplishes both at once:
50159 +
50160 +########################################################################
50161 +
50162 +Problems with above to be addressed:
50163 +
50164 +Nikita suggests squeezing all the formatted nodes of a twig before allocating
50165 +its extents, thereby increasing room for extents to "inflate".
50166 +
50167 +########################################################################
50168 +
50169 +/* A function to find the parent-first-preceder of a node, although
50170 + * there may not be enough nodes in memory to actually compute this.
50171 + * In that case, pick something else.  If node is leftmost child of
50172 + * its parent, return its parent's block number.  Otherwise if node
50173 + * is a leaf node, return its left neighbor.  Finally, return the
50174 + * block number of the parent's left neighbor's rightmost descendent
50175 + * (which may not be in memory).  In the actual implementation of the
50176 + * parent-first traversal below, we can optimize this (because we
50177 + * know the result most of the time). */
50178 +blocknr parent_first_preceder_of (jnode *node) { ... }
50179 +
50180 +/* A parent-first recursive tree traversal, allocate and squeeze.
50181 + * This is called on the greatest dirty ancestor of a region to be
50182 + * flushed.
50183 + */
50184 +void allocate_and_squeeze_parent_first (jnode *node)
50185 +{
50186 +  /* Stop recursion if it's not dirty, meaning don't allocate children either.
50187 +   * Children might be dirty but there is an overwrite below this level
50188 +   * or else this node would be dirty. */
50189 +  if (! is_dirty (node)) {
50190 +    return;
50191 +  }
50192 +
50193 +  /* Allocate (parent) first. */
50194 +  allocate_node (node, parent_first_preceder_of (node));
50195 +
50196 +  if (jnode_is_unformatted (node)) {
50197 +    /* We got here because the parent (twig) of an unformatted node is
50198 +     * not being relocated.  Otherwise this recursion does not descend
50199 +     * to unformatted nodes. */
50200 +     return;
50201 +  }
50202 +
50203 +  /* Recursive case: */
50204 +  if (jnode_get_level (node) > LEAF_LEVEL) {
50205 +
50206 +    for (each_item_left_to_right (node)) {
50207 +
50208 +      if (is_extent_item (item) && extent_item_is_dirty (item)) {
50209 +         allocate_extent_item (item);
50210 +      } else if (is_internal_item (item) && jnode_is_dirty (internal_item_child (item))) {
50211 +         allocate_and_squeeze_parent_first (internal_item_child (item));
50212 +      }
50213 +    }
50214 +  }
50215 +
50216 +  /* Squeeze a node: note that this makes the "one big memcpy"
50217 +   * approach somewhat more difficult, but it's still possible. */
50218 +  while (not_empty (node) && jnode_is_formatted (node->right) && is_dirty (node->right)) {
50219 +
50220 +    item = first_item_of (node->right);
50221 +
50222 +    if (is_extent_item (item) && extent_item_is_dirty (item)) {
50223 +       allocate_extent_item_into (item, node);
50224 +    } else if (can_shift_into (item, node)) {
50225 +       shift_item (item, node);
50226 +    }
50227 +  }
50228 +}
50229 +
50230 +########################################################################
50231 +########################################################################
50232 +
50233 +########################################################################
50234 +########################################################################
50235 +
50236 +Hans says:
50237 +
50238 +Relocate parent if leftmost child is also relocated
50239 +
50240 +Relocate if leftmost-child of parent.
50241 +
50242 +Ignore the "always relocate children if two children of a node are dirty"
50243 +idea.
50244 +
50245 +Rather than scan left at the leaf level, why not jump to parent, check
50246 +left-most child dirty, and stop?
50247 +
50248 +########################################################################
50249 +########################################################################
50250 +
50251 +Dead pseudo code, older stuff:
50252 +
50253 +########################################################################
50254 +########################################################################
50255 +
50256 +
50257 +Problem: The root of a subtree gets overwritten, so the subtree to the left
50258 +will not follow in parent-first order.  That would simplify things.  Killed
50259 +this code:
50260 +
50261 +/* Starting at a node, find the greatest dirty parent, then see if it
50262 + * has a preceding dirty node on the leaf of the subtree to its left. */
50263 +void find_maximal_dirty_ancestor (jnode *node)
50264 +{
50265 + repeat:
50266 +  node = greatest_dirty_ancestor (node)
50267 +
50268 +  /* End search at the root node or if the node is the leftmost child
50269 +   * of its parent, in which case the left-of-leftmost-descendent does
50270 +   * not precede it in parent first order, its parent does in that
50271 +   * case. */
50272 +  if (! is_root_node (node) && ! leftmost_child_of_parent (node)) {
50273 +     jnode *godown = node;
50274 +
50275 +     while (jnode_get_level (godown) > LEAF_LEVEL) {
50276 +       /* Iterate downward as long as leftmost nodes in memory (note:
50277 +        * they don't have to be dirty). */
50278 +       jnode *child = leftmost_child (godown);
50279 +
50280 +       if (child == NULL) {
50281 +         return node;
50282 +       }
50283 +
50284 +       godown = child;
50285 +    }
50286 +
50287 +    /* Reached the leftmost descendant of the maximal dirty node,
50288 +     * now see if its left is dirty.  Otherwise return. */
50289 +    if ((godown = godown->left) == NULL || ! jnode_is_dirty (godown)) {
50290 +      return node;
50291 +    }
50292 +
50293 +    /* At this point, "godown" precedes "node" in the parent-first
50294 +     * traversal, so search for a new maximal dirty node. */
50295 +    node = godown;
50296 +    goto repeat;
50297 +  }
50298 +}
50299 +
50300 +/* Allocate and squeeze starting at the greatest dirty ancestor
50301 + * described above.  Repeat in rightward direction for adjacent
50302 + * subtrees.
50303 + */
50304 +void allocate_and_squeeze_parent_first (jnode *node)
50305 +{
50306 +  jnode *right;
50307 +
50308 +repeat:
50309 +  /* Do one sub-tree */
50310 +  allocate_and_squeeze_parent_first_subtree (node);
50311 +
50312 +  /* Now try to repeat to the right. */
50313 +  right = get_right_neighbor (node);
50314 +
50315 +  if (right != NULL && jnode_is_dirty (right)) {
50316 +     node = greatest_dirty_ancestor (right);
50317 +     goto repeat;
50318 +  }
50319 +}
50320 +
50321 +
50322 +/* The crap below was my first attempt to write this iteratively. */
50323 +
50324 +
50325 +
50326 +
50327 +
50328 +jnode *maximal_dirty_ancestor = ...;                   /* Computed using above algorithm */
50329 +jnode *left_edge[MAX_LEVELS];                          /* Vertical edge of left-to-right scan */
50330 +int    top_edge = jnode_get_level (node) - LEAF_LEVEL; /* Highest index to the left_edge array --
50331 +                                                        * by subtracting LEAF_LEVEL it becomes 0-origin */
50332 +
50333 +/* Initialize left_edge array entries to NULL, set top edge */
50334 +left_edge[top_edge] = maximal_dirty_ancestor;
50335 +
50336 +/* For each node above the leaf level, set the child in left_edge */
50337 +for (int level = top_edge; level >= 1; level -= 1) {
50338 +
50339 +  jnode *parent = left_edge[level];
50340 +
50341 +  /* Find its leftmost dirty child. */
50342 +  jnode *child  = leftmost_dirty_child (parent);
50343 +
50344 +  /* It's possible that a dirty node could have no dirty children,
50345 +   * in which case leave the lower edges NULL. */
50346 +  if (child == NULL) { break; }
50347 +
50348 +  left_edge[level-1] = child;
50349 +}
50350 +
50351 +/* To store the lowest dirty entry in left_edge[]. */
50352 +int current_level = 0;
50353 +
50354 +/* Allocate each node in the left edge. */
50355 +for (int level = top_edge; level >= 0 && left_edge[level] != NULL; level -= 1) {
50356 +
50357 +  jnode *node = left_edge[level];
50358 +
50359 +  /* Allocate this node... */
50360 +  allocate_node (node, parent_first_preceder_of (node));
50361 +
50362 +  current_level = level;
50363 +}
50364 +
50365 +/* Now starting with the current level, squeeze and allocate until finished. */
50366 +while (current_level <= top_level) {
50367 +
50368 +  jnode *current_node = left_edge[current_level];
50369 +
50370 +  if (jnode_is_formatted (current_node)) {
50371 +
50372 +    do {
50373 +
50374 +      /* Shift as much as possible. */
50375 +      while (node_has_room_to_shift_into (current_node)) {
50376 +        if (is_twig_level (current_node)) {
50377 +          shift_left (current_node, current_node->right);
50378 +        } else {
50379 +          allocate_extents_and_shift_left (current_node, current_node->right);
50380 +        }
50381 +      }
50382 +
50383 +      /* Once it has been tightly packed, allocate it. */
50384 +      allocate_node (current_node, parent_first_preceder_of (node));
50385 +
50386 +      current_node = current_node->right;
50387 +  }
50388 +
50389 +  current_level += 1;
50390 +}
50391 +</pre>
50392 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/hash.c linux-2.6.8-rc3-a/fs/reiser4/plugin/hash.c
50393 --- linux-2.6.8-rc3/fs/reiser4/plugin/hash.c    1970-01-01 03:00:00.000000000 +0300
50394 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/hash.c  2004-08-05 21:20:53.280618885 +0400
50395 @@ -0,0 +1,347 @@
50396 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
50397 + * reiser4/README */
50398 +
50399 +/* Hash functions */
50400 +
50401 +#include "../debug.h"
50402 +#include "plugin_header.h"
50403 +#include "plugin.h"
50404 +#include "../super.h"
50405 +#include "../inode.h"
50406 +#include "../plugin/dir/dir.h"
50407 +
50408 +#include <linux/types.h>
50409 +
50410 +/* old rupasov (yura) hash: reads @name as a string of decimal digits (byte - 48), most significant first */
50411 +static __u64
50412 +hash_rupasov(const unsigned char *name /* name to hash */ ,
50413 +            int len /* @name's length */ )
50414 +{
50415 +       int i;
50416 +       int j;
50417 +       int pow;
50418 +       __u64 a;
50419 +       __u64 c;
50420 +
50421 +       assert("nikita-672", name != NULL);
50422 +       assert("nikita-673", len >= 0);
50423 +
50424 +       for (pow = 1, i = 1; i < len; ++i)      /* pow = 10^(len - 1): weight of the first byte */
50425 +               pow = pow * 10; /* NOTE(review): int pow cannot hold 10^(len - 1) for len > 10 if int is 32 bits -- confirm */
50426 +
50427 +       if (len == 1)
50428 +               a = name[0] - 48;       /* same value the else branch would give, since pow == 1 here */
50429 +       else
50430 +               a = (name[0] - 48) * pow;       /* name[0] is read even when len == 0 */
50431 +
50432 +       for (i = 1; i < len; ++i) {     /* remaining bytes, each weighted by 10^(len - 1 - i) */
50433 +               c = name[i] - 48;
50434 +               for (pow = 1, j = i; j < len - 1; ++j)
50435 +                       pow = pow * 10;
50436 +               a = a + c * pow;
50437 +       }
50438 +       for (; i < 40; ++i) {   /* i >= len here, so the inner loop never runs and pow stays 1 */
50439 +               c = '0' - 48;   /* 0 when '0' == 48 (ASCII), so this loop then leaves a unchanged */
50440 +               for (pow = 1, j = i; j < len - 1; ++j)
50441 +                       pow = pow * 10;
50442 +               a = a + c * pow;
50443 +       }
50444 +
50445 +       for (; i < 256; ++i) {  /* adds the index i itself for every remaining i up to 255 (pow stays 1) */
50446 +               c = i;
50447 +               for (pow = 1, j = i; j < len - 1; ++j)
50448 +                       pow = pow * 10;
50449 +               a = a + c * pow;
50450 +       }
50451 +
50452 +       a = a << 7;     /* low 7 bits of the result are always zero */
50453 +       return a;
50454 +}
50455 +
50456 +/* r5 hash: folds every byte of @name up to the terminating NUL into a 64-bit accumulator */
50457 +static __u64
50458 +hash_r5(const unsigned char *name /* name to hash */ ,
50459 +       int len UNUSED_ARG /* @name's length; only checked by the assertion below */ )
50460 +{
50461 +       __u64 a = 0;
50462 +
50463 +       assert("nikita-674", name != NULL);
50464 +       assert("nikita-675", len >= 0);
50465 +
50466 +       while (*name) {         /* stops at the first zero byte, not after @len bytes */
50467 +               a += *name << 4;
50468 +               a += *name >> 4;
50469 +               a *= 11;
50470 +               name++;
50471 +       }
50472 +       return a;
50473 +}
50474 +
50475 +/* Keyed 32-bit hash function using TEA in a Davies-Meyer function
50476 +     H0 = Key
50477 +     Hi = E Mi(Hi-1) + Hi-1
50478 +
50479 +   (see Applied Cryptography, 2nd edition, p448).
50480 +
50481 +   Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
50482 +
50483 +   Jeremy has agreed to the contents of reiserfs/README. -Hans
50484 +
50485 +   This code was blindly upgraded to __u64 by s/__u32/__u64/g, so all state below is 64 bits wide.
50486 +*/
50487 +static __u64
50488 +hash_tea(const unsigned char *name /* name to hash */ ,
50489 +        int len /* @name's length */ )
50490 +{
50491 +       __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };     /* only k[0] and k[1] are read below */
50492 +
50493 +       __u64 h0 = k[0], h1 = k[1];     /* accumulated hash state */
50494 +       __u64 a, b, c, d;               /* data words fed to TEACORE */
50495 +       __u64 pad;
50496 +       int i;
50497 +
50498 +       assert("nikita-676", name != NULL);
50499 +       assert("nikita-677", len >= 0);
50500 +
50501 +#define DELTA 0x9E3779B9u
50502 +#define FULLROUNDS 10          /* 32 is overkill, 16 is strong crypto */
50503 +#define PARTROUNDS 6           /* 6 gets complete mixing */
50504 +
50505 +/* a, b, c, d - data; h0, h1 - accumulated hash */
50506 +#define TEACORE(rounds)                                                        \
50507 +       do {                                                            \
50508 +               __u64 sum = 0;                                          \
50509 +               int n = rounds;                                         \
50510 +               __u64 b0, b1;                                           \
50511 +                                                                       \
50512 +               b0 = h0;                                                \
50513 +               b1 = h1;                                                \
50514 +                                                                       \
50515 +               do                                                      \
50516 +               {                                                       \
50517 +                       sum += DELTA;                                   \
50518 +                       b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
50519 +                       b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
50520 +               } while(--n);                                           \
50521 +                                                                       \
50522 +               h0 += b0;                                               \
50523 +               h1 += b1;                                               \
50524 +       } while(0)
50525 +
50526 +       pad = (__u64) len | ((__u64) len << 8);
50527 +       pad |= pad << 16;       /* for len < 256 this replicates len into each of the low four bytes of pad */
50528 +
50529 +       while (len >= 16) {     /* full 16-byte chunks: four words, each built from 4 bytes, low byte first */
50530 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << 16 | (__u64) name[3] << 24;
50531 +               b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << 16 | (__u64) name[7] << 24;
50532 +               c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] << 16 | (__u64) name[11] << 24;
50533 +               d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14] << 16 | (__u64) name[15] << 24;
50534 +
50535 +               TEACORE(PARTROUNDS);
50536 +
50537 +               len -= 16;
50538 +               name += 16;
50539 +       }
50540 +
50541 +       if (len >= 12) {        /* tail of 12..15 bytes: a, b, c are full words, d is pad plus leftover bytes */
50542 +               //assert(len < 16);
50543 +               if (len >= 16)  /* cannot hold: the loop above only exits with len < 16 */
50544 +                       *(int *) 0 = 0; /* deliberate NULL store standing in for the commented-out assert */
50545 +
50546 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << 16 | (__u64) name[3] << 24;
50547 +               b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << 16 | (__u64) name[7] << 24;
50548 +               c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] << 16 | (__u64) name[11] << 24;
50549 +
50550 +               d = pad;
50551 +               for (i = 12; i < len; i++) {    /* shift the leftover bytes into d from the low end */
50552 +                       d <<= 8;
50553 +                       d |= name[i];
50554 +               }
50555 +       } else if (len >= 8) {  /* tail of 8..11 bytes: a, b full; c is pad plus leftover bytes; d is pad */
50556 +               //assert(len < 12);
50557 +               if (len >= 12)  /* cannot hold: excluded by the branch above */
50558 +                       *(int *) 0 = 0; /* deliberate NULL store standing in for the commented-out assert */
50559 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << 16 | (__u64) name[3] << 24;
50560 +               b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << 16 | (__u64) name[7] << 24;
50561 +
50562 +               c = d = pad;
50563 +               for (i = 8; i < len; i++) {
50564 +                       c <<= 8;
50565 +                       c |= name[i];
50566 +               }
50567 +       } else if (len >= 4) {  /* tail of 4..7 bytes: a full; b is pad plus leftover bytes; c, d are pad */
50568 +               //assert(len < 8);
50569 +               if (len >= 8)   /* cannot hold: excluded by the branch above */
50570 +                       *(int *) 0 = 0; /* deliberate NULL store standing in for the commented-out assert */
50571 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << 16 | (__u64) name[3] << 24;
50572 +
50573 +               b = c = d = pad;
50574 +               for (i = 4; i < len; i++) {
50575 +                       b <<= 8;
50576 +                       b |= name[i];
50577 +               }
50578 +       } else {                /* tail of 0..3 bytes: a is pad plus leftover bytes; b, c, d are pad */
50579 +               //assert(len < 4);
50580 +               if (len >= 4)   /* cannot hold: excluded by the branch above */
50581 +                       *(int *) 0 = 0; /* deliberate NULL store standing in for the commented-out assert */
50582 +               a = b = c = d = pad;
50583 +               for (i = 0; i < len; i++) {
50584 +                       a <<= 8;
50585 +                       a |= name[i];
50586 +               }
50587 +       }
50588 +
50589 +       TEACORE(FULLROUNDS);    /* the tail block always gets the full number of rounds */
50590 +
50591 +/*     return 0;*/
50592 +       return h0 ^ h1;
50593 +
50594 +}
50595 +
50596 +/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
50597 +
50598 +   See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
50599 +
50600 +   Excerpts:
50601 +
50602 +     FNV hashes are designed to be fast while maintaining a low collision
50603 +     rate.
50604 +
50605 +     [This version also seems to preserve lexicographical order locally.]
50606 +
50607 +     FNV hash algorithms and source code have been released into the public
50608 +     domain.
50609 +
50610 +*/
50611 +static __u64
50612 +hash_fnv1(const unsigned char *name /* name to hash */ ,
50613 +         int len UNUSED_ARG /* @name's length; only checked by the assertion below */ )
50614 +{
50615 +       unsigned long long a = 0xcbf29ce484222325ull;   /* 64 bit FNV-1 offset basis */
50616 +       const unsigned long long fnv_64_prime = 0x100000001b3ull;
50617 +
50618 +       assert("nikita-678", name != NULL);
50619 +       assert("nikita-679", len >= 0);
50620 +
50621 +       /* FNV-1 hash each octet of @name, stopping at the first zero byte */
50622 +       for (; *name; ++name) {
50623 +               /* multiply by the 64 bit FNV magic prime mod 2^64 */
50624 +               a *= fnv_64_prime;
50625 +               /* xor the bottom with the current octet */
50626 +               a ^= (unsigned long long) (*name);
50627 +       }
50628 +       /* return our new hash value */
50629 +       return a;
50630 +}
50631 +
50632 +/* degenerate hash function used to simplify testing of non-unique key
50633 +   handling: every name hashes to the same constant */
50634 +static __u64
50635 +hash_deg(const unsigned char *name UNUSED_ARG /* name to hash; only passed to the trace macro */ ,
50636 +        int len UNUSED_ARG /* @name's length; not used */ )
50637 +{
50638 +       ON_TRACE(TRACE_DIR, "Hashing %s\n", name);
50639 +       return 0xc0c0c0c010101010ull;   /* same value for every input */
50640 +}
50641 +
50642 +static int
50643 +change_hash(struct inode * inode, reiser4_plugin * plugin)     /* install hash @plugin on @inode; 0 or -ENOTEMPTY/error */
50644 +{
50645 +       int result;
50646 +
50647 +       assert("nikita-3503", inode != NULL);
50648 +       assert("nikita-3504", plugin != NULL);
50649 +
50650 +       assert("nikita-3505", is_reiser4_inode(inode));
50651 +       assert("nikita-3506", inode_dir_plugin(inode) != NULL);
50652 +       assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
50653 +
50654 +       result = 0;     /* nothing to do if @inode already uses the plugin with this id */
50655 +       if (inode_hash_plugin(inode) == NULL ||
50656 +           inode_hash_plugin(inode)->h.id != plugin->h.id) {
50657 +               if (is_dir_empty(inode) == 0)   /* the hash is only replaced when is_dir_empty() returns 0 */
50658 +                       result = plugin_set_hash(&reiser4_inode_data(inode)->pset,
50659 +                                                &plugin->hash);
50660 +               else
50661 +                       result = RETERR(-ENOTEMPTY);    /* otherwise refuse the change */
50662 +
50663 +       }
50664 +       return result;
50665 +}
50666 +
50667 +static reiser4_plugin_ops hash_plugin_ops = {  /* referenced by every entry of hash_plugins[] */
50668 +       .init     = NULL,
50669 +       .load     = NULL,
50670 +       .save_len = NULL,
50671 +       .save     = NULL,
50672 +       .change   = change_hash /* the only operation provided */
50673 +};
50674 +
50675 +/* hash plugins, indexed by hash id; each entry pairs the common plugin header with its hash function */
50676 +hash_plugin hash_plugins[LAST_HASH_ID] = {
50677 +       [RUPASOV_HASH_ID] = {
50678 +               .h = {
50679 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
50680 +                       .id = RUPASOV_HASH_ID,
50681 +                       .pops = &hash_plugin_ops,
50682 +                       .label = "rupasov",
50683 +                       .desc = "Original Yura's hash",
50684 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO}
50685 +               ,
50686 +               .hash = hash_rupasov
50687 +       },
50688 +       [R5_HASH_ID] = {
50689 +               .h = {
50690 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
50691 +                       .id = R5_HASH_ID,
50692 +                       .pops = &hash_plugin_ops,
50693 +                       .label = "r5",
50694 +                       .desc = "r5 hash",
50695 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO}
50696 +               ,
50697 +               .hash = hash_r5
50698 +       },
50699 +       [TEA_HASH_ID] = {
50700 +               .h = {
50701 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
50702 +                       .id = TEA_HASH_ID,
50703 +                       .pops = &hash_plugin_ops,
50704 +                       .label = "tea",
50705 +                       .desc = "tea hash",
50706 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO}
50707 +               ,
50708 +               .hash = hash_tea
50709 +       },
50710 +       [FNV1_HASH_ID] = {
50711 +               .h = {
50712 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
50713 +                       .id = FNV1_HASH_ID,
50714 +                       .pops = &hash_plugin_ops,
50715 +                       .label = "fnv1",
50716 +                       .desc = "fnv1 hash",
50717 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO}
50718 +               ,
50719 +               .hash = hash_fnv1
50720 +       },
50721 +       [DEGENERATE_HASH_ID] = {
50722 +               .h = {
50723 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
50724 +                       .id = DEGENERATE_HASH_ID,
50725 +                       .pops = &hash_plugin_ops,
50726 +                       .label = "degenerate hash",
50727 +                       .desc = "Degenerate hash: only for testing",
50728 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO}
50729 +               ,
50730 +               .hash = hash_deg
50731 +       }
50732 +};
50733 +
50734 +/* Make Linus happy.
50735 +   Local variables:
50736 +   c-indentation-style: "K&R"
50737 +   mode-name: "LC"
50738 +   c-basic-offset: 8
50739 +   tab-width: 8
50740 +   fill-column: 120
50741 +   End:
50742 +*/
50743 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/acl.h linux-2.6.8-rc3-a/fs/reiser4/plugin/item/acl.h
50744 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/acl.h        1970-01-01 03:00:00.000000000 +0300
50745 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/acl.h      2004-08-05 21:20:53.135649463 +0400
50746 @@ -0,0 +1,64 @@
50747 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50748 +
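50749 +/* Directory entry.  NOTE(review): this file is acl.h, yet this description and the include guard below name the directory entry header -- confirm this is intended. */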
50750 +
50751 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
50752 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
50753 +
50754 +#include "../../forward.h"
50755 +#include "../../dformat.h"
50756 +#include "../../kassign.h"
50757 +#include "../../key.h"
50758 +
50759 +#include <linux/fs.h>
50760 +#include <linux/dcache.h>      /* for struct dentry */
50761 +
50762 +typedef struct directory_entry_format {
50763 +       /* key of object stat-data. It's not necessary to store whole
50764 +          key here, because it's always key of stat-data, so minor
50765 +          packing locality and offset can be omitted here. But this
50766 +          relies on particular key allocation scheme for stat-data, so,
50767 +          for extensibility's sake, whole key can be stored here.
50768 +
50769 +          We store key as array of bytes, because we don't want 8-byte
50770 +          alignment of dir entries.
50771 +       */
50772 +       obj_key_id id;
50773 +       /* file name. Null terminated string, stored inline right after @id (zero-length trailing array). */
50774 +       d8 name[0];
50775 +} directory_entry_format;
50776 +
50777 +void print_de(const char *prefix, coord_t * coord);
50778 +int extract_key_de(const coord_t * coord, reiser4_key * key);
50779 +int update_key_de(const coord_t * coord, const reiser4_key * key, lock_handle * lh);
50780 +char *extract_name_de(const coord_t * coord, char *buf);
50781 +unsigned extract_file_type_de(const coord_t * coord);
50782 +int add_entry_de(struct inode *dir, coord_t * coord,
50783 +                lock_handle * lh, const struct dentry *name, reiser4_dir_entry_desc * entry);
50784 +int rem_entry_de(struct inode *dir, const struct qstr * name, coord_t * coord, lock_handle * lh, reiser4_dir_entry_desc * entry);
50785 +int max_name_len_de(const struct inode *dir);
50786 +
50787 +
50788 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
50789 +
50790 +char *extract_dent_name(const coord_t * coord,
50791 +                       directory_entry_format *dent, char *buf);
50792 +
50793 +#if REISER4_LARGE_KEY
50794 +#define DE_NAME_BUF_LEN (24)
50795 +#else
50796 +#define DE_NAME_BUF_LEN (16)
50797 +#endif
50798 +
50799 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
50800 +#endif
50801 +
50802 +/* Make Linus happy.
50803 +   Local variables:
50804 +   c-indentation-style: "K&R"
50805 +   mode-name: "LC"
50806 +   c-basic-offset: 8
50807 +   tab-width: 8
50808 +   fill-column: 120
50809 +   End:
50810 +*/
50811 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/blackbox.c linux-2.6.8-rc3-a/fs/reiser4/plugin/item/blackbox.c
50812 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/blackbox.c   1970-01-01 03:00:00.000000000 +0300
50813 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/blackbox.c 2004-08-05 21:20:52.973683625 +0400
50814 @@ -0,0 +1,142 @@
50815 +/* Copyright 2003 by Hans Reiser, licensing governed by
50816 + * reiser4/README */
50817 +
50818 +/* Black box item implementation */
50819 +
50820 +#include "../../forward.h"
50821 +#include "../../debug.h"
50822 +#include "../../dformat.h"
50823 +#include "../../kassign.h"
50824 +#include "../../coord.h"
50825 +#include "../../tree.h"
50826 +#include "../../lock.h"
50827 +
50828 +#include "blackbox.h"
50829 +#include "item.h"
50830 +#include "../plugin.h"
50831 +
50832 +/* insert into @tree a black box item with @key whose body is @length bytes taken from @data */
50833 +reiser4_internal int
50834 +store_black_box(reiser4_tree *tree,
50835 +               const reiser4_key *key, void *data, int length)
50836 +{
50837 +       int result;
50838 +       reiser4_item_data idata;
50839 +       coord_t coord;
50840 +       lock_handle lh;
50841 +
50842 +       xmemset(&idata, 0, sizeof idata);
50843 +
50844 +       idata.data = data;      /* item body comes from caller's buffer */
50845 +       idata.user = 0;
50846 +       idata.length = length;
50847 +       idata.iplug = item_plugin_by_id(BLACK_BOX_ID);
50848 +
50849 +       init_lh(&lh);
50850 +       result = insert_by_key(tree, key,
50851 +                              &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
50852 +
50853 +       assert("nikita-3413",   /* on success item at @coord is exactly @length bytes long */
50854 +              ergo(result == 0,
50855 +                   WITH_COORD(&coord, item_length_by_coord(&coord) == length)));
50856 +
50857 +       done_lh(&lh);
50858 +       return result;  /* whatever insert_by_key() returned */
50859 +}
50860 +/* find black box item by @key in @tree; copy its body into @data and the key of found unit into @key */
50861 +reiser4_internal int
50862 +load_black_box(reiser4_tree *tree,
50863 +              reiser4_key *key, void *data, int length, int exact)
50864 +{
50865 +       int result;
50866 +       coord_t coord;
50867 +       lock_handle lh;
50868 +
50869 +       init_lh(&lh);
50870 +       result = coord_by_key(tree, key,
50871 +                             &coord, &lh, ZNODE_READ_LOCK,
50872 +                             exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
50873 +                             LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
50874 +
50875 +       if (result == 0) {
50876 +               int ilen;
50877 +
50878 +               result = zload(coord.node);
50879 +               if (result == 0) {
50880 +                       ilen = item_length_by_coord(&coord);
50881 +                       if (ilen <= length) {   /* item fits into caller's buffer */
50882 +                               xmemcpy(data, item_body_by_coord(&coord), ilen);
50883 +                               unit_key_by_coord(&coord, key);
50884 +                       } else if (exact) {
50885 +                               /*
50886 +                                * item is larger than buffer provided by the user. Only
50887 +                                * issue a warning and fail with -EIO if @exact is set. If
50888 +                                * @exact is false, we are iterating over all safe-links
50889 +                                * and here we are reaching the end of the iteration: 0 is
50890 +                                * returned with @data and @key left unmodified.
50891 +                                */
50892 +                               warning("nikita-3415",
50893 +                                       "Wrong black box length: %i > %i",
50894 +                                       ilen, length);
50895 +                               result = RETERR(-EIO);
50896 +                       }
50897 +                       zrelse(coord.node);
50898 +               }
50899 +       }
50900 +
50901 +       done_lh(&lh);
50902 +       return result;
50903 +
50904 +}
50905 +/* overwrite the first @length bytes of the black box item found by exact @key in @tree with @data */
50906 +reiser4_internal int
50907 +update_black_box(reiser4_tree *tree,
50908 +                const reiser4_key *key, void *data, int length)
50909 +{
50910 +       int result;
50911 +       coord_t coord;
50912 +       lock_handle lh;
50913 +
50914 +       init_lh(&lh);
50915 +       result = coord_by_key(tree, key,
50916 +                             &coord, &lh, ZNODE_READ_LOCK,
50917 +                             FIND_EXACT,
50918 +                             LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
50919 +       if (result == 0) {
50920 +               int ilen;
50921 +               /* NOTE(review): node is only read-locked, yet item body is overwritten below -- confirm */
50922 +               result = zload(coord.node);
50923 +               if (result == 0) {
50924 +                       ilen = item_length_by_coord(&coord);
50925 +                       if (length <= ilen) {   /* new data must fit into existing item */
50926 +                               xmemcpy(item_body_by_coord(&coord), data, length);
50927 +                       } else {
50928 +                               warning("nikita-3437",
50929 +                                       "Wrong black box length: %i < %i",
50930 +                                       ilen, length);
50931 +                               result = RETERR(-EIO);
50932 +                       }
50933 +                       zrelse(coord.node);
50934 +               }
50935 +       }
50936 +
50937 +       done_lh(&lh);
50938 +       return result;
50939 +
50940 +}
50941 +/* cut the single-key range [@key, @key] out of @tree; returns result of cut_tree() */
50942 +reiser4_internal int kill_black_box(reiser4_tree *tree, const reiser4_key *key)
50943 +{
50944 +       return cut_tree(tree, key, key, NULL, 1);
50945 +}
50946 +
50947 +
50948 +/* Make Linus happy.
50949 +   Local variables:
50950 +   c-indentation-style: "K&R"
50951 +   mode-name: "LC"
50952 +   c-basic-offset: 8
50953 +   tab-width: 8
50954 +   fill-column: 120
50955 +   End:
50956 +*/
50957 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/blackbox.h linux-2.6.8-rc3-a/fs/reiser4/plugin/item/blackbox.h
50958 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/blackbox.h   1970-01-01 03:00:00.000000000 +0300
50959 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/blackbox.h 2004-08-05 21:20:52.868705768 +0400
50960 @@ -0,0 +1,33 @@
50961 +/* Copyright 2003 by Hans Reiser, licensing governed by
50962 + * reiser4/README */
50963 +
50964 +/* "Black box" item: fixed-width entry containing user supplied data */
50965 +
50966 +#if !defined( __FS_REISER4_BLACK_BOX_H__ )
50967 +#define __FS_REISER4_BLACK_BOX_H__
50968 +
50969 +#include "../../forward.h"
50970 +#include "../../dformat.h"
50971 +#include "../../kassign.h"
50972 +#include "../../key.h"
50973 +
50974 +extern int store_black_box(reiser4_tree *tree,
50975 +                          const reiser4_key *key, void *data, int length);
50976 +extern int load_black_box(reiser4_tree *tree,
50977 +                         reiser4_key *key, void *data, int length, int exact);
50978 +extern int kill_black_box(reiser4_tree *tree, const reiser4_key *key);
50979 +extern int update_black_box(reiser4_tree *tree,
50980 +                           const reiser4_key *key, void *data, int length);
50981 +
50982 +/* __FS_REISER4_BLACK_BOX_H__ */
50983 +#endif
50984 +
50985 +/* Make Linus happy.
50986 +   Local variables:
50987 +   c-indentation-style: "K&R"
50988 +   mode-name: "LC"
50989 +   c-basic-offset: 8
50990 +   tab-width: 8
50991 +   fill-column: 120
50992 +   End:
50993 +*/
50994 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/cde.c linux-2.6.8-rc3-a/fs/reiser4/plugin/item/cde.c
50995 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/cde.c        1970-01-01 03:00:00.000000000 +0300
50996 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/cde.c      2004-08-05 21:20:52.866706190 +0400
50997 @@ -0,0 +1,1073 @@
50998 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50999 +
51000 +/* Directory entry implementation */
51001 +
51002 +/* DESCRIPTION:
51003 +
51004 +   This is "compound" directory item plugin implementation. This directory
51005 +   item type is compound (as opposed to the "simple directory item" in
51006 +   fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
51007 +   entries.
51008 +
51009 +   The reason behind this decision is disk space efficiency: all directory
51010 +   entries inside the same directory have identical fragment in their
51011 +   keys. This, of course, depends on key assignment policy. In our default key
51012 +   assignment policy, all directory entries have the same locality which is
51013 +   equal to the object id of their directory.
51014 +
51015 +   Composing directory item out of several directory entries for the same
51016 +   directory allows us to store said key fragment only once. That is, this is
51017 +   some ad hoc form of key compression (stem compression) that is implemented
51018 +   here, because general key compression is not supposed to be implemented in
51019 +   v4.0.
51020 +
51021 +   Another decision that was made regarding all directory item plugins, is
51022 +   that they will store entry keys unaligned. This is for the sake of disk
51023 +   space efficiency again.
51024 +
51025 +   It should be noted, that storing keys unaligned increases CPU consumption,
51026 +   at least on some architectures.
51027 +
51028 +   Internal on-disk structure of the compound directory item is the following:
51029 +
51030 +        HEADER          cde_item_format.        Here number of entries is stored.
51031 +        ENTRY_HEADER_0  cde_unit_header.        Here part of entry key and
51032 +        ENTRY_HEADER_1                          offset of entry body are stored.
51033 +        ENTRY_HEADER_2                         (basically two last parts of key)
51034 +        ...
51035 +        ENTRY_HEADER_N
51036 +        ENTRY_BODY_0    directory_entry_format. Here part of stat data key and
51037 +        ENTRY_BODY_1                            NUL-terminated name are stored.
51038 +        ENTRY_BODY_2                           (part of stat-data key in the
51039 +                                                sense that since all SDs have
51040 +                                                zero offset, this offset is not
51041 +                                                stored on disk).
51042 +        ...
51043 +        ENTRY_BODY_N
51044 +
51045 +   When it comes to the balancing, each directory entry in compound directory
51046 +   item is unit, that is, something that can be cut from one item and pasted
51047 +   into another item of the same type. Handling of unit cut and paste is major
51048 +   reason for the complexity of code below.
51049 +
51050 +*/
51051 +
51052 +#include "../../forward.h"
51053 +#include "../../debug.h"
51054 +#include "../../dformat.h"
51055 +#include "../../kassign.h"
51056 +#include "../../key.h"
51057 +#include "../../coord.h"
51058 +#include "sde.h"
51059 +#include "cde.h"
51060 +#include "item.h"
51061 +#include "../node/node.h"
51062 +#include "../plugin.h"
51063 +#include "../../znode.h"
51064 +#include "../../carry.h"
51065 +#include "../../tree.h"
51066 +#include "../../inode.h"
51067 +
51068 +#include <linux/fs.h>          /* for struct inode */
51069 +#include <linux/dcache.h>      /* for struct dentry */
51070 +#include <linux/quotaops.h>
51071 +
51072 +#if 0
51073 +#define CHECKME(coord)                                         \
51074 +({                                                             \
51075 +       const char *message;                                    \
51076 +       coord_t dup;                                            \
51077 +                                                               \
51078 +       coord_dup_nocheck(&dup, (coord));                       \
51079 +       dup.unit_pos = 0;                                       \
51080 +       assert("nikita-2871", cde_check(&dup, &message) == 0);  \
51081 +})
51082 +#else
51083 +#define CHECKME(coord) noop
51084 +#endif
51085 +
51086 +
51087 +/* return body of the item at @coord, viewed as compound directory item header (cde_item_format) */
51088 +static inline cde_item_format *
51089 +formatted_at(const coord_t * coord)
51090 +{
51091 +       assert("nikita-1282", coord != NULL);
51092 +       return item_body_by_coord(coord);
51093 +}
51094 +
51095 +/* return header of @idx-th entry of the item at @coord; @idx is not range-checked here */
51096 +static inline cde_unit_header *
51097 +header_at(const coord_t * coord /* coord of item */ ,
51098 +         int idx /* index of unit */ )
51099 +{
51100 +       assert("nikita-1283", coord != NULL);
51101 +       return &formatted_at(coord)->entry[idx];
51102 +}
51103 +
51104 +/* return number of units in compound directory item at @coord, as recorded in the item header */
51105 +static int
51106 +units(const coord_t * coord /* coord of item */ )
51107 +{
51108 +       return d16tocpu(&formatted_at(coord)->num_of_entries);
51109 +}
51110 +
51111 +/* return offset of the body of @idx-th entry in @coord; for @idx == units() the item length is returned */
51112 +static unsigned int
51113 +offset_of(const coord_t * coord /* coord of item */ ,
51114 +         int idx /* index of unit */ )
51115 +{
51116 +       if (idx < units(coord))
51117 +               return d16tocpu(&header_at(coord, idx)->offset);
51118 +       else if (idx == units(coord))
51119 +               return item_length_by_coord(coord);     /* one past the last entry */
51120 +       else
51121 +               impossible("nikita-1308", "Wrong idx");
51122 +       return 0;       /* fallback value for the invalid @idx case */
51123 +}
51124 +
51125 +/* set offset of the body of @idx-th entry in @coord; @offset is stored truncated to 16 bits */
51126 +static void
51127 +set_offset(const coord_t * coord /* coord of item */ ,
51128 +          int idx /* index of unit */ ,
51129 +          unsigned int offset /* new offset */ )
51130 +{
51131 +       cputod16((__u16) offset, &header_at(coord, idx)->offset);
51132 +}
51133 +/* add signed @delta to the stored 16-bit offset of the body of @idx-th entry in @coord */
51134 +static void
51135 +adj_offset(const coord_t * coord /* coord of item */ ,
51136 +          int idx /* index of unit */ ,
51137 +          int delta /* offset change */ )
51138 +{
51139 +       d16  *doffset;
51140 +       __u16 offset;
51141 +
51142 +       doffset = &header_at(coord, idx)->offset;
51143 +       offset = d16tocpu(doffset);
51144 +       offset += delta;        /* arithmetic is done in 16 bits */
51145 +       cputod16((__u16) offset, doffset);
51146 +}
51147 +
51148 +/* return pointer to @offset-th byte from the beginning of the item body at @coord; no bounds check */
51149 +static char *
51150 +address(const coord_t * coord /* coord of item */ ,
51151 +       int offset)
51152 +{
51153 +       return ((char *) item_body_by_coord(coord)) + offset;
51154 +}
51155 +
51156 +/* return pointer to the body of @idx-th entry in @coord, located through offset_of(@coord, @idx) */
51157 +static directory_entry_format *
51158 +entry_at(const coord_t * coord /* coord of
51159 +                                * item */ ,
51160 +        int idx /* index of unit */ )
51161 +{
51162 +       return (directory_entry_format *) address(coord, (int) offset_of(coord, idx));
51163 +}
51164 +
51165 +/* return index of the unit referenced by @coord (its ->unit_pos) */
51166 +static int
51167 +idx_of(const coord_t * coord /* coord of item */ )
51168 +{
51169 +       assert("nikita-1285", coord != NULL);
51170 +       return coord->unit_pos;
51171 +}
51172 +
51173 +/* return index of the first entry in @coord with key >= @entry_key; RETERR(-ENOENT) if all entries are smaller */
51174 +static int
51175 +find(const coord_t * coord /* coord of item */ ,
51176 +     const reiser4_key * entry_key /* key to look for */ ,
51177 +     cmp_t * last /* result of last comparison */ )
51178 +{
51179 +       int entries;
51180 +
51181 +       int left;       /* lower bound of the search window */
51182 +       int right;      /* upper bound of the search window */
51183 +
51184 +       cde_unit_header *header;
51185 +
51186 +       assert("nikita-1295", coord != NULL);
51187 +       assert("nikita-1296", entry_key != NULL);
51188 +       assert("nikita-1297", last != NULL);
51189 +       /* NOTE(review): *last is left untouched when the item has no entries */
51190 +       entries = units(coord);
51191 +       left = 0;
51192 +       right = entries - 1;
51193 +       while (right - left >= REISER4_SEQ_SEARCH_BREAK) {      /* binary search while window is wide */
51194 +               int median;
51195 +
51196 +               median = (left + right) >> 1;
51197 +
51198 +               header = header_at(coord, median);
51199 +               *last = de_id_key_cmp(&header->hash, entry_key);
51200 +               switch (*last) {
51201 +               case LESS_THAN:
51202 +                       left = median;
51203 +                       break;
51204 +               case GREATER_THAN:
51205 +                       right = median;
51206 +                       break;
51207 +               case EQUAL_TO: {        /* step back to the first entry with equal key */
51208 +                       do {
51209 +                               median --;
51210 +                               header --;
51211 +                       } while (median >= 0 &&
51212 +                                de_id_key_cmp(&header->hash,
51213 +                                              entry_key) == EQUAL_TO);
51214 +                       return median + 1;
51215 +               }
51216 +               }
51217 +       }
51218 +       header = header_at(coord, left);        /* finish with sequential scan from @left */
51219 +       for (; left < entries; ++ left, ++ header) {
51220 +               prefetch(header + 1);
51221 +               *last = de_id_key_cmp(&header->hash, entry_key);
51222 +               if (*last != LESS_THAN)
51223 +                       break;
51224 +       }
51225 +       if (left < entries)
51226 +               return left;
51227 +       else
51228 +               return RETERR(-ENOENT);
51229 +
51230 +}
51231 +
51232 +/* expand @coord so as to accommodate insertion of @no new entries starting
51233 +   from @pos, with total bodies size @data_size. Always returns 0. */
51234 +static int
51235 +expand_item(const coord_t * coord /* coord of item */ ,
51236 +           int pos /* unit position */ , int no        /* number of new
51237 +                                                        * units*/ ,
51238 +           int size /* offset of the end of existing item data */ ,
51239 +           unsigned int data_size      /* free space already reserved
51240 +                                        * in the item for insertion */ )
51241 +{
51242 +       int entries;
51243 +       cde_unit_header *header;
51244 +       char *dent;
51245 +       int i;
51246 +
51247 +       assert("nikita-1310", coord != NULL);
51248 +       assert("nikita-1311", pos >= 0);
51249 +       assert("nikita-1312", no > 0);
51250 +       assert("nikita-1313", data_size >= no * sizeof (directory_entry_format));
51251 +       assert("nikita-1343", item_length_by_coord(coord) >= (int) (size + data_size + no * sizeof *header));
51252 +
51253 +       entries = units(coord);
51254 +
51255 +       if (pos == entries)
51256 +               dent = address(coord, size);    /* appending: new body goes after existing data */
51257 +       else
51258 +               dent = (char *) entry_at(coord, pos);
51259 +       /* place where new header will be in */
51260 +       header = header_at(coord, pos);
51261 +       /* free space for new entry headers: shift everything from header @pos up to @size */
51262 +       xmemmove(header + no, header, (unsigned) (address(coord, size) - (char *) header));
51263 +       /* if adding to the end initialise first new header */
51264 +       if (pos == entries) {
51265 +               set_offset(coord, pos, (unsigned) size);
51266 +       }
51267 +
51268 +       /* adjust entry pointer and size */
51269 +       dent = dent + no * sizeof *header;
51270 +       size += no * sizeof *header;
51271 +       /* free space for new entries */
51272 +       xmemmove(dent + data_size, dent, (unsigned) (address(coord, size) - dent));
51273 +
51274 +       /* increase counter */
51275 +       entries += no;
51276 +       cputod16((__u16) entries, &formatted_at(coord)->num_of_entries);
51277 +
51278 +       /* bodies of [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
51279 +          bytes. NOTE(review): headers ( pos, pos + no ) are not touched -- ok for no == 1 */
51280 +       for (i = 0; i <= pos; ++i)
51281 +               adj_offset(coord, i, no * sizeof *header);
51282 +       /* [ pos + no ... +\infty ) entries were shifted by ( no *
51283 +          sizeof *header + data_size ) bytes */
51284 +       for (i = pos + no; i < entries; ++i)
51285 +               adj_offset(coord, i, no * sizeof *header + data_size);
51286 +       return 0;
51287 +}
51288 +
51289 +/* make room in item for new @entry: find its position, store it in *@pos, expand item there */
51290 +static int
51291 +expand(const coord_t * coord /* coord of item */ ,
51292 +       cde_entry * entry /* entry to insert */ ,
51293 +       int len /* length of @entry data */ ,
51294 +       int *pos /* position to insert */ ,
51295 +       reiser4_dir_entry_desc * dir_entry      /* parameters for new
51296 +                                                * entry */ )
51297 +{
51298 +       cmp_t cmp_res;
51299 +       int   datasize;
51300 +
51301 +       *pos = find(coord, &dir_entry->key, &cmp_res);
51302 +       if (*pos < 0)   /* no entry with key not less than the new one: append */
51303 +               *pos = units(coord);
51304 +
51305 +       datasize = sizeof (directory_entry_format);
51306 +       if (is_longname(entry->name->name, entry->name->len))
51307 +               datasize += entry->name->len + 1;       /* name and its terminating NUL */
51308 +
51309 +       expand_item(coord, *pos, 1, item_length_by_coord(coord) - len, datasize);
51310 +       return 0;       /* result of expand_item() is not looked at */
51311 +}
51312 +
51313 +/* paste body of @entry into item: fill header and body of the entry at @pos. Always returns 0. */
51314 +static int
51315 +paste_entry(const coord_t * coord /* coord of item */ ,
51316 +           cde_entry * entry /* new entry */ ,
51317 +           int pos /* position to insert */ ,
51318 +           reiser4_dir_entry_desc * dir_entry  /* parameters for
51319 +                                                * new entry */ )
51320 +{
51321 +       cde_unit_header *header;
51322 +       directory_entry_format *dent;
51323 +       const char *name;
51324 +       int   len;
51325 +
51326 +       header = header_at(coord, pos);
51327 +       dent = entry_at(coord, pos);
51328 +
51329 +       build_de_id_by_key(&dir_entry->key, &header->hash);
51330 +       build_inode_key_id(entry->obj, &dent->id);
51331 +       /* Long names are stored in the entry body, NUL-terminated. Copy
51332 +          exactly @len bytes and add the terminator explicitly: expand()
51333 +          reserves room for len + 1 bytes only, whereas strcpy() used
51334 +          here before would run past it if the name were not
51335 +          NUL-terminated exactly at @len.
51336 +
51337 +          For names that are not long nothing is copied here. */
51338 +       name = entry->name->name;
51339 +       len  = entry->name->len;
51340 +       if (is_longname(name, len)) {
51341 +               xmemcpy(dent->name, name, (unsigned) len);
51342 +               cputod8(0, &dent->name[len]);
51343 +       }
51344 +       return 0;
51345 +}
51346 +
51347 +/* estimate how much space is necessary in item to insert/paste set of entries
51348 +   described in @data. Result is returned and also stored in @data->length. */
51349 +reiser4_internal int
51350 +estimate_cde(const coord_t * coord /* coord of item */ ,
51351 +            const reiser4_item_data * data /* parameters for new item */ )
51352 +{
51353 +       cde_entry_data *e;
51354 +       int result;
51355 +       int i;
51356 +
51357 +       e = (cde_entry_data *) data->data;
51358 +
51359 +       assert("nikita-1288", e != NULL);
51360 +       assert("nikita-1289", e->num_of_entries >= 0);
51361 +
51362 +       if (coord == NULL)
51363 +               /* insert: new item also needs the item header */
51364 +               result = sizeof (cde_item_format);
51365 +       else
51366 +               /* paste: only space for the new entries is counted */
51367 +               result = 0;
51368 +
51369 +       result += e->num_of_entries *
51370 +               (sizeof (cde_unit_header) + sizeof (directory_entry_format));
51371 +       for (i = 0; i < e->num_of_entries; ++i) {
51372 +               const char *name;
51373 +               int   len;
51374 +
51375 +               name = e->entry[i].name->name;
51376 +               len  = e->entry[i].name->len;
51377 +               assert("nikita-2054", strlen(name) == len);
51378 +               if (is_longname(name, len))
51379 +                       result += len + 1;      /* long name plus one more byte */
51380 +       }
51381 +       ((reiser4_item_data *) data)->length = result;  /* const is cast away to record the estimate */
51382 +       return result;
51383 +}
51384 +
51385 +/* ->nr_units() method for this item plugin: number of entries in the item at @coord. */
51386 +reiser4_internal pos_in_node_t
51387 +nr_units_cde(const coord_t * coord /* coord of item */ )
51388 +{
51389 +       return units(coord);
51390 +}
51391 +
51392 +/* ->unit_key() method for this item plugin: build in @key the key of the entry @coord points to. */
51393 +reiser4_internal reiser4_key *
51394 +unit_key_cde(const coord_t * coord /* coord of item */ ,
51395 +            reiser4_key * key /* resulting key */ )
51396 +{
51397 +       assert("nikita-1452", coord != NULL);
51398 +       assert("nikita-1345", idx_of(coord) < units(coord));
51399 +       assert("nikita-1346", key != NULL);
51400 +
51401 +       item_key_by_coord(coord, key);  /* item key supplies the directory id */
51402 +       extract_key_from_de_id(extract_dir_id_from_key(key), &header_at(coord, idx_of(coord))->hash, key);
51403 +       return key;
51404 +}
51405 +
51406 +/* mergeable_cde(): implementation of ->mergeable() item method.
51407 +
51408 +   Two directory items are mergeable iff they are handled by the same item
51409 +   plugin and are from the same directory (equal directory ids in item keys).
51410 +
51411 +*/
51412 +reiser4_internal int
51413 +mergeable_cde(const coord_t * p1 /* coord of first item */ ,
51414 +             const coord_t * p2 /* coord of second item */ )
51415 +{
51416 +       reiser4_key k1;
51417 +       reiser4_key k2;
51418 +
51419 +       assert("nikita-1339", p1 != NULL);
51420 +       assert("nikita-1340", p2 != NULL);
51421 +
51422 +       return
51423 +           (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
51424 +           (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
51425 +            extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
51426 +
51427 +}
51428 +
51429 +/* ->max_key_inside() method for this item plugin: item key with ordering, objectid and offset maximized. */
51430 +reiser4_internal reiser4_key *
51431 +max_key_inside_cde(const coord_t * coord /* coord of item */ ,
51432 +                  reiser4_key * result /* resulting key */)
51433 +{
51434 +       assert("nikita-1342", coord != NULL);
51435 +
51436 +       item_key_by_coord(coord, result);       /* start from the key of the item itself */
51437 +       set_key_ordering(result, get_key_ordering(max_key()));
51438 +       set_key_objectid(result, get_key_objectid(max_key()));
51439 +       set_key_offset(result, get_key_offset(max_key()));
51440 +       return result;
51441 +}
51442 +
51443 +/* true iff item at @coord is of plugin @data->iplug and @key has the same directory id as the item key */
51444 +reiser4_internal int
51445 +can_contain_key_cde(const coord_t * coord /* coord of item */ ,
51446 +                   const reiser4_key * key /* key to check */ ,
51447 +                   const reiser4_item_data * data      /* parameters of new
51448 +                                                        * item/unit being
51449 +                                                        * created */ )
51450 +{
51451 +       reiser4_key item_key;
51452 +
51453 +       /* FIXME-VS: do not rely on anything but iplug field of @data. Only
51454 +          data->iplug is initialized */
51455 +       assert("vs-457", data && data->iplug);
51456 +/*     assert( "vs-553", data -> user == 0 );*/
51457 +       item_key_by_coord(coord, &item_key);
51458 +
51459 +       return (item_plugin_by_coord(coord) == data->iplug) &&
51460 +           (extract_dir_id_from_key(&item_key) == extract_dir_id_from_key(key));
51461 +}
51462 +
51463 +#if REISER4_DEBUG_OUTPUT
51464 +/* ->print() method for this item plugin: dump headers and entries of item; changes @coord->unit_pos. */
51465 +reiser4_internal void
51466 +print_cde(const char *prefix /* prefix to print */ ,
51467 +         coord_t * coord /* coord of item to print */ )
51468 +{
51469 +       assert("nikita-1077", prefix != NULL);
51470 +       assert("nikita-1078", coord != NULL);
51471 +
51472 +       if (item_length_by_coord(coord) < (int) sizeof (cde_item_format)) {
51473 +               printk("%s: wrong size: %i < %i\n", prefix, item_length_by_coord(coord), (int) sizeof (cde_item_format));
51474 +       } else {
51475 +               char *name;
51476 +               char *end;
51477 +               char *start;
51478 +               int i;
51479 +               oid_t dirid;
51480 +               reiser4_key key;
51481 +
51482 +               start = address(coord, 0);
51483 +               end = address(coord, item_length_by_coord(coord));
51484 +               item_key_by_coord(coord, &key);
51485 +               dirid = extract_dir_id_from_key(&key);
51486 +
51487 +               printk("%s: units: %i\n", prefix, nr_units_cde(coord));
51488 +               for (i = 0; i < units(coord); ++i) {
51489 +                       cde_unit_header *header;
51490 +
51491 +                       header = header_at(coord, i);
51492 +                       indent_znode(coord->node);
51493 +                       printk("\theader %i: ", i);
51494 +                       if ((char *) (header + 1) > end) {
51495 +                               printk("out of bounds: %p [%p, %p]\n", header, start, end);
51496 +                       } else {
51497 +                               extract_key_from_de_id(dirid, &header->hash, &key);
51498 +                               printk("%i: at %i, offset: %i, ", i, (int) (i * sizeof (*header)), d16tocpu(&header->offset));
51499 +                               print_key("key", &key);
51500 +                       }
51501 +               }
51502 +               for (i = 0; i < units(coord); ++i) {
51503 +                       directory_entry_format *entry;
51504 +                       char buf[DE_NAME_BUF_LEN];
51505 +
51506 +                       entry = entry_at(coord, i);
51507 +                       indent_znode(coord->node);
51508 +                       printk("\tentry: %i: ", i);
51509 +                       if (((char *) (entry + 1) > end) || ((char *) entry < start)) {
51510 +                               printk("out of bounds: %p [%p, %p]\n", entry, start, end);
51511 +                       } else {
51512 +                               coord->unit_pos = i;    /* extract_*_cde() below work on the current unit */
51513 +                               extract_key_cde(coord, &key);
51514 +                               name = extract_name_cde(coord, buf);
51515 +                               printk("at %i, name: %s, ", (int) ((char *) entry - start), name);
51516 +                               print_key("sdkey", &key);
51517 +                       }
51518 +               }
51519 +       }
51520 +}
51521 +#endif
51522 +
51523 +#if REISER4_DEBUG
51524 +/* check_cde(): ->check() method for compressed directory items
51525 +
51526 +   used for debugging, every item should have here the most complete
51527 +   possible check of the consistency of the item that the inventor can
51528 +   construct. Returns 0 if no problem was found, -1 with message in *@error otherwise.
51529 +*/
51530 +reiser4_internal int
51531 +check_cde(const coord_t * coord /* coord of item to check */ ,
51532 +         const char **error /* where to store error message */ )
51533 +{
51534 +       int i;
51535 +       int result;
51536 +       char *item_start;
51537 +       char *item_end;
51538 +       reiser4_key key;
51539 +
51540 +       coord_t c;      /* NOTE(review): copy of @coord made below is not used afterwards */
51541 +
51542 +       assert("nikita-1357", coord != NULL);
51543 +       assert("nikita-1358", error != NULL);
51544 +
51545 +       if (!ergo(coord->item_pos != 0,
51546 +                 is_dot_key(item_key_by_coord(coord, &key)))) {
51547 +               *error = "CDE doesn't start with dot";
51548 +               return -1;
51549 +       }
51550 +       item_start = item_body_by_coord(coord);
51551 +       item_end = item_start + item_length_by_coord(coord);
51552 +
51553 +       coord_dup(&c, coord);
51554 +       result = 0;
51555 +       for (i = 0; i < units(coord); ++i) {
51556 +               directory_entry_format *entry;
51557 +
51558 +               if ((char *) (header_at(coord, i) + 1) > item_end - units(coord) * sizeof *entry) {
51559 +                       *error = "CDE header is out of bounds";
51560 +                       result = -1;
51561 +                       break;
51562 +               }
51563 +               entry = entry_at(coord, i);
51564 +               if ((char *) entry < item_start + sizeof (cde_item_format)) {
51565 +                       *error = "CDE header is too low";       /* NOTE(review): it is the entry body that is checked */
51566 +                       result = -1;
51567 +                       break;
51568 +               }
51569 +               if ((char *) (entry + 1) > item_end) {
51570 +                       *error = "CDE header is too high";      /* NOTE(review): it is the entry body that is checked */
51571 +                       result = -1;
51572 +                       break;
51573 +               }
51574 +       }
51575 +
51576 +       return result;
51577 +}
51578 +#endif
51579 +
51580 +/* ->init() method for this item plugin: stores an entry count of zero in the item at @coord; always returns 0. */
51581 +reiser4_internal int
51582 +init_cde(coord_t * coord /* coord of item */ ,
51583 +        coord_t * from UNUSED_ARG,
51584 +        reiser4_item_data * data       /* structure used for insertion */
51585 +        UNUSED_ARG)
51586 +{
51587 +       cputod16(0u, &formatted_at(coord)->num_of_entries);
51588 +       return 0;
51589 +}
51590 +
51591 +/* ->lookup() method for this item plugin: position @coord on the unit that find() reports for @key. */
51592 +reiser4_internal lookup_result
51593 +lookup_cde(const reiser4_key * key /* key to search for */ ,
51594 +          lookup_bias bias /* search bias */ ,
51595 +          coord_t * coord /* coord of item to lookup in */ )
51596 +{
51597 +       reiser4_key item_key;
51598 +       cmp_t cmp;
51599 +       int idx;
51600 +
51601 +       assert("nikita-1293", coord != NULL);
51602 +       assert("nikita-1294", key != NULL);
51603 +
51604 +       CHECKME(coord);
51605 +
51606 +       /* the key of the item itself is already greater than @key */
51607 +       if (keygt(item_key_by_coord(coord, &item_key), key)) {
51608 +               coord->unit_pos = 0;
51609 +               coord->between = BEFORE_UNIT;
51610 +               return CBK_COORD_NOTFOUND;
51611 +       }
51612 +       idx = find(coord, key, &cmp);
51613 +       if (idx < 0) {
51614 +               /* find() gave no position: stop after the last unit */
51615 +               coord->unit_pos = units(coord) - 1;
51616 +               coord->between = AFTER_UNIT;
51617 +               if (bias == FIND_MAX_NOT_MORE_THAN)
51618 +                       return CBK_COORD_FOUND;
51619 +               return CBK_COORD_NOTFOUND;
51620 +       }
51621 +       coord->unit_pos = idx;
51622 +       if (cmp == EQUAL_TO) {
51623 +               coord->between = AT_UNIT;
51624 +               return CBK_COORD_FOUND;
51625 +       }
51626 +       if (cmp == GREATER_THAN) {
51627 +               coord->between = BEFORE_UNIT;
51628 +               return RETERR(-ENOENT);
51629 +       }
51630 +       impossible("nikita-1298", "Broken find");
51631 +       return RETERR(-EIO);
51632 +}
51633 +
51634 +/* ->paste() method for this item plugin: insert, one at a time, every entry
51635 +   listed in the cde_entry_data that @data->data points to. Returns 0 or the
51636 +   first non-zero result of expand()/paste_entry(). */
51637 +reiser4_internal int
51638 +paste_cde(coord_t * coord /* coord of item */ ,
51639 +         reiser4_item_data * data /* parameters of new units being inserted */ ,
51640 +         carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
51641 +{
51642 +       cde_entry_data *batch;
51643 +       int result = 0;
51644 +       int n;
51645 +
51646 +       CHECKME(coord);
51647 +       batch = (cde_entry_data *) data->data;
51648 +
51649 +       /* the loop ends as soon as one step reports an error */
51650 +       for (n = 0; result == 0 && n < batch->num_of_entries; ++n) {
51651 +               cde_entry *cur = batch->entry + n;
51652 +               int room = data->length;
51653 +               int pos;
51654 +
51655 +               /* while the item has no units, sizeof (cde_item_format) is taken off the size given to expand() */
51656 +               if (units(coord) == 0)
51657 +                       room -= sizeof (cde_item_format);
51658 +
51659 +               result = expand(coord, cur, room, &pos, data->arg);
51660 +               if (result == 0)
51661 +                       result = paste_entry(coord, cur, pos, data->arg);
51662 +       }
51663 +
51664 +       CHECKME(coord);
51665 +       return result;
51666 +}
51667 +
51668 +/* size in bytes of the leading part of the item: the item header, unit headers 0 .. @idx and
51669 +   the bytes between offset_of(0) and offset_of(@idx + 1). @idx == -1 yields the bare item header. */
51670 +static unsigned int
51671 +part_size(const coord_t * coord /* coord of item */ ,
51672 +         int idx /* index of unit */ )
51673 +{
51674 +       assert("nikita-1299", coord != NULL);
51675 +       assert("nikita-1300", idx < (int) units(coord));
51676 +
51677 +       return sizeof (cde_item_format) +
51678 +           (idx + 1) * sizeof (cde_unit_header) + offset_of(coord, idx + 1) - offset_of(coord, 0);
51679 +}
51680 +
51681 +/* how many, but not more than @want, units of the item at @coord fit into
51682 +   @free_space bytes. SHIFT_LEFT takes units from the head of the item,
51683 +   SHIFT_RIGHT from its tail; the longest run that fits wins. Returns the
51684 +   number of units; their size in bytes goes to *@size and counts the item
51685 +   header (cde_item_format) only when @target is NULL */
51686 +reiser4_internal int
51687 +can_shift_cde(unsigned free_space /* free space in item */ ,
51688 +             coord_t * coord /* coord of source item */ ,
51689 +             znode * target /* target node */ ,
51690 +             shift_direction pend /* shift direction */ ,
51691 +             unsigned *size /* resulting number of shifted bytes */ ,
51692 +             unsigned want /* maximal number of units to shift */ )
51693 +{
51694 +       int shift;
51695 +
51696 +       CHECKME(coord);
51697 +       /* an item cannot give away more units than it holds: clamping here keeps
51698 +          every index handed to part_size() below within [-1, units - 1] */
51699 +       if (want > (unsigned) units(coord))
51700 +               want = units(coord);
51701 +       if (want == 0) {
51702 +               *size = 0;
51703 +               return 0;
51704 +       }
51705 +
51706 +       if (pend == SHIFT_LEFT) {
51707 +               for (shift = (int) want - 1; shift >= 0; --shift) {
51708 +                       *size = part_size(coord, shift);
51709 +                       if (target != NULL)
51710 +                               *size -= sizeof (cde_item_format);
51711 +                       if (*size <= free_space)
51712 +                               break;
51713 +               }
51714 +               shift = shift + 1;
51715 +       } else {
51716 +               int total_size = item_length_by_coord(coord);
51717 +
51718 +               assert("nikita-1301", pend == SHIFT_RIGHT);
51719 +               for (shift = units(coord) - want - 1; shift < units(coord) - 1; ++shift) {
51720 +                       *size = total_size - part_size(coord, shift);
51721 +                       if (target == NULL)
51722 +                               *size += sizeof (cde_item_format);
51723 +                       if (*size <= free_space)
51724 +                               break;
51725 +               }
51726 +               shift = units(coord) - shift - 1;
51727 +       }
51728 +       /* nothing fits: do not leave a stale size behind */
51729 +       if (shift == 0)
51730 +               *size = 0;
51731 +       CHECKME(coord);
51732 +       return shift;
51733 +}
51733 +
51734 +/* ->copy_units() method for this item plugin: copy @count units of @source, starting at unit @from, into @target. */
51735 +reiser4_internal void
51736 +copy_units_cde(coord_t * target /* coord of target item */ ,
51737 +              coord_t * source /* coord of source item */ ,
51738 +              unsigned from /* starting unit */ ,
51739 +              unsigned count /* how many units to copy */ ,
51740 +              shift_direction where_is_free_space /* shift direction */ ,
51741 +              unsigned free_space /* free space in item */ )
51742 +{
51743 +       char *header_from;
51744 +       char *header_to;
51745 +
51746 +       char *entry_from;
51747 +       char *entry_to;
51748 +
51749 +       int pos_in_target;
51750 +       int data_size;
51751 +       int data_delta;
51752 +       int i;
51753 +#if REISER4_TRACE && REISER4_DEBUG_OUTPUT
51754 +       reiser4_key debug_key;
51755 +#endif
51756 +
51757 +       assert("nikita-1303", target != NULL);
51758 +       assert("nikita-1304", source != NULL);
51759 +       assert("nikita-1305", (int) from < units(source));
51760 +       assert("nikita-1307", (int) (from + count) <= units(source));
51761 +
51762 +       IF_TRACE(TRACE_DIR | TRACE_NODES, print_key("cde_copy source", item_key_by_coord(source, &debug_key)));
51763 +       IF_TRACE(TRACE_DIR | TRACE_NODES, print_key("cde_copy target", item_key_by_coord(target, &debug_key)));
51764 +
51765 +       if (where_is_free_space == SHIFT_LEFT) {
51766 +               assert("nikita-1453", from == 0);
51767 +               pos_in_target = units(target);  /* new units go after the existing ones */
51768 +       } else {
51769 +               assert("nikita-1309", (int) (from + count) == units(source));
51770 +               pos_in_target = 0;      /* new units go in front of the existing ones */
51771 +               xmemmove(item_body_by_coord(target),
51772 +                        (char *) item_body_by_coord(target) + free_space, item_length_by_coord(target) - free_space);
51773 +       }
51774 +
51775 +       CHECKME(target);
51776 +       CHECKME(source);
51777 +
51778 +       /* expand @target: data_size is the offset_of() span covered by the copied units */
51779 +       data_size = offset_of(source, (int) (from + count)) - offset_of(source, (int) from);
51780 +
51781 +       if (units(target) == 0)
51782 +               free_space -= sizeof (cde_item_format);
51783 +
51784 +       expand_item(target, pos_in_target, (int) count,
51785 +                   (int) (item_length_by_coord(target) - free_space), (unsigned) data_size);
51786 +
51787 +       /* difference between the offsets the copied units get in @target and those they had in @source */
51788 +       data_delta = offset_of(target, pos_in_target) - offset_of(source, (int) from);
51789 +
51790 +       /* copy entries: everything from entry @from up to entry @from + @count */
51791 +       entry_from = (char *) entry_at(source, (int) from);
51792 +       entry_to = (char *) entry_at(source, (int) (from + count));
51793 +       xmemmove(entry_at(target, pos_in_target), entry_from, (unsigned) (entry_to - entry_from));
51794 +
51795 +       /* copy headers: the same range of unit headers */
51796 +       header_from = (char *) header_at(source, (int) from);
51797 +       header_to = (char *) header_at(source, (int) (from + count));
51798 +       xmemmove(header_at(target, pos_in_target), header_from, (unsigned) (header_to - header_from));
51799 +
51800 +       /* update offsets of the copied units by data_delta */
51801 +       for (i = pos_in_target; i < (int) (pos_in_target + count); ++i)
51802 +               adj_offset(target, i, data_delta);
51803 +       CHECKME(target);
51804 +       CHECKME(source);
51805 +}
51806 +
51807 +/* ->cut_units() method for this item plugin: remove units @from .. @to (inclusive); returns the number of bytes freed. */
51808 +reiser4_internal int
51809 +cut_units_cde(coord_t * coord /* coord of item */ ,
51810 +             pos_in_node_t from /* start unit pos */ ,
51811 +             pos_in_node_t to /* stop unit pos */ ,
51812 +             struct carry_cut_data *cdata UNUSED_ARG, reiser4_key *smallest_removed,
51813 +             reiser4_key *new_first)
51814 +{
51815 +       char *header_from;
51816 +       char *header_to;
51817 +
51818 +       char *entry_from;
51819 +       char *entry_to;
51820 +
51821 +       int size;
51822 +       int entry_delta;
51823 +       int header_delta;
51824 +       int i;
51825 +
51826 +       unsigned count;
51827 +
51828 +       CHECKME(coord);
51829 +
51830 +       count = to - from + 1;  /* @to is inclusive */
51831 +
51832 +       assert("nikita-1454", coord != NULL);
51833 +       assert("nikita-1455", (int) (from + count) <= units(coord));
51834 +
51835 +       if (smallest_removed)   /* NOTE(review): key is read at coord->unit_pos, not explicitly at @from - confirm */
51836 +               unit_key_by_coord(coord, smallest_removed);
51837 +
51838 +       if (new_first) {
51839 +               coord_t next;
51840 +
51841 +               /* not everything is cut from item head */
51842 +               assert("vs-1527", from == 0);
51843 +               assert("vs-1528", to < units(coord) - 1);
51844 +
51845 +               coord_dup(&next, coord);
51846 +               next.unit_pos ++;       /* NOTE(review): this is coord->unit_pos + 1, not @to + 1 - confirm */
51847 +               unit_key_by_coord(&next, new_first);
51848 +       }
51849 +
51850 +       size = item_length_by_coord(coord);
51851 +       if (count == (unsigned) units(coord)) {
51852 +               return size;    /* every unit goes: report the whole item length, touch nothing */
51853 +       }
51854 +
51855 +       header_from = (char *) header_at(coord, (int) from);
51856 +       header_to = (char *) header_at(coord, (int) (from + count));
51857 +
51858 +       entry_from = (char *) entry_at(coord, (int) from);
51859 +       entry_to = (char *) entry_at(coord, (int) (from + count));
51860 +
51861 +       /* move headers: close the gap left by the removed unit headers */
51862 +       xmemmove(header_from, header_to, (unsigned) (address(coord, size) - header_to));
51863 +
51864 +       header_delta = header_to - header_from;
51865 +
51866 +       entry_from -= header_delta;     /* the entries were moved along by the memmove above */
51867 +       entry_to -= header_delta;
51868 +       size -= header_delta;
51869 +
51870 +       /* move entries: close the gap left by the removed entries */
51871 +       xmemmove(entry_from, entry_to, (unsigned) (address(coord, size) - entry_to));
51872 +
51873 +       entry_delta = entry_to - entry_from;
51874 +       size -= entry_delta;
51875 +
51876 +       /* update offsets: units before @from moved by header_delta, the surviving tail by both deltas */
51877 +
51878 +       for (i = 0; i < (int) from; ++i)
51879 +               adj_offset(coord, i, - header_delta);
51880 +
51881 +       for (i = from; i < units(coord) - (int) count; ++i)
51882 +               adj_offset(coord, i, - header_delta - entry_delta);
51883 +
51884 +       cputod16((__u16) units(coord) - count, &formatted_at(coord)->num_of_entries);
51885 +
51886 +       if (from == 0) {
51887 +               /* entries from head was removed - move remaining to right */
51888 +               xmemmove((char *) item_body_by_coord(coord) +
51889 +                        header_delta + entry_delta, item_body_by_coord(coord), (unsigned) size);
51890 +               if (REISER4_DEBUG)
51891 +                       xmemset(item_body_by_coord(coord), 0, (unsigned) header_delta + entry_delta);
51892 +       } else {
51893 +               /* freed space is already at the end of item */
51894 +               if (REISER4_DEBUG)
51895 +                       xmemset((char *) item_body_by_coord(coord) + size, 0, (unsigned) header_delta + entry_delta);
51896 +       }
51897 +
51898 +       return header_delta + entry_delta;
51899 +}
51900 +
51901 +/* kill_units counterpart of cut_units_cde(): delegates to it with no carry data; @kdata is not used. */
51902 +reiser4_internal int
51903 +kill_units_cde(coord_t * coord /* coord of item */ ,
51904 +              pos_in_node_t from /* start unit pos */ , pos_in_node_t to /* stop unit pos */ ,
51905 +              struct carry_kill_data *kdata UNUSED_ARG, reiser4_key *smallest_removed,
51906 +              reiser4_key *new_first)
51907 +{
51908 +       return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
51909 +}
51910 +
51911 +/* ->s.dir.extract_key() method for this item plugin: hand the id stored in
51912 +   the directory entry selected by @coord to extract_key_from_id(), which
51913 +   fills @key; its return value is passed back unchanged. */
51914 +reiser4_internal int
51915 +extract_key_cde(const coord_t * coord /* coord of item */ ,
51916 +               reiser4_key * key /* resulting key */ )
51917 +{
51918 +       assert("nikita-1155", coord != NULL);
51919 +       assert("nikita-1156", key != NULL);
51920 +
51921 +       /* the entry is the one at index idx_of(coord) */
51922 +       return extract_key_from_id(&entry_at(coord, idx_of(coord))->id, key);
51923 +}
51924 +
51925 +/* ->s.dir.update_key(): store the object id built from @key in the entry selected by @coord and dirty the node. */
51926 +reiser4_internal int
51927 +update_key_cde(const coord_t * coord, const reiser4_key * key, lock_handle * lh UNUSED_ARG)
51928 +{
51929 +       obj_key_id obj_id;
51930 +       int result;
51931 +
51932 +       assert("nikita-2344", coord != NULL);
51933 +       assert("nikita-2345", key != NULL);
51934 +
51935 +       result = build_obj_key_id(key, &obj_id);
51936 +       if (result == 0) {
51937 +               entry_at(coord, idx_of(coord))->id = obj_id;
51938 +               znode_make_dirty(coord->node);
51939 +       }
51940 +       /* report a failed key conversion instead of claiming success: nothing was stored in that case */
51941 +       return result;
51942 +}
51943 +
51944 +/* ->s.dir.extract_name() method for this item plugin: result of extract_dent_name()
51945 +   for the directory entry selected by @coord; @buf is passed through to it. */
51946 +reiser4_internal char *
51947 +extract_name_cde(const coord_t * coord /* coord of item */, char *buf)
51948 +{
51949 +       directory_entry_format *entry;
51950 +
51951 +       assert("nikita-1157", coord != NULL);
51952 +       entry = entry_at(coord, idx_of(coord));
51953 +       return extract_dent_name(coord, entry, buf);
51954 +}
51955 +
51956 +/* number of bytes add_entry_cde() charges against quota for @data: @data->length,
51957 +   less sizeof (cde_item_format) unless @pasting is set */
51958 +static int
51959 +cde_bytes(int pasting, const reiser4_item_data * data)
51960 +{
51961 +       if (pasting)
51962 +               return data->length;
51963 +       /* mixed int/size_t arithmetic, converted back to int on return */
51964 +       return data->length - sizeof (cde_item_format);
51965 +}
51966 +
51967 +/* ->s.dir.add_entry() method for this item plugin: add one entry for @name; 0, -EDQUOT or the insert/resize result */
51968 +reiser4_internal int
51969 +add_entry_cde(struct inode *dir /* directory object */ ,
51970 +             coord_t * coord /* coord of item */ ,
51971 +             lock_handle * lh /* lock handle for insertion */ ,
51972 +             const struct dentry *name /* name to insert */ ,
51973 +             reiser4_dir_entry_desc * dir_entry        /* parameters of new
51974 +                                                        * directory entry */ )
51975 +{
51976 +       reiser4_item_data data;
51977 +       cde_entry entry;
51978 +       cde_entry_data edata;
51979 +       int result;
51980 +
51981 +       assert("nikita-1656", coord->node == lh->node);
51982 +       assert("nikita-1657", znode_is_write_locked(coord->node));
51983 +
51984 +       edata.num_of_entries = 1;       /* a batch of exactly one entry */
51985 +       edata.entry = &entry;
51986 +
51987 +       entry.dir = dir;
51988 +       entry.obj = dir_entry->obj;
51989 +       entry.name = &name->d_name;
51990 +
51991 +       data.data = (char *) &edata;
51992 +       data.user = 0;          /* &edata is not user space */
51993 +       data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
51994 +       data.arg = dir_entry;
51995 +       assert("nikita-1302", data.iplug != NULL);
51996 +
51997 +       result = is_dot_key(&dir_entry->key);   /* until the insert below, @result is a flag, not an error code */
51998 +       data.length = estimate_cde(result ? coord : NULL, &data);
51999 +
52000 +       /* NOTE-NIKITA quota plugin? */
52001 +       if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data)))
52002 +               return RETERR(-EDQUOT);
52003 +
52004 +       if (result)     /* NOTE(review): the quota charged above is not visibly released if the call below fails - confirm */
52005 +               result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
52006 +       else
52007 +               result = resize_item(coord, &data, &dir_entry->key, lh, 0);
52008 +       return result;
52009 +}
52010 +
52011 +/* ->s.dir.rem_entry(): remove the entry @coord points at and release its quota; 0, -EIO or kill_node_content()'s error */
52012 +reiser4_internal int
52013 +rem_entry_cde(struct inode *dir /* directory of item */ ,
52014 +             const struct qstr * name,
52015 +             coord_t * coord /* coord of item */ ,
52016 +             lock_handle * lh UNUSED_ARG       /* lock handle for
52017 +                                                * removal */ ,
52018 +             reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
52019 +                                                        * directory entry
52020 +                                                        * being removed */ )
52021 +{
52022 +       coord_t shadow;
52023 +       int result;
52024 +       int length;
52025 +       ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
52026 +
52027 +       assert("nikita-2870", strlen(name->name) == name->len);
52028 +       assert("nikita-2869", !strcmp(name->name, extract_name_cde(coord, buf)));
52029 +
52030 +       length = sizeof (directory_entry_format) + sizeof (cde_unit_header);    /* bytes to give back to quota */
52031 +       if (is_longname(name->name, name->len))
52032 +               length += name->len + 1;
52033 +
52034 +       if (inode_get_bytes(dir) < length) {    /* directory accounts for fewer bytes than this one entry */
52035 +               warning("nikita-2628", "Dir is broke: %llu: %llu", get_inode_oid(dir), inode_get_bytes(dir));
52036 +               return RETERR(-EIO);
52037 +       }
52038 +
52039 +       /* The node-cutting code is supposed to take pointers to
52040 +          _different_ coords, because it will modify them without respect
52041 +          to possible aliasing. To work around this, pass a temporary copy
52042 +          of @coord as the second coord of kill_node_content().
52043 +       */
52044 +       coord_dup(&shadow, coord);
52045 +       result = kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL);
52046 +       if (result == 0) {
52047 +               /* NOTE-NIKITA quota plugin? */
52048 +               DQUOT_FREE_SPACE_NODIRTY(dir, length);
52049 +       }
52050 +       return result;
52051 +}
52052 +
52053 +/* ->s.dir.max_name_len() method for this item plugin */
52054 +reiser4_internal int
52055 +max_name_len_cde(const struct inode *dir /* directory */ )
52056 +{
52057 +       /* largest item the node plugin allows, less the entry, item and unit headers and 2 more bytes */
52058 +       return tree_by_inode(dir)->nplug->max_item_size() -
52059 +           (sizeof (directory_entry_format) + sizeof (cde_item_format) + sizeof (cde_unit_header) + 2);
52060 +}
52061 +
52062 +/* Make Linus happy.
52063 +   Local variables:
52064 +   c-indentation-style: "K&R"
52065 +   mode-name: "LC"
52066 +   c-basic-offset: 8
52067 +   tab-width: 8
52068 +   fill-column: 120
52069 +   End:
52070 +*/
52071 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/cde.h linux-2.6.8-rc3-a/fs/reiser4/plugin/item/cde.h
52072 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/cde.h        1970-01-01 03:00:00.000000000 +0300
52073 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/cde.h      2004-08-05 21:20:53.336607076 +0400
52074 @@ -0,0 +1,78 @@
52075 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52076 +
52077 +/* Compound directory item. See cde.c for description. */
52078 +
52079 +#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
52080 +#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
52081 +
52082 +#include "../../forward.h"
52083 +#include "../../kassign.h"
52084 +#include "../../dformat.h"
52085 +
52086 +#include <linux/fs.h>          /* for struct inode */
52087 +#include <linux/dcache.h>      /* for struct dentry, etc  */
52088 +
52089 +typedef struct cde_unit_header {       /* per-unit header; cde_item_format ends in an array of these */
52090 +       de_id hash;
52091 +       d16 offset;     /* presumably where the unit's entry lives within the item (cf. offset_of() in cde.c) - TODO confirm */
52092 +} cde_unit_header;
52093 +
52094 +typedef struct cde_item_format {       /* fixed header of a compound directory item */
52095 +       d16 num_of_entries;     /* number of entries in the item; written with cputod16() */
52096 +       cde_unit_header entry[0];       /* zero-length array: the unit headers follow the count directly */
52097 +} cde_item_format;
52098 +
52099 +typedef struct cde_entry {     /* one directory entry to insert, as filled in by add_entry_cde() */
52100 +       const struct inode *dir;        /* directory the entry is added to */
52101 +       const struct inode *obj;        /* copied from reiser4_dir_entry_desc.obj; presumably the object being named - confirm */
52102 +       const struct qstr *name;        /* name of the entry (add_entry_cde() points it at dentry->d_name) */
52103 +} cde_entry;
52104 +
52105 +typedef struct cde_entry_data {        /* batch of entries, handed to the item methods through reiser4_item_data.data */
52106 +       int num_of_entries;     /* number of elements in @entry */
52107 +       cde_entry *entry;       /* the entries; paste_cde() walks entry[0 .. num_of_entries - 1] */
52108 +} cde_entry_data;
52109 +
52110 +/* plugin->item.b.*: common item methods of the compound directory item */
52111 +reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
52112 +int can_contain_key_cde(const coord_t * coord, const reiser4_key * key, const reiser4_item_data *);
52113 +int mergeable_cde(const coord_t * p1, const coord_t * p2);
52114 +pos_in_node_t nr_units_cde(const coord_t * coord);
52115 +reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
52116 +int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
52117 +void print_cde(const char *prefix, coord_t * coord);
52118 +int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
52119 +lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias, coord_t * coord);
52120 +int paste_cde(coord_t * coord, reiser4_item_data * data, carry_plugin_info * info UNUSED_ARG);
52121 +int can_shift_cde(unsigned free_space, coord_t * coord,
52122 +                 znode * target, shift_direction pend, unsigned *size, unsigned want);
52123 +void copy_units_cde(coord_t * target, coord_t * source,
52124 +                   unsigned from, unsigned count, shift_direction where_is_free_space, unsigned free_space);
52125 +int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
52126 +                 struct carry_cut_data *, reiser4_key * smallest_removed, reiser4_key *new_first);
52127 +int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
52128 +                  struct carry_kill_data *, reiser4_key * smallest_removed, reiser4_key *new_first);
52129 +/* check_cde() is only defined when REISER4_DEBUG is set */
52130 +int check_cde(const coord_t * coord, const char **error);
52131 +
52132 +/* plugin->u.item.s.dir.*: directory-entry methods of the compound directory item */
52133 +int extract_key_cde(const coord_t * coord, reiser4_key * key);
52134 +int update_key_cde(const coord_t * coord, const reiser4_key * key, lock_handle * lh);
52135 +char *extract_name_cde(const coord_t * coord, char *buf);
52136 +int add_entry_cde(struct inode *dir, coord_t * coord,
52137 +                 lock_handle * lh, const struct dentry *name, reiser4_dir_entry_desc * entry);
52138 +int rem_entry_cde(struct inode *dir, const struct qstr * name, coord_t * coord, lock_handle * lh, reiser4_dir_entry_desc * entry);
52139 +int max_name_len_cde(const struct inode *dir);
52140 +
52141 +/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
52142 +#endif
52143 +
52144 +/* Make Linus happy.
52145 +   Local variables:
52146 +   c-indentation-style: "K&R"
52147 +   mode-name: "LC"
52148 +   c-basic-offset: 8
52149 +   tab-width: 8
52150 +   fill-column: 120
52151 +   End:
52152 +*/
52153 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/ctail.c linux-2.6.8-rc3-a/fs/reiser4/plugin/item/ctail.c
52154 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/ctail.c      1970-01-01 03:00:00.000000000 +0300
52155 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/ctail.c    2004-08-05 21:20:52.806718843 +0400
52156 @@ -0,0 +1,1429 @@
52157 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52158 +
52159 +/* ctails (aka "crypto tails") are items for cryptcompress objects */
52160 +
52161 +/* DESCRIPTION:
52162 +
52163 +Each cryptcompress object is stored on disk as a set of clusters sliced
52164 +into ctails.
52165 +
52166 +Internal on-disk structure:
52167 +
52168 +        HEADER   (1)  The disk cluster shift is stored here
52169 +       BODY
52170 +*/
52171 +
52172 +#include "../../forward.h"
52173 +#include "../../debug.h"
52174 +#include "../../dformat.h"
52175 +#include "../../kassign.h"
52176 +#include "../../key.h"
52177 +#include "../../coord.h"
52178 +#include "item.h"
52179 +#include "../node/node.h"
52180 +#include "../plugin.h"
52181 +#include "../object.h"
52182 +#include "../../znode.h"
52183 +#include "../../carry.h"
52184 +#include "../../tree.h"
52185 +#include "../../inode.h"
52186 +#include "../../super.h"
52187 +#include "../../context.h"
52188 +#include "../../page_cache.h"
52189 +#include "../../cluster.h"
52190 +#include "../../flush.h"
52191 +#include "../file/funcs.h"
52192 +
52193 +#include <linux/swap.h>
52194 +#include <linux/fs.h>
52195 +
52196 +/* return the body of the ctail item at @coord, viewed as a ctail_item_format */
52197 +static ctail_item_format *
52198 +ctail_formatted_at(const coord_t * coord)
52199 +{
52200 +       assert("edward-60", coord != NULL);
52201 +       return item_body_by_coord(coord);
52202 +}
52203 +
52204 +reiser4_internal __u8  /* cluster shift stored in the header of the ctail item at @coord */
52205 +cluster_shift_by_coord(const coord_t * coord)
52206 +{
52207 +       return d8tocpu(&ctail_formatted_at(coord)->cluster_shift);
52208 +}
52209 +
52210 +static unsigned long   /* offset of the item's key, shifted right by PAGE_CACHE_SHIFT (a page index) */
52211 +pg_by_coord(const coord_t * coord)
52212 +{
52213 +       reiser4_key  key;
52214 +
52215 +       return get_key_offset(item_key_by_coord(coord, &key)) >> PAGE_CACHE_SHIFT;
52216 +}
52217 +
52218 +reiser4_internal unsigned long /* page index of the item shifted right by its cluster shift (a cluster index) */
52219 +clust_by_coord(const coord_t * coord)
52220 +{
52221 +       return pg_by_coord(coord) >> cluster_shift_by_coord(coord);
52222 +}
52223 +
52224 +#define cluster_key(key, coord) !(get_key_offset(key) & ~(~0ULL << cluster_shift_by_coord(coord) << PAGE_CACHE_SHIFT)) /* true iff the key offset is cluster-aligned */
52225 +
52226 +static char *
52227 +first_unit(coord_t * coord)
52228 +{
52229 +       /* first byte after the item header; the cast to char * keeps the arithmetic off a `void *' */
52230 +       return (char *)item_body_by_coord(coord) + sizeof (ctail_item_format);
52231 +}
52232 +
52233 +/* plugin->u.item.b.max_key_inside :
52234 +   tail_max_key_inside */
52235 +
52236 +/* plugin->u.item.b.can_contain_key: 1 if data with @key, described by @data, may extend the item at @coord, else 0 */
52237 +reiser4_internal int
52238 +can_contain_key_ctail(const coord_t *coord, const reiser4_key *key, const reiser4_item_data *data)
52239 +{
52240 +       reiser4_key own;
52241 +
52242 +       /* the new data has to be handled by the same item plugin */
52243 +       if (item_plugin_by_coord(coord) != data->iplug)
52244 +               return 0;
52245 +
52246 +       /* ... and has to carry the same locality and objectid */
52247 +       item_key_by_coord(coord, &own);
52248 +       if (get_key_locality(key) != get_key_locality(&own))
52249 +               return 0;
52250 +       if (get_key_objectid(key) != get_key_objectid(&own))
52251 +               return 0;
52252 +       /* @key must directly follow the item's last unit and must not be cluster-aligned */
52253 +       return get_key_offset(&own) + nr_units_ctail(coord) == get_key_offset(key) && !cluster_key(key, coord);
52254 +}
52255 +
52256 +/* plugin->u.item.b.mergeable
52257 +   1 only if @p2 is a ctail of the same object that starts right where @p1
52258 +   ends; c-tails of different clusters are not mergeable */
52259 +reiser4_internal int
52260 +mergeable_ctail(const coord_t * p1, const coord_t * p2)
52261 +{
52262 +       reiser4_key left;
52263 +       reiser4_key right;
52264 +
52265 +       assert("edward-61", item_type_by_coord(p1) == UNIX_FILE_METADATA_ITEM_TYPE);
52266 +       assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
52267 +
52268 +       /* second item is of another type */
52269 +       if (item_id_by_coord(p2) != CTAIL_ID)
52270 +               return 0;
52271 +
52272 +       item_key_by_coord(p1, &left);
52273 +       item_key_by_coord(p2, &right);
52274 +
52275 +       /* items of different objects */
52276 +       if (get_key_locality(&left) != get_key_locality(&right))
52277 +               return 0;
52278 +       if (get_key_objectid(&left) != get_key_objectid(&right))
52279 +               return 0;
52280 +       if (get_key_type(&left) != get_key_type(&right))
52281 +               return 0;
52282 +
52283 +       /* the items must be adjacent, and the key of @p2 must not be cluster-aligned */
52284 +       return get_key_offset(&left) + nr_units_ctail(p1) == get_key_offset(&right) && !cluster_key(&right, p2);
52285 +}
52286 +
52287 +/* plugin->u.item.b.nr_units: item length minus the size of the cluster_shift header field */
52288 +reiser4_internal pos_in_node_t
52289 +nr_units_ctail(const coord_t * coord)
52290 +{
52291 +       return (item_length_by_coord(coord) - sizeof(ctail_formatted_at(coord)->cluster_shift));
52292 +}
52293 +
52294 +/* plugin->u.item.b.estimate:
52295 +   space needed to store @data->length bytes in a ctail. A NULL @coord means
52296 +   a new item is inserted, so the item header is added on top; otherwise the
52297 +   data is pasted into the item at @coord */
52298 +reiser4_internal int
52299 +estimate_ctail(const coord_t * coord /* coord of item */,
52300 +            const reiser4_item_data * data /* parameters for new item */)
52301 +{
52302 +       if (coord != NULL)
52303 +               /* paste */
52304 +               return data->length;
52305 +       /* insert */
52306 +       return (sizeof(ctail_item_format) + data->length);
52307 +}
52308 +
52309 +#if REISER4_DEBUG_OUTPUT
52310 +static unsigned        /* PAGE_CACHE_SIZE shifted left by the cluster shift of the item at @coord */
52311 +cluster_size_by_coord(const coord_t * coord)
52312 +{
52313 +       return (PAGE_CACHE_SIZE << cluster_shift_by_coord(coord));
52314 +}
52315 +
52316 +
52317 +/* ->print() method for this item plugin. Arguments are made to match their conversions: sizeof is cast for %i, the unsigned size uses %u. */
52318 +reiser4_internal void
52319 +print_ctail(const char *prefix /* prefix to print */ ,
52320 +         coord_t * coord /* coord of item to print */ )
52321 +{
52322 +       assert("edward-63", prefix != NULL);
52323 +       assert("edward-64", coord != NULL);
52324 +
52325 +       if (item_length_by_coord(coord) < (int) sizeof (ctail_item_format))
52326 +               printk("%s: wrong size: %i < %i\n", prefix, item_length_by_coord(coord), (int) sizeof (ctail_item_format));
52327 +       else
52328 +               printk("%s: disk cluster size: %u\n", prefix, cluster_size_by_coord(coord));
52329 +}
52330 +#endif
52331 +
52332 +/* ->init() method for this item plugin: store the cluster shift in the header
52333 +   of the item at @to. The shift is taken from @data->arg when @data is given
52334 +   (and @data->length is reduced by the header size), else copied from @from. */
52335 +reiser4_internal int
52336 +init_ctail(coord_t * to /* coord of item */,
52337 +          coord_t * from /* old_item */,
52338 +          reiser4_item_data * data /* structure used for insertion */)
52339 +{
52340 +       int shift; /* cpu value to convert */
52341 +
52342 +       if (!data) {
52343 +               /* no insertion data: inherit the shift of the old item */
52344 +               assert("edward-464", from != NULL);
52345 +               shift = (int)(cluster_shift_by_coord(from));
52346 +       } else {
52347 +               assert("edward-463", data->length > sizeof(ctail_item_format));
52348 +               /* the shift is read as a single byte through ->arg */
52349 +               shift = (int)(*((char *)(data->arg)));
52350 +               data->length -= sizeof(ctail_item_format);
52351 +       }
52352 +       cputod8(shift, &ctail_formatted_at(to)->cluster_shift);
52353 +       return 0;
52354 +}
52355 +
52356 +/* plugin->u.item.b.lookup:
52357 +   NULL. (we are looking only for exact keys from item headers) */
52358 +
52359 +
52360 +/* plugin->u.item.b.check */
52361 +
52362 +/* plugin->u.item.b.paste:
52363 +   copy @data->length bytes of kernel-space @data->data into the ctail at
52364 +   @coord, which has been expanded already. Only two positions are legal:
52365 +   the very beginning of a just created item and the end of an existing one. */
52366 +reiser4_internal int
52367 +paste_ctail(coord_t * coord, reiser4_item_data * data, carry_plugin_info * info UNUSED_ARG)
52368 +{
52369 +       unsigned old_nr_units;
52370 +
52371 +       assert("edward-268", data->data != NULL);
52372 +       /* copy only from kernel space */
52373 +       assert("edward-66", data->user == 0);
52374 +
52375 +       /* units the item held before it was expanded for this paste */
52376 +       old_nr_units = item_length_by_coord(coord) - sizeof(ctail_item_format) - data->length;
52377 +
52378 +       if (coord->between == AT_UNIT && coord->unit_pos == 0) {
52379 +               /* paste at the beginning when create new item */
52380 +               assert("edward-450", item_length_by_coord(coord) == data->length + sizeof(ctail_item_format));
52381 +               assert("edward-451", old_nr_units == 0);
52382 +       } else if (coord->between == AFTER_UNIT && coord->unit_pos == old_nr_units - 1) {
52383 +               /* paste at the end: step onto the first free position */
52384 +               coord->unit_pos++;
52385 +       } else {
52386 +               /* ctail items never get pasted in the middle */
52387 +               impossible("edward-453", "bad paste position");
52388 +       }
52389 +
52390 +       xmemcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
52391 +
52392 +       return 0;
52393 +}
52394 +
52395 +/* plugin->u.item.b.fast_paste */
52396 +
52397 +/* plugin->u.item.b.can_shift
52398 +   returns how many units of @source fit into @free_space, storing the number
52399 +   of bytes to move in @size. A new item (no @target) also pays for its header */
52400 +reiser4_internal int
52401 +can_shift_ctail(unsigned free_space, coord_t * source,
52402 +               znode * target, shift_direction direction UNUSED_ARG, unsigned *size, unsigned want)
52403 +{
52404 +       /* the caller must not ask for more units than the item has */
52405 +       assert("edward-68", want > 0 && want <= nr_units_ctail(source));
52406 +
52407 +       *size = min(want, free_space);
52408 +       if (target)
52409 +               /* no new item is needed: bytes and units coincide */
52410 +               return *size;
52411 +
52412 +       /* new item will be created */
52413 +       if (*size > sizeof(ctail_item_format))
52414 +               return *size - sizeof(ctail_item_format);
52415 +
52416 +       *size = 0;
52417 +       return 0;
52418 +}
52419 +
52420 +/* plugin->u.item.b.copy_units:
52421 +   copy @count bytes of @source into the already expanded item @target. When
52422 +   @target is a brand new item, @count includes its header, which is skipped. */
52423 +reiser4_internal void
52424 +copy_units_ctail(coord_t * target, coord_t * source,
52425 +               unsigned from, unsigned count, shift_direction where_is_free_space, unsigned free_space UNUSED_ARG)
52426 +{
52427 +       /* make sure that item @target is expanded already */
52428 +       assert("edward-69", (unsigned) item_length_by_coord(target) >= count);
52429 +       assert("edward-70", free_space >= count);
52430 +
52431 +       if (item_length_by_coord(target) == count) {
52432 +               /* new item has been created: exclude its header from the
52433 +                  payload, by the header size rather than a literal 1 */
52434 +               assert("edward-465", count > sizeof(ctail_item_format));
52435 +               count -= sizeof(ctail_item_format);
52436 +       }
52437 +       if (where_is_free_space == SHIFT_LEFT) {
52438 +               /* append item @target with @count first bytes of @source:
52439 +                  this restriction came from ordinary tails */
52440 +               assert("edward-71", from == 0);
52441 +
52442 +               xmemcpy(first_unit(target) + nr_units_ctail(target) - count, first_unit(source), count);
52443 +       } else {
52444 +               /* target item is moved to right already */
52445 +               reiser4_key key;
52446 +
52447 +               assert("edward-72", nr_units_ctail(source) == from + count);
52448 +               xmemcpy(first_unit(target), first_unit(source) + from, count);
52449 +
52450 +               /* new units are inserted before first unit in an item,
52451 +                  therefore, we have to update item key */
52452 +               item_key_by_coord(source, &key);
52453 +               set_key_offset(&key, get_key_offset(&key) + from);
52454 +               node_plugin_by_node(target->node)->update_item_key(target, &key, 0 /*info */);
52455 +       }
52456 +}
52457 +
52458 +/* plugin->u.item.b.create_hook */
52459 +/* plugin->u.item.b.kill_hook: when units are killed from unit 0 of an item whose
52460 +   key cluster_key() accepts, truncate the inode's page cluster up to i_size */
52461 +reiser4_internal int
52462 +kill_hook_ctail(const coord_t *coord, pos_in_node_t from, pos_in_node_t count, carry_kill_data *kdata)
52463 +{
52464 +       reiser4_key key;
52465 +       struct inode *inode;
52466 +       pgoff_t start, end;
52467 +
52468 +       assert("edward-291", znode_is_write_locked(coord->node));
52469 +       inode = kdata->inode;
52470 +       if (!inode)
52471 +               return 0;
52472 +       item_key_by_coord(coord, &key);
52473 +       if (from != 0 || !cluster_key(&key, coord))
52474 +               return 0;
52475 +       start = off_to_pg(get_key_offset(&key));
52476 +       end = off_to_pg(inode->i_size);
52477 +       truncate_cluster(inode, start, end - start + 1);
52478 +       return 0;
52479 +}
52480 +
52481 +/* helper of shift_hook_ctail():
52482 +   look up the jnode of the page the ctail at @coord maps to and report
52483 +   whether it is dirty; 0 when no such jnode exists */
52484 +static int
52485 +ctail_squeezable (const coord_t *coord)
52486 +{
52487 +       reiser4_key  key;
52488 +       jnode * child;
52489 +       int dirty = 0;
52490 +
52491 +       assert("edward-477", coord != NULL);
52492 +       assert("edward-478", item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
52493 +
52494 +       item_key_by_coord(coord, &key);
52495 +       child = jlookup(current_tree, get_key_objectid(&key), pg_by_coord(coord));
52496 +
52497 +       if (child != NULL) {
52498 +               /* the dirty state is sampled under the jnode lock */
52499 +               LOCK_JNODE(child);
52500 +               dirty = jnode_is_dirty(child) ? 1 : 0;
52501 +               UNLOCK_JNODE(child);
52502 +               /* drop the reference (presumably taken by jlookup()) */
52503 +               jput(child);
52504 +       }
52505 +
52506 +       return dirty;
52507 +}
52508 +
52509 +/* plugin->u.item.b.shift_hook:
52510 +   propagate the "squeezable" mark to the node the item was shifted into, if
52511 +   the old node had it, the new one lacks it and ctail_squeezable() agrees */
52512 +reiser4_internal int
52513 +shift_hook_ctail(const coord_t * item /* coord of item */ ,
52514 +                unsigned from UNUSED_ARG /* start unit */ ,
52515 +                unsigned count UNUSED_ARG /* stop unit */ ,
52516 +                znode * old_node /* old parent */ )
52517 +{
52518 +       assert("edward-479", item != NULL);
52519 +       assert("edward-480", item->node != old_node);
52520 +
52521 +       if (znode_squeezable(old_node) && !znode_squeezable(item->node) && ctail_squeezable(item))
52522 +               znode_set_squeezable(item->node);
52523 +       return 0;
52524 +}
52525 +
52526 +/* common body of ->cut_units() and ->kill_units(): remove units @from..@to of
52527 +   the ctail at @coord and return @count, the number of units removed. When
52528 +   @cut is 0, kill_hook_ctail() is called first with @p as its kill data. */
52529 +static int
52530 +cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, int cut,
52531 +                       void *p, reiser4_key * smallest_removed, reiser4_key *new_first)
52532 +{
52533 +       pos_in_node_t count; /* number of units to cut */
52534 +       char *item;
52535 +
52536 +       count = to - from + 1;
52537 +       item = item_body_by_coord(coord);
52538 +
52539 +       /* When we cut from the end of item - we have nothing to do */
52540 +       assert("edward-73", count < nr_units_ctail(coord));
52541 +       assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
52542 +
52543 +       if (smallest_removed) {
52544 +               /* store smallest key removed */
52545 +               item_key_by_coord(coord, smallest_removed);
52546 +               set_key_offset(smallest_removed, get_key_offset(smallest_removed) + from);
52547 +       }
52548 +       if (new_first) {
52549 +               assert("vs-1531", from == 0);
52550 +               item_key_by_coord(coord, new_first);
52551 +               set_key_offset(new_first, get_key_offset(new_first) + from + count);
52552 +       }
52553 +       if (!cut)
52554 +               kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
52555 +
52556 +       if (from == 0) {
52557 +               if (count != nr_units_ctail(coord)) {
52558 +                       /* part of item is removed, so move free space at the beginning
52559 +                          of the item and update item key */
52560 +                       reiser4_key key;
52561 +                       xmemcpy(item + to + 1, item, sizeof(ctail_item_format));
52562 +                       item_key_by_coord(coord, &key);
52563 +                       set_key_offset(&key, get_key_offset(&key) + count);
52564 +                       node_plugin_by_node(coord->node)->update_item_key(coord, &key, 0 /*info */ );
52565 +               }
52566 +               else {
52567 +                       impossible("vs-1532", "cut_units should not be called to cut evrything");
52568 +                       /* whole item is cut, so more then amount of space occupied
52569 +                          by units got freed */
52570 +                       count += sizeof(ctail_item_format);
52571 +               }
52572 +               if (REISER4_DEBUG)
52573 +                       xmemset(item, 0, count);
52574 +       }
52575 +       else if (REISER4_DEBUG)
52576 +               xmemset(item + sizeof(ctail_item_format) + from, 0, count);
52577 +       return count;
52578 +}
52579 +
52580 +/* plugin->u.item.b.cut_units:
52581 +   plain removal of units @from..@to; no kill hook is run (@cdata is unused) */
52582 +reiser4_internal int cut_units_ctail(coord_t *item, pos_in_node_t from, pos_in_node_t to,
52583 +               carry_cut_data *cdata, reiser4_key *smallest_removed, reiser4_key *new_first)
52584 +{
52585 +       return cut_or_kill_ctail_units(item, from, to, 1 /* cut */, NULL, smallest_removed, new_first);
52586 +}
52587 +
52588 +/* plugin->u.item.b.kill_units:
52589 +   removal of units @from..@to preceded by kill_hook_ctail() with @kdata */
52590 +reiser4_internal int kill_units_ctail(coord_t *item, pos_in_node_t from, pos_in_node_t to,
52591 +                struct carry_kill_data *kdata, reiser4_key *smallest_removed, reiser4_key *new_first)
52592 +{
52593 +       return cut_or_kill_ctail_units(item, from, to, 0 /* kill */, kdata, smallest_removed, new_first);
52594 +}
52595 +
52596 +/* plugin->u.item.s.file.read:
52597 +   copy the whole ctail at @hint's coord into the kernel buffer of flow @f,
52598 +   advance the flow and leave the coord before the next item */
52599 +reiser4_internal int
52600 +read_ctail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
52601 +{
52602 +       coord_t *coord = &hint->coord.base_coord;
52603 +       pos_in_node_t nr_units;
52604 +
52605 +       assert("edward-127", f->user == 0);
52606 +       assert("edward-128", f->data);
52607 +       assert("edward-129", coord && coord->node);
52608 +       assert("edward-130", coord_is_existing_unit(coord));
52609 +       assert("edward-132", znode_is_loaded(coord->node));
52610 +
52611 +       /* start read only from the beginning of ctail */
52612 +       assert("edward-133", coord->unit_pos == 0);
52613 +       /* read only whole ctails */
52614 +       assert("edward-135", nr_units_ctail(coord) <= f->length);
52615 +       assert("edward-136", schedulable());
52616 +
52617 +       nr_units = nr_units_ctail(coord);
52618 +       memcpy(f->data, (char *)first_unit(coord), (size_t)nr_units);
52619 +       mark_page_accessed(znode_page(coord->node));
52620 +       move_flow_forward(f, nr_units);
52621 +
52622 +       /* the item is consumed: point in front of its right neighbour */
52623 +       coord->item_pos ++;
52624 +       coord->between = BEFORE_ITEM;
52625 +
52626 +       return 0;
52627 +}
52628 +
52629 +/* this reads one cluster from disk and attaches a buffer with decrypted and
52630 +   decompressed data: clust->buf is allocated (at most one scaled cluster in
52631 +   size), then filled via find_cluster() and inflate_cluster() */
52632 +reiser4_internal int
52633 +ctail_read_cluster (reiser4_cluster_t * clust, struct inode * inode, int write)
52634 +{
52635 +       int result;
52636 +
52637 +       assert("edward-139", clust->buf == NULL);
52638 +       assert("edward-671", clust->hint != NULL);
52639 +       assert("edward-140", clust->stat != FAKE_CLUSTER);
52640 +       assert("edward-672", crc_inode_ok(inode));
52641 +       assert("edward-145", inode_get_flag(inode, REISER4_CLUSTER_KNOWN));
52642 +
52643 +       if (!hint_prev_cluster(clust)) {
52644 +               /* presumably the hint is not for the preceding cluster: drop it */
52645 +               done_lh(clust->hint->coord.lh);
52646 +               unset_hint(clust->hint);
52647 +       }
52648 +
52649 +       /* allocate temporary buffer of disk cluster size */
52650 +       clust->bsize = inode_scaled_offset(inode, fsize_to_count(clust, inode) +
52651 +                                          max_crypto_overhead(inode));
52652 +       if (clust->bsize > inode_scaled_cluster_size(inode))
52653 +               clust->bsize = inode_scaled_cluster_size(inode);
52654 +
52655 +       clust->buf = reiser4_kmalloc(clust->bsize, GFP_KERNEL);
52656 +       if (!clust->buf)
52657 +               return -ENOMEM;
52658 +
52659 +       result = find_cluster(clust, inode, 1 /* read */, write);
52660 +       if (!cbk_errored(result)) {
52661 +               assert("edward-673", znode_is_any_locked(clust->hint->coord.lh->node));
52662 +
52663 +               result = inflate_cluster(clust, inode);
52664 +               if (result == 0)
52665 +                       return 0;
52666 +       }
52667 +       /* lookup or inflation failed: give the cluster data back */
52668 +       put_cluster_data(clust);
52669 +       return result;
52670 +}
52671 +
52672 +/* read one locked page: make @page uptodate from the cluster handle @clust,
52673 +   reading the disk cluster first if the handle is not uptodate. Returns 0 or
52674 +   the error of ctail_read_cluster(); the page is locked again on return. */
52675 +reiser4_internal int
52676 +do_readpage_ctail(reiser4_cluster_t * clust, struct page *page)
52677 +{
52678 +       int ret;
52679 +       unsigned cloff;
52680 +       struct inode * inode;
52681 +       char * data;
52682 +       int release = 0;
52683 +       size_t pgcnt;
52684 +
52685 +       assert("edward-212", PageLocked(page));
52686 +
52687 +       if(PageUptodate(page))
52688 +               goto exit;
52689 +
52690 +       inode = page->mapping->host;
52691 +
52692 +       if (!cluster_is_uptodate(clust)) {
52693 +               clust->index = pg_to_clust(page->index, inode);
52694 +               unlock_page(page);
52695 +               ret = ctail_read_cluster(clust, inode, 0 /* do not write */);
52696 +               lock_page(page);
52697 +               if (ret)
52698 +                       return ret;
52699 +               /* cluster was uptodated here, release it before exit */
52700 +               release = 1;
52701 +       }
52702 +       if(PageUptodate(page))
52703 +               /* races with another read/write */
52704 +               goto exit;
52705 +       if (clust->stat == FAKE_CLUSTER) {
52706 +               /* fill page by zeroes */
52707 +               char *kaddr = kmap_atomic(page, KM_USER0);
52708 +
52709 +               assert("edward-119", clust->buf == NULL);
52710 +
52711 +               memset(kaddr, 0, PAGE_CACHE_SIZE);
52712 +               flush_dcache_page(page);
52713 +               kunmap_atomic(kaddr, KM_USER0);
52714 +               SetPageUptodate(page);
52715 +
52716 +               ON_TRACE(TRACE_CTAIL, " - hole, OK\n");
52717 +               return 0;
52718 +       }
52719 +       /* fill page by plain text from cluster handle */
52720 +       assert("edward-120", clust->len <= inode_cluster_size(inode));
52721 +        /* start page offset in the cluster */
52722 +       cloff = pg_to_off_to_cloff(page->index, inode);
52723 +       /* bytes in page */
52724 +       pgcnt = off_to_pgcount(inode->i_size, page->index);
52725 +       assert("edward-620", off_to_pgcount(inode->i_size, page->index) > 0);
52726 +       /* copy the page's share of the cluster buffer and zero the rest of the page */
52727 +       data = kmap(page);
52728 +       memcpy(data, clust->buf + cloff, pgcnt);
52729 +       memset(data + pgcnt, 0, (size_t)PAGE_CACHE_SIZE - pgcnt);
52730 +       kunmap(page);
52731 +       SetPageUptodate(page);
52732 + exit:
52733 +       if (release)
52734 +               put_cluster_data(clust);
52735 +       return 0;
52736 +}
52737 +
52738 +/* plugin->u.item.s.file.readpage
52739 +   @vp is the cluster handle; a file hint is loaded into an on-stack hint_t
52740 +   for the duration of the read and saved back (marked invalid) afterwards */
52741 +reiser4_internal int readpage_ctail(void * vp, struct page * page)
52742 +{
52743 +       int result;
52744 +       hint_t hint;
52745 +       lock_handle lh;
52746 +       reiser4_cluster_t * clust = vp;
52747 +
52748 +       assert("edward-114", clust != NULL);
52749 +       assert("edward-115", PageLocked(page));
52750 +       assert("edward-116", !PageUptodate(page));
52751 +       assert("edward-117", !jprivate(page) && !PagePrivate(page));
52752 +       assert("edward-118", page->mapping && page->mapping->host);
52753 +
52754 +       clust->hint = &hint;
52755 +       init_lh(&lh);
52756 +       result = load_file_hint(clust->file, &hint, &lh);
52757 +       if (result)
52758 +               return result;
52759 +       result = do_readpage_ctail(clust, page);
52760 +       hint.coord.valid = 0;
52761 +       save_file_hint(clust->file, &hint);
52762 +       done_lh(&lh);
52763 +       put_cluster_data(clust);
52764 +
52765 +       assert("edward-213", PageLocked(page));
52766 +       return result;
52767 +}
52768 +
52769 +/* read disk cluster @clust and fill all pages of its page cluster from it */
52770 +static int
52771 +ctail_read_page_cluster(reiser4_cluster_t * clust, struct inode * inode)
52772 +{
52773 +       int result;
52774 +
52775 +       assert("edward-779", clust != NULL);
52776 +       assert("edward-780", inode != NULL);
52777 +
52778 +       set_nrpages_by_inode(clust, inode);
52779 +       result = grab_cluster_pages(inode, clust);
52780 +       if (result)
52781 +               return result;
52782 +       result = ctail_read_cluster(clust, inode, 0 /* read */);
52783 +       if (result == 0) {
52784 +               int i;
52785 +
52786 +               /* stream is attached at this point */
52787 +               assert("edward-781", cluster_is_uptodate(clust));
52788 +               for (i = 0; i < clust->nr_pages; i++) {
52789 +                       struct page * page = clust->pages[i];
52790 +                       lock_page(page);
52791 +                       do_readpage_ctail(clust, page);
52792 +                       unlock_page(page);
52793 +               }
52794 +               release_cluster_buf(clust);
52795 +       }
52796 +       release_cluster_pages(clust, 0);
52797 +       return result;
52798 +}
52799 +
52800 +/* debugging check: if @pages holds more than one entry, the index of the first page must be below the next one's */
52801 +#define check_order(pages)                                                    \
52802 +assert("edward-214", ergo(!list_empty(pages) && pages->next != pages->prev,   \
52803 +       list_to_page(pages)->index < list_to_next_page(pages)->index))
52804 +/* plugin->s.file.writepage */
52805 +
52806 +/* plugin->u.item.s.file.readpages
52807 +   populate an address space with some pages, and start reads against them.
52808 +   The page array of the cluster handle is freed on every return path.
52809 +   FIXME_EDWARD: this function should return errors */
52810 +reiser4_internal void
52811 +readpages_ctail(void *vp, struct address_space *mapping, struct list_head *pages)
52812 +{
52813 +       int ret = 0;
52814 +       hint_t hint;
52815 +       lock_handle lh;
52816 +       reiser4_cluster_t clust;
52817 +       struct page *page;
52818 +       struct pagevec lru_pvec;
52819 +       struct inode * inode = mapping->host;
52820 +
52821 +       check_order(pages);
52822 +       pagevec_init(&lru_pvec, 0);
52823 +       reiser4_cluster_init(&clust);
52824 +       clust.file = vp;
52825 +       clust.hint = &hint;
52826 +       alloc_clust_pages(&clust, inode);
52827 +       init_lh(&lh);
52828 +
52829 +       ret = load_file_hint(clust.file, &hint, &lh);
52830 +       if (ret) {
52831 +               /* do not leak the page array set up by alloc_clust_pages() */
52832 +               free_clust_pages(&clust);
52833 +               return;
52834 +       }
52835 +       /* address_space-level file readahead doesn't know about
52836 +          reiser4 page clustering, so we work around this fact */
52837 +       while (!list_empty(pages)) {
52838 +               page = list_to_page(pages);
52839 +               list_del(&page->lru);
52840 +               if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
52841 +                       page_cache_release(page);
52842 +                       continue;
52843 +               }
52844 +               if (PageUptodate(page)) {
52845 +                       unlock_page(page);
52846 +                       continue;
52847 +               }
52848 +               unlock_page(page);
52849 +               clust.index = pg_to_clust(page->index, inode);
52850 +               ret = ctail_read_page_cluster(&clust, inode);
52851 +               if (ret)
52852 +                       goto exit;
52853 +               assert("edward-782", !cluster_is_uptodate(&clust));
52854 +
52855 +               lock_page(page);
52856 +               ret = do_readpage_ctail(&clust, page);
52857 +               if (!pagevec_add(&lru_pvec, page))
52858 +                       __pagevec_lru_add(&lru_pvec);
52859 +               if (ret) {
52860 +                       warning("edward-215", "do_readpage_ctail failed");
52861 +                       unlock_page(page);
52862 +               exit:
52863 +                       while (!list_empty(pages)) {
52864 +                               struct page *victim;
52865 +
52866 +                               victim = list_to_page(pages);
52867 +                               list_del(&victim->lru);
52868 +                               page_cache_release(victim);
52869 +                       }
52870 +                       break;
52871 +               }
52872 +               unlock_page(page);
52873 +       }
52874 +       assert("edward-783", !cluster_is_uptodate(&clust));
52875 +       hint.coord.valid = 0;
52876 +       save_file_hint(clust.file, &hint);
52877 +
52878 +       done_lh(&lh);
52879 +       /* free array */
52880 +       free_clust_pages(&clust);
52881 +       put_cluster_data(&clust);
52882 +       pagevec_lru_add(&lru_pvec);
52883 +       return;
52884 +}
52885 +
52886 +/*
52887 +   plugin->u.item.s.file.append_key: always returns NULL; @coord and @key are not used
52888 +*/
52889 +reiser4_internal reiser4_key *
52890 +append_key_ctail(const coord_t *coord, reiser4_key *key)
52891 +{
52892 +       return NULL;
52893 +}
52894 +
52895 +/* store in @key (and return) the key of the first item of the cluster following the one @coord is in */
52896 +reiser4_internal reiser4_key *
52897 +append_cluster_key_ctail(const coord_t *coord, reiser4_key *key)
52898 +{
52899 +       item_key_by_coord(coord, key);
52900 +       set_key_offset(key, ((__u64)(clust_by_coord(coord)) + 1) << cluster_shift_by_coord(coord) << PAGE_CACHE_SHIFT);
52901 +       return key;
52902 +}
52903 +
52904 +/* post a COP_INSERT_FLOW carry operation which inserts flow @f as ctail
52905 +   item(s) at @coord, tracking node changes in @lh. The carry pool is released
52906 +   on every return path, including a failed post_carry(). */
52907 +static int
52908 +insert_crc_flow(coord_t * coord, lock_handle * lh, flow_t * f, struct inode * inode)
52909 +{
52910 +       int result;
52911 +       carry_pool pool;
52912 +       carry_level lowest_level;
52913 +       carry_op *op;
52914 +       reiser4_item_data data;
52915 +       __u8 cluster_shift = inode_cluster_shift(inode);
52916 +
52917 +       init_carry_pool(&pool);
52918 +       init_carry_level(&lowest_level, &pool);
52919 +       assert("edward-466", coord->between == AFTER_ITEM || coord->between == AFTER_UNIT ||
52920 +              coord->between == BEFORE_ITEM);
52921 +       if (coord->between == AFTER_UNIT) {
52922 +               coord->unit_pos = 0;
52923 +               coord->between = AFTER_ITEM;
52924 +       }
52925 +       op = post_carry(&lowest_level, COP_INSERT_FLOW, coord->node, 0 /* operate directly on coord -> node */ );
52926 +       if (IS_ERR(op) || (op == NULL)) {
52927 +               /* the pool was initialized above: do not leak it */
52928 +               done_carry_pool(&pool);
52929 +               return RETERR(op ? PTR_ERR(op) : -EIO);
52930 +       }
52931 +       data.user = 0;
52932 +       data.iplug = item_plugin_by_id(CTAIL_ID);
52933 +       data.arg = &cluster_shift;
52934 +       data.length = 0;
52935 +       data.data = 0;
52936 +
52937 +       op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT;
52938 +       op->u.insert_flow.insert_point = coord;
52939 +       op->u.insert_flow.flow = f;
52940 +       op->u.insert_flow.data = &data;
52941 +       op->u.insert_flow.new_nodes = 0;
52942 +       lowest_level.track_type = CARRY_TRACK_CHANGE;
52943 +       lowest_level.tracked = lh;
52944 +       ON_STATS(lowest_level.level_no = znode_get_level(coord->node));
52945 +       result = carry(&lowest_level, 0);
52946 +       done_carry_pool(&pool);
52947 +       return result;
52948 +}
52949 +
52950 +/* insert flow @f as new ctail item(s) next to @coord without disturbing the
52951 +   caller's coord and lock handle: the insertion works on private copies */
52952 +static int
52953 +insert_crc_flow_in_place(coord_t * coord, lock_handle * lh, flow_t * f, struct inode * inode)
52954 +{
52955 +       coord_t point;
52956 +       lock_handle lh_copy;
52957 +       int result;
52958 +
52959 +       assert("edward-674", f->length <= inode_scaled_cluster_size(inode));
52960 +       assert("edward-484", coord->between == AT_UNIT ||
52961 +              coord->between == AFTER_UNIT || coord->between == AFTER_ITEM);
52962 +
52963 +       coord_dup (&point, coord);
52964 +       if (coord->between == AT_UNIT) {
52965 +               /* insert after the preceding item, which must be a ctail */
52966 +               coord_prev_item(&point);
52967 +               assert("edward-485", item_plugin_by_coord(&point) == item_plugin_by_id(CTAIL_ID));
52968 +               point.between = AFTER_ITEM;
52969 +       }
52970 +
52971 +       init_lh (&lh_copy);
52972 +       copy_lh(&lh_copy, lh);
52973 +       result = insert_crc_flow(&point, &lh_copy, f, inode);
52974 +       done_lh(&lh_copy);
52975 +
52976 +       return result;
52977 +}
52978 +
52979 +/* overwrite the ctail at @coord, or its head, with data of flow @f: as many
52980 +   bytes as both the item and the flow hold. Advances @f and coord->unit_pos */
52981 +static int
52982 +overwrite_ctail(coord_t * coord, flow_t * f)
52983 +{
52984 +       unsigned nbytes;
52985 +
52986 +       assert("edward-269", f->user == 0);
52987 +       assert("edward-270", f->data != NULL);
52988 +       assert("edward-271", f->length > 0);
52989 +       assert("edward-272", coord_is_existing_unit(coord));
52990 +       assert("edward-273", coord->unit_pos == 0);
52991 +       assert("edward-274", znode_is_write_locked(coord->node));
52992 +       assert("edward-275", schedulable());
52993 +       assert("edward-467", item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
52994 +
52995 +       nbytes = nr_units_ctail(coord);
52996 +       if (f->length < nbytes)
52997 +               nbytes = f->length;
52998 +       xmemcpy(first_unit(coord), f->data, nbytes);
52999 +       move_flow_forward(f, nbytes);
53000 +       coord->unit_pos += nbytes;
53001 +       return 0;
53002 +}
53003 +
53004 +/* cut the ctail at @coord from coord->unit_pos up to its last unit; a
53005 +   position just past the last unit means there is nothing to remove */
53006 +static int
53007 +cut_ctail(coord_t * coord)
53008 +{
53009 +       coord_t last;
53010 +
53011 +       assert("edward-435", coord->between == AT_UNIT &&
53012 +              coord->item_pos < coord_num_items(coord) &&
53013 +              coord->unit_pos <= coord_num_units(coord));
53014 +
53015 +       if (coord->unit_pos != coord_num_units(coord)) {
53016 +               coord_dup(&last, coord);
53017 +               last.unit_pos = coord_last_unit_pos(coord);
53018 +               return cut_node_content(coord, &last, NULL, NULL, NULL);
53019 +       }
53020 +       /* nothing to cut */
53021 +       return 0;
53022 +}
53023 +
53024 +#define UNPREPPED_DCLUSTER_LEN 2
53025 +
53026 +/* insert minimal disk cluster for unprepped page cluster: a zero-filled flow
53027 +   of UNPREPPED_DCLUSTER_LEN bytes at the cluster's offset, after which the
53028 +   node at the hint's coord is marked squeezable */
53029 +int ctail_make_unprepped_cluster(reiser4_cluster_t * clust, struct inode * inode)
53030 +{
53031 +       char buf[UNPREPPED_DCLUSTER_LEN];
53032 +       flow_t f;
53033 +       int result;
53034 +       uf_coord_t *uf_coord = &clust->hint->coord;
53035 +
53036 +       assert("edward-675", get_current_context()->grabbed_blocks == 0);
53037 +       assert("edward-676", znode_is_wlocked(clust->hint->coord.lh->node));
53038 +
53039 +       grab_space_enable();
53040 +       result = reiser4_grab_space(estimate_insert_cluster(inode, 1 /*unprepped */), 0);
53041 +       if (result)
53042 +               return result;
53043 +       xmemset(buf, 0, UNPREPPED_DCLUSTER_LEN);
53044 +       flow_by_inode_cryptcompress(inode, buf, 0 /* kernel space */, UNPREPPED_DCLUSTER_LEN, clust_to_off(clust->index, inode), WRITE_OP, &f);
53045 +       result = insert_crc_flow(&uf_coord->base_coord, uf_coord->lh, &f, inode);
53046 +       all_grabbed2free();
53047 +       if (result)
53048 +               return result;
53049 +
53050 +       assert("edward-677", reiser4_clustered_blocks(reiser4_get_current_sb()));
53051 +       assert("edward-678", znode_is_dirty(clust->hint->coord.base_coord.node));
53052 +       znode_set_squeezable(uf_coord->base_coord.node);
53053 +       return 0;
53054 +}
53055 +/* ctail member of the item squeeze data reachable from flush position @pos */
53056 +static ctail_squeeze_info_t * ctail_squeeze_data(flush_pos_t * pos)
53057 +{
53058 +       return &pos->sq->itm->u.ctail_info;
53059 +}
53060 +
53061 +/* the following functions are used by flush item methods */
53062 +/* plugin->u.item.s.file.write ?
53063 +   apply write @mode to the ctail at pos->coord, using the flow kept in the squeeze info */
53064 +reiser4_internal int
53065 +write_ctail(flush_pos_t * pos, crc_write_mode_t mode)
53066 +{
53067 +       int result;
53068 +       ctail_squeeze_info_t * info;
53069 +
53070 +       assert("edward-468", pos != NULL);
53071 +       assert("edward-469", pos->sq != NULL);
53072 +       assert("edward-845", item_squeeze_data(pos) != NULL);
53073 +       info = ctail_squeeze_data(pos);
53074 +       switch (mode) {
53075 +       case CRC_FIRST_ITEM:
53076 +       case CRC_APPEND_ITEM:
53077 +               assert("edward-679", info->flow.data != NULL);
53078 +               result = insert_crc_flow_in_place(&pos->coord, &pos->lock, &info->flow, info->inode);
53079 +               break;
53080 +       case CRC_OVERWRITE_ITEM:
53081 +               overwrite_ctail(&pos->coord, &info->flow);
53082 +               /* fall through: cut what remains of the item behind the overwritten part */
53083 +       case CRC_CUT_ITEM:
53084 +               result = cut_ctail(&pos->coord);
53085 +               break;
53086 +       default:
53087 +               result = RETERR(-EIO);
53088 +               impossible("edward-244", "wrong ctail write mode");
53089 +       }
53090 +       return result;
53091 +}
53092 +
53093 +/* item plugin for a cluster-page jnode: always the ctail plugin */
53094 +reiser4_internal item_plugin * item_plugin_by_jnode(jnode * node)
53095 +{
53096 +       assert("edward-302", jnode_is_cluster_page(node));
53097 +       return item_plugin_by_id(CTAIL_ID);
53098 +}
53099 +
53100 +/* look up the jnode of the first page of the cluster after @clust (@node is unused) */
53101 +static jnode * next_jnode_cluster(jnode * node, struct inode *inode, reiser4_cluster_t * clust)
53102 +{
53103 +       return jlookup(tree_by_inode(inode), get_inode_oid(inode), clust_to_pg(clust->index + 1, inode));
53104 +}
53105 +
53106 +/* plugin->u.item.f.scan */
53107 +/* Check if the cluster node we started from is not presented by any items
53108 +   in the tree. If so, create the link by inserting processed cluster into
53109 +   the tree. Don't care about scan counter since leftward scanning will be
53110 +   continued from rightmost dirty node.
53111 +*/
53112 +reiser4_internal int scan_ctail(flush_scan * scan)
53113 +{
53114 +       int result;
53115 +       struct page * page;
53116 +       struct inode * inode;
53117 +       reiser4_cluster_t clust;
53118 +       flow_t f;
53119 +       jnode * node = scan->node;
53120 +       file_plugin * fplug;
53121 +
53122 +       reiser4_cluster_init(&clust);
53123 +
53124 +       assert("edward-227", scan->node != NULL);
53125 +       assert("edward-228", jnode_is_cluster_page(scan->node));
53126 +       assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
53127 +
53128 +       if (!znode_squeezable(scan->parent_lock.node)) {
53129 +               assert("edward-680", !jnode_is_dirty(scan->node));
53130 +               warning("edward-681", "cluster page is already processed");
53131 +               return -EAGAIN;
53132 +       }
53133 +
53134 +       if (get_flush_scan_nstat(scan) == LINKED) {
53135 +               /* nothing to do */
53136 +               return 0;
53137 +       }
53138 +
53139 +       jref(node);
53140 +
53141 +       do {
53142 +               LOCK_JNODE(node);
53143 +               if (!(jnode_is_dirty(node) &&
53144 +                     (node->atom == ZJNODE(scan->parent_lock.node)->atom) &&
53145 +                     JF_ISSET(node, JNODE_NEW))) {
53146 +                        /* don't touch! */
53147 +                       UNLOCK_JNODE(node);
53148 +                       jput(node);
53149 +                       break;
53150 +               }
53151 +               UNLOCK_JNODE(node);
53152 +
53153 +               reiser4_cluster_init(&clust);
53154 +
53155 +               page = jnode_page(node);
53156 +
53157 +               assert("edward-229", page->mapping != NULL);
53158 +               assert("edward-230", page->mapping != NULL);
53159 +               assert("edward-231", page->mapping->host != NULL);
53160 +
53161 +               inode = page->mapping->host;
53162 +               fplug = inode_file_plugin(inode);
53163 +
53164 +               assert("edward-244", fplug == file_plugin_by_id(CRC_FILE_PLUGIN_ID));
53165 +               assert("edward-232", inode_get_flag(inode, REISER4_CLUSTER_KNOWN));
53166 +               assert("edward-233", scan->direction == LEFT_SIDE);
53167 +
53168 +               clust.index = pg_to_clust(page->index, inode);
53169 +
53170 +               /* remove jnode cluster from dirty list */
53171 +               result = flush_cluster_pages(&clust, inode);
53172 +               if (result)
53173 +                       return result;
53174 +               result = deflate_cluster(NULL, &clust, inode);
53175 +               if (result)
53176 +                       goto error;
53177 +
53178 +               assert("edward-633", clust.len != 0);
53179 +
53180 +               fplug->flow_by_inode(inode, clust.buf, 0, clust.len, clust_to_off(clust.index, inode), WRITE, &f);
53181 +               /* insert processed data */
53182 +               result = insert_crc_flow(&scan->parent_coord, /* insert point */
53183 +                                        &scan->parent_lock, &f, inode);
53184 +               if (result)
53185 +                       goto error;
53186 +               assert("edward-234", f.length == 0);
53187 +               JF_CLR(node, JNODE_NEW);
53188 +               release_cluster_buf(&clust);
53189 +               jput(node);
53190 +       }
53191 +       while ((node = next_jnode_cluster(node, inode, &clust)));
53192 +
53193 +       /* now the child is linked to its parent,
53194 +          set appropriate status */
53195 +       set_flush_scan_nstat(scan, LINKED);
53196 +       return 0;
53197 + error:
53198 +       release_cluster_buf(&clust);
53199 +       return result;
53200 +}
53201 +
53202 +/* Nonzero iff the leftmost child exists, is dirty and sits in the atom of the parent node */
53203 +static int
53204 +should_attach_squeeze_idata(flush_pos_t * pos)
53205 +{
53206 +       int attach;
53207 +       assert("edward-431", pos != NULL);
53208 +       assert("edward-432", pos->child == NULL);
53209 +       assert("edward-619", znode_is_write_locked(pos->coord.node));
53210 +       assert("edward-470", item_plugin_by_coord(&pos->coord) == item_plugin_by_id(CTAIL_ID));
53211 +
53212 +       /* fetch the leftmost child (if any) into pos->child */
53213 +       utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
53214 +
53215 +       if (pos->child == NULL)
53216 +               return 0;
53217 +       LOCK_JNODE(pos->child);
53218 +       attach = jnode_is_dirty(pos->child) &&
53219 +               pos->child->atom == ZJNODE(pos->coord.node)->atom;
53220 +       UNLOCK_JNODE(pos->child);
53221 +       if (attach)
53222 +               return attach;
53223 +       /* the child found is not to be attached: put it and forget it */
53224 +       jput(pos->child);
53225 +       pos->child = NULL;
53226 +       return 0;
53227 +}
53228 +/* allocate the zero-filled transform-info array of @sq; returns 0 or -ENOMEM */
53229 +static int
53230 +alloc_squeeze_tfm_data(squeeze_info_t * sq)
53231 +{
53232 +       assert("edward-808", sq != NULL);
53233 +       assert("edward-809", sq->tfm == NULL);
53234 +       sq->tfm = reiser4_kmalloc(SQUEEZE_TFM_INFO_SIZE, GFP_KERNEL);
53235 +       if (sq->tfm != NULL) {
53236 +               xmemset(sq->tfm, 0, SQUEEZE_TFM_INFO_SIZE);
53237 +               return 0;
53238 +       }
53239 +       return -ENOMEM;
53240 +}
53241 +/* free each allocated per-compression transform slot of @sq, then the slot array itself */
53242 +static void
53243 +free_squeeze_tfm_data(squeeze_info_t * sq)
53244 +{
53245 +       reiser4_compression_id id;
53246 +       compression_plugin * cplug;
53247 +
53248 +       assert("edward-810", sq != NULL);
53249 +       assert("edward-811", sq->tfm != NULL);
53250 +
53251 +       for (id = 0; id < LAST_COMPRESSION_ID; id++) {
53252 +               if (sq->tfm[id]) {
53253 +                       /* slot in use: released through the free method of its plugin */
53254 +                       cplug = compression_plugin_by_id(id);
53255 +                       assert("edward-812", cplug->free != NULL);
53256 +                       cplug->free(&sq->tfm[id], TFM_WRITE);
53257 +               }
53258 +       }
53259 +       reiser4_kfree(sq->tfm);
53260 +       sq->tfm = NULL;
53261 +}
53262 +
53263 +/* plugin->init_squeeze_data(): allocate and init the cluster descriptor of @idata; returns 0 or -ENOMEM */
53264 +static int
53265 +init_squeeze_data_ctail(squeeze_item_info_t * idata, struct inode * inode)
53266 +{
53267 +       ctail_squeeze_info_t * info;
53268 +       assert("edward-813", idata != NULL);
53269 +       assert("edward-814", inode != NULL);
53270 +
53271 +       info = &idata->u.ctail_info;
53272 +       info->clust = reiser4_kmalloc(sizeof(*info->clust), GFP_KERNEL);
53273 +       if (!info->clust)
53274 +               return -ENOMEM;
53275 +
53276 +       reiser4_cluster_init(info->clust);
53277 +       info->inode = inode; /* pointer is stored as is; no reference is taken in this function */
53278 +
53279 +       return 0;
53280 +}
53281 +
53282 +/* plugin->free_squeeze_data(): release the cluster buffer and the cluster descriptor, if there is one */
53283 +static void
53284 +free_squeeze_data_ctail(squeeze_item_info_t * idata)
53285 +{
53286 +       reiser4_cluster_t * clust;
53287 +
53288 +       assert("edward-815", idata != NULL);
53289 +
53290 +       clust = idata->u.ctail_info.clust;
53291 +       if (clust == NULL)
53292 +               return;
53293 +       release_cluster_buf(clust);
53294 +       reiser4_kfree(clust);
53295 +}
53296 +/* allocate sq->itm; returns 0 or -ENOMEM. No xmemset here: init_item_squeeze_data() clears the structure */
53297 +static int
53298 +alloc_item_squeeze_data(squeeze_info_t * sq)
53299 +{
53300 +       assert("edward-816", sq != NULL);
53301 +       assert("edward-817", sq->itm == NULL);
53302 +
53303 +       sq->itm = reiser4_kmalloc(sizeof(*sq->itm), GFP_KERNEL);
53304 +       if (sq->itm == NULL)
53305 +               return -ENOMEM;
53306 +       return 0;
53307 +}
53308 +/* release the ctail-specific contents of sq->itm, then sq->itm itself, and reset the pointer */
53309 +static void
53310 +free_item_squeeze_data(squeeze_info_t * sq)
53311 +{
53312 +       assert("edward-818", sq != NULL);
53313 +       assert("edward-819", sq->itm != NULL);
53314 +       assert("edward-820", sq->iplug != NULL);
53315 +
53316 +       /* stands for iplug->free(sq->idata): the ctail method is called directly */
53317 +       free_squeeze_data_ctail(sq->itm);
53318 +
53319 +       reiser4_kfree(sq->itm);
53320 +       sq->itm = NULL;
53321 +       return;
53322 +}
53323 +/* allocate a zero-filled squeeze info and hang it on pos->sq; returns 0 or -ENOMEM */
53324 +static int
53325 +alloc_squeeze_data(flush_pos_t * pos)
53326 +{
53327 +       assert("edward-821", pos != NULL);
53328 +       assert("edward-822", pos->sq == NULL);
53329 +
53330 +       pos->sq = reiser4_kmalloc(sizeof(*pos->sq), GFP_KERNEL);
53331 +       if (!pos->sq)
53332 +               return -ENOMEM;
53333 +       xmemset(pos->sq, 0, sizeof(*pos->sq)); /* so that sq->tfm and sq->itm start out as NULL */
53334 +       return 0;
53335 +}
53336 +/* tear down everything hanging off pos->sq (transform data, item data), free it and reset pos->sq */
53337 +reiser4_internal void
53338 +free_squeeze_data(flush_pos_t * pos)
53339 +{
53340 +       assert("edward-823", pos != NULL);
53341 +       assert("edward-824", pos->sq != NULL);
53342 +
53343 +       /* transform slots first ... */
53344 +       if (pos->sq->tfm != NULL)
53345 +               free_squeeze_tfm_data(pos->sq);
53346 +       /* ... then the item-specific part */
53347 +       if (pos->sq->itm != NULL)
53348 +               free_item_squeeze_data(pos->sq);
53349 +
53350 +       /* finally the container itself */
53351 +       reiser4_kfree(pos->sq);
53352 +       pos->sq = NULL;
53353 +}
53354 +/* zero the already allocated pos->sq->itm and run the ctail initializer on it; returns 0 or -ENOMEM */
53355 +static int
53356 +init_item_squeeze_data(flush_pos_t * pos, struct inode * inode)
53357 +{
53358 +       squeeze_info_t * sq;
53359 +
53360 +       assert("edward-825", pos != NULL);
53361 +       assert("edward-826", pos->sq != NULL);
53362 +       assert("edward-827", item_squeeze_data(pos) != NULL);
53363 +       assert("edward-828", inode != NULL);
53364 +
53365 +       sq = pos->sq;
53366 +
53367 +       xmemset(sq->itm, 0, sizeof(*sq->itm)); /* leaves info->clust NULL if the allocation below fails */
53368 +
53369 +       /* stands for iplug->init_squeeze_data(): the ctail method is called directly */
53370 +       return init_squeeze_data_ctail(sq->itm, inode);
53371 +}
53372 +
53373 +/* create disk cluster info used by 'squeeze' phase of the flush squalloc() */
53374 +static int
53375 +attach_squeeze_idata(flush_pos_t * pos, struct inode * inode)
53376 +{
53377 +       int ret = 0;
53378 +       ctail_squeeze_info_t * info;
53379 +       reiser4_cluster_t *clust;
53380 +       file_plugin * fplug = inode_file_plugin(inode);
53381 +       compression_plugin * cplug = inode_compression_plugin(inode);
53382 +
53383 +       assert("edward-248", pos != NULL);
53384 +       assert("edward-249", pos->child != NULL);
53385 +       assert("edward-251", inode != NULL);
53386 +       assert("edward-682", crc_inode_ok(inode));
53387 +       assert("edward-252", fplug == file_plugin_by_id(CRC_FILE_PLUGIN_ID));
53388 +       assert("edward-473", item_plugin_by_coord(&pos->coord) == item_plugin_by_id(CTAIL_ID));
53389 +
53390 +       if (!pos->sq) {
53391 +               ret = alloc_squeeze_data(pos);
53392 +               if (ret)
53393 +                       return ret;
53394 +       }
53395 +       if (!tfm_squeeze_data(pos) && cplug->alloc != NULL) {
53396 +               ret = alloc_squeeze_tfm_data(pos->sq);
53397 +               if (ret)
53398 +                       goto exit;
53399 +       }
53400 +       if (cplug->alloc != NULL && *tfm_squeeze_idx(pos, cplug->h.id) == NULL) {
53401 +               ret = cplug->alloc(tfm_squeeze_idx(pos, cplug->h.id), TFM_WRITE);
53402 +               if (ret)
53403 +                       goto exit;
53404 +       }
53405 +       assert("edward-829", pos->sq != NULL);
53406 +       assert("edward-250", item_squeeze_data(pos) == NULL);
53407 +
53408 +       pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
53409 +
53410 +       ret = alloc_item_squeeze_data(pos->sq);
53411 +       if (ret)
53412 +               goto exit;
53413 +       ret = init_item_squeeze_data(pos, inode);
53414 +       if (ret)
53415 +               goto exit;
53416 +       info = ctail_squeeze_data(pos);
53417 +       clust = info->clust;
53418 +       clust->index = pg_to_clust(jnode_page(pos->child)->index, inode);
53419 +
53420 +       ret = flush_cluster_pages(clust, inode);
53421 +       if (ret)
53422 +               goto exit;
53423 +
53424 +       assert("edward-830", ergo(!tfm_squeeze_pos(pos, cplug->h.id), !cplug->alloc));
53425 +
53426 +       ret = deflate_cluster(tfm_squeeze_pos(pos, cplug->h.id), clust, inode);
53427 +       if (ret)
53428 +               goto exit;
53429 +
53430 +       inc_item_squeeze_count(pos);
53431 +
53432 +       /* make flow by transformed stream */
53433 +       fplug->flow_by_inode(info->inode, clust->buf, 0/* kernel space */,
53434 +                            clust->len, clust_to_off(clust->index, inode),
53435 +                            WRITE_OP, &info->flow);
53436 +       jput(pos->child);
53437 +
53438 +       assert("edward-683", crc_inode_ok(inode));
53439 +       return 0;
53440 + exit:
53441 +       jput(pos->child);
53442 +       free_squeeze_data(pos);
53443 +       return ret;
53444 +}
53445 +
53446 +/* clear up disk cluster info: decrement the inode's i_count and free the item part of @sq */
53447 +static void
53448 +detach_squeeze_idata(squeeze_info_t * sq)
53449 +{
53450 +       squeeze_item_info_t * idata;
53451 +       ctail_squeeze_info_t * info;
53452 +       struct inode * inode;
53453 +
53454 +       assert("edward-253", sq != NULL);
53455 +       assert("edward-840", sq->itm != NULL);
53456 +
53457 +       idata = sq->itm;
53458 +       info = &idata->u.ctail_info;
53459 +
53460 +       assert("edward-254", info->clust != NULL);
53461 +       assert("edward-255", info->inode != NULL);
53462 +
53463 +       inode = info->inode;
53464 +
53465 +       assert("edward-841", atomic_read(&inode->i_count));
53466 +       assert("edward-256", info->clust->buf != NULL);
53467 +       /* NOTE(review): the increment matching this decrement is not in the attach path of this file - confirm */
53468 +       atomic_dec(&inode->i_count);
53469 +
53470 +       free_item_squeeze_data(sq); /* frees the cluster buffer, the descriptor and sq->itm */
53471 +       return;
53472 +}
53473 +
53474 +/* plugin->u.item.f.utmost_child */
53475 +
53476 +/* This function sets leftmost child for a first cluster item,
53477 +   if the child exists, and NULL in other cases. Always returns 0.
53478 +   NOTE-EDWARD: Do not call this for RIGHT_SIDE */
53479 +
53480 +reiser4_internal int
53481 +utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
53482 +{
53483 +       reiser4_key key;
53484 +
53485 +       /* check the arguments before @coord is dereferenced to read the key */
53486 +       assert("edward-257", coord != NULL);
53487 +       assert("edward-258", child != NULL);
53488 +       assert("edward-259", side == LEFT_SIDE);
53489 +       assert("edward-260", item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
53490 +
53491 +       item_key_by_coord(coord, &key);
53492 +       if (!cluster_key(&key, coord))
53493 +               *child = NULL;
53494 +       else
53495 +               *child = jlookup(current_tree, get_key_objectid(&key), pg_by_coord(coord));
53496 +       return 0;
53497 +}
53498 +
53499 +/* plugin->u.item.f.squeeze */
53500 +/* write ctail in guessed mode; returns 0, -E_REPEAT once squeeze info was detached, or an error code */
53501 +reiser4_internal int
53502 +squeeze_ctail(flush_pos_t * pos)
53503 +{
53504 +       int result;
53505 +       ctail_squeeze_info_t * info = NULL;
53506 +       crc_write_mode_t mode = CRC_OVERWRITE_ITEM; /* mode used right after squeeze info is attached */
53507 +
53508 +       assert("edward-261", pos != NULL);
53509 +
53510 +       if (!pos->sq || !item_squeeze_data(pos)) {
53511 +               if (should_attach_squeeze_idata(pos)) {
53512 +                       /* attach squeeze item info */
53513 +                       struct inode * inode;
53514 +
53515 +                       assert("edward-264", pos->child != NULL);
53516 +                       assert("edward-265", jnode_page(pos->child) != NULL);
53517 +                       assert("edward-266", jnode_page(pos->child)->mapping != NULL);
53518 +
53519 +                       inode = jnode_page(pos->child)->mapping->host;
53520 +
53521 +                       assert("edward-267", inode != NULL);
53522 +
53523 +                       /* attach item squeeze info by child and put the last one */
53524 +                       result = attach_squeeze_idata(pos, inode);
53525 +                       pos->child = NULL; /* the reference was put inside attach_squeeze_idata() */
53526 +                       if (result != 0)
53527 +                               return result;
53528 +                       info = ctail_squeeze_data(pos);
53529 +               }
53530 +               else
53531 +                       /* unsqueezable */
53532 +                       return 0;
53533 +       }
53534 +       else {
53535 +               /* use old squeeze info */
53536 +
53537 +               squeeze_item_info_t * idata = item_squeeze_data(pos);
53538 +               info = ctail_squeeze_data(pos);
53539 +
53540 +               if (info->flow.length) {
53541 +                       /* flow still has data: overwrite a mergeable item, otherwise append */
53542 +                       if (idata->mergeable) {
53543 +                               mode = CRC_OVERWRITE_ITEM;
53544 +                               idata->mergeable = 0;
53545 +                       }
53546 +                       else
53547 +                               mode = CRC_APPEND_ITEM;
53548 +               }
53549 +               else {
53550 +                        /* flow is exhausted: cut a mergeable item, otherwise drop the squeeze info */
53551 +                       if (idata->mergeable) {
53552 +                               mode = CRC_CUT_ITEM;
53553 +                               idata->mergeable = 0;
53554 +                       }
53555 +                       else {
53556 +                               detach_squeeze_idata(pos->sq);
53557 +                               return RETERR(-E_REPEAT);
53558 +                       }
53559 +               }
53560 +       }
53561 +       assert("edward-433", info != NULL);
53562 +       result = write_ctail(pos, mode);
53563 +       if (result) {
53564 +               detach_squeeze_idata(pos->sq);
53565 +               return result;
53566 +       }
53567 +
53568 +       if (mode == CRC_APPEND_ITEM) {
53569 +               /* detach squeeze info */
53570 +               assert("edward-434", info->flow.length == 0);
53571 +               detach_squeeze_idata(pos->sq);
53572 +               return RETERR(-E_REPEAT);
53573 +       }
53574 +       return 0;
53575 +}
53576 +
53577 +/* Make Linus happy.
53578 +   Local variables:
53579 +   c-indentation-style: "K&R"
53580 +   mode-name: "LC"
53581 +   c-basic-offset: 8
53582 +   tab-width: 8
53583 +   fill-column: 120
53584 +   End:
53585 +*/
53586 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/ctail.h linux-2.6.8-rc3-a/fs/reiser4/plugin/item/ctail.h
53587 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/ctail.h      1970-01-01 03:00:00.000000000 +0300
53588 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/ctail.h    2004-08-05 21:20:53.426588097 +0400
53589 @@ -0,0 +1,101 @@
53590 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53591 +
53592 +#if !defined( __FS_REISER4_CTAIL_H__ )
53593 +#define __FS_REISER4_CTAIL_H__
53594 +
53595 +/* cryptcompress object item. See ctail.c for description. */
53596 +#include "../cryptcompress.h"
53597 +
53598 +#include <linux/pagevec.h>
53599 +/* layout of a ctail item: a d8 cluster shift followed by the body (zero-length array member), packed */
53600 +typedef struct ctail_item_format {
53601 +       /* cluster shift */
53602 +       d8 cluster_shift;
53603 +       /* ctail body */
53604 +       d8 body[0];
53605 +} __attribute__((packed)) ctail_item_format;
53606 +
53607 +/* for flush squeeze: per-item state kept between squeeze_ctail() calls */
53608 +typedef struct ctail_squeeze_info {
53609 +       struct inode * inode; /* set by init_squeeze_data_ctail() */
53610 +       reiser4_cluster_t * clust; /* allocated by init_squeeze_data_ctail(), freed by free_squeeze_data_ctail() */
53611 +       flow_t flow; /* built in attach_squeeze_idata() over clust->buf */
53612 +} ctail_squeeze_info_t;
53613 +
53614 +#define CTAIL_MIN_BODY_SIZE MIN_CRYPTO_BLOCKSIZE
53615 +/* page linked through ->lru at the tail of list @head (head->prev), and the one before it (head->prev->prev) */
53616 +#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
53617 +#define list_to_next_page(head) (list_entry((head)->prev->prev, struct page, lru))
53618 +
53619 +struct cut_list;
53620 +
53621 +/* plugin->item.b.* */
53622 +int can_contain_key_ctail(const coord_t *, const reiser4_key *, const reiser4_item_data *);
53623 +int mergeable_ctail(const coord_t * p1, const coord_t * p2);
53624 +pos_in_node_t nr_units_ctail(const coord_t * coord);
53625 +int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
53626 +void print_ctail(const char *prefix, coord_t * coord);
53627 +lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
53628 +
53629 +int paste_ctail(coord_t * coord, reiser4_item_data * data, carry_plugin_info * info UNUSED_ARG);
53630 +int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
53631 +int can_shift_ctail(unsigned free_space, coord_t * coord,
53632 +                 znode * target, shift_direction pend, unsigned *size, unsigned want);
53633 +void copy_units_ctail(coord_t * target, coord_t * source,
53634 +                   unsigned from, unsigned count, shift_direction where_is_free_space, unsigned free_space);
53635 +int cut_units_ctail(coord_t *coord, pos_in_node_t from, pos_in_node_t to,
53636 +                   carry_cut_data *, reiser4_key * smallest_removed, reiser4_key *new_first);
53637 +int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
53638 +                    carry_kill_data *, reiser4_key * smallest_removed, reiser4_key *new_first);
53639 +
53640 +/*int check_check(const coord_t * coord, const char **error);*/
53641 +
53642 +/* plugin->u.item.s.* */
53643 +int read_ctail(struct file *, flow_t *, hint_t *);
53644 +int readpage_ctail(void *, struct page *);
53645 +void readpages_ctail(void *, struct address_space *, struct list_head *);
53646 +reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
53647 +int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t, carry_kill_data *);
53648 +int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
53649 +
53650 +/* plugin->u.item.f */
53651 +int utmost_child_ctail(const coord_t *, sideof, jnode **);
53652 +int scan_ctail(flush_scan *);
53653 +int squeeze_ctail(flush_pos_t *);
53654 +item_plugin * item_plugin_by_jnode(jnode *);
53655 +
53656 +crypto_stat_t * inode_crypto_stat(struct inode *);
53657 +
53658 +void reiser4_cluster_init(reiser4_cluster_t *);
53659 +void put_cluster_data(reiser4_cluster_t *);
53660 +int cluster_is_uptodate (reiser4_cluster_t *);
53661 +void release_cluster_buf(reiser4_cluster_t *);
53662 +size_t inode_scaled_cluster_size(struct inode *);
53663 +loff_t inode_scaled_offset (struct inode *, const loff_t);
53664 +unsigned max_crypto_overhead(struct inode *);
53665 +
53666 +int inflate_cluster(reiser4_cluster_t *, struct inode *);
53667 +int find_cluster_item(hint_t * hint, const reiser4_key *key,
53668 +                     znode_lock_mode lock_mode, ra_info_t *ra_info,
53669 +                     lookup_bias bias);
53670 +int page_of_cluster(struct page *, reiser4_cluster_t *, struct inode *);
53671 +int find_cluster(reiser4_cluster_t *, struct inode *, int read, int write);
53672 +int flush_cluster_pages(reiser4_cluster_t *, struct inode *);
53673 +int deflate_cluster(tfm_info_t, reiser4_cluster_t *, struct inode *);
53674 +void truncate_cluster(struct inode * inode, pgoff_t start, long count);
53675 +int hint_prev_cluster(reiser4_cluster_t * clust);
53676 +void set_nrpages_by_inode(reiser4_cluster_t * clust, struct inode * inode);
53677 +int grab_cluster_pages(struct inode * inode, reiser4_cluster_t * clust);
53678 +void release_cluster_pages(reiser4_cluster_t * clust, int from);
53679 +
53680 +#endif /* __FS_REISER4_CTAIL_H__ */
53681 +
53682 +/* Make Linus happy.
53683 +   Local variables:
53684 +   c-indentation-style: "K&R"
53685 +   mode-name: "LC"
53686 +   c-basic-offset: 8
53687 +   tab-width: 8
53688 +   fill-column: 120
53689 +   End:
53690 +*/
53691 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/extent.c linux-2.6.8-rc3-a/fs/reiser4/plugin/item/extent.c
53692 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/extent.c     1970-01-01 03:00:00.000000000 +0300
53693 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/extent.c   2004-08-05 21:20:52.843711040 +0400
53694 @@ -0,0 +1,196 @@
53695 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53696 +
53697 +#include "item.h"
53698 +#include "../../key.h"
53699 +#include "../../super.h"
53700 +#include "../../carry.h"
53701 +#include "../../inode.h"
53702 +#include "../../page_cache.h"
53703 +#include "../../emergency_flush.h"
53704 +#include "../../prof.h"
53705 +#include "../../flush.h"
53706 +#include "../object.h"
53707 +
53708 +
53709 +/* prepare structure reiser4_item_data. It is used to put one extent unit into tree.
53710 +   Returns @data. Audited by: green(2002.06.13) */
53711 +reiser4_internal reiser4_item_data *
53712 +init_new_extent(reiser4_item_data *data, void *ext_unit, int nr_extents)
53713 +{
53714 +       if (REISER4_ZERO_NEW_NODE)
53715 +               memset(data, 0, sizeof(reiser4_item_data)); /* otherwise fields not set below keep their old values */
53716 +
53717 +       data->data = ext_unit;
53718 +       /* data->data is kernel space */
53719 +       data->user = 0;
53720 +       data->length = sizeof(reiser4_extent) * nr_extents; /* size in bytes of @nr_extents units */
53721 +       data->arg = 0;
53722 +       data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
53723 +       return data;
53724 +}
53725 +
53726 +/* number of bytes covered by the first @nr extent units of the item at @coord */
53727 +reiser4_internal reiser4_block_nr
53728 +extent_size(const coord_t *coord, pos_in_node_t nr)
53729 +{
53730 +       reiser4_extent *unit;
53731 +       pos_in_node_t left;
53732 +       reiser4_block_nr total = 0;
53733 +
53734 +       unit = item_body_by_coord(coord);
53735 +       assert("vs-263", nr <= nr_units_extent(coord));
53736 +
53737 +       /* sum the widths (in blocks) of units [0, nr) */
53738 +       for (left = nr; left > 0; left--, unit++)
53739 +               total += extent_get_width(unit);
53740 +
53741 +       /* blocks -> bytes */
53742 +       return total * current_blocksize;
53743 +}
53744 +/* classify @ext by its start field: 0 - hole, 1 - unallocated, any other value - allocated */
53745 +reiser4_internal extent_state
53746 +state_of_extent(reiser4_extent *ext)
53747 +{
53748 +       /* compare the full 64-bit start: an (int) cast misreads starts whose low 32 bits are 0 or 1 */
53749 +       switch (extent_get_start(ext)) {
53750 +       case HOLE_EXTENT_START:
53751 +               return HOLE_EXTENT;
53752 +       case UNALLOCATED_EXTENT_START:
53753 +               return UNALLOCATED_EXTENT;
53754 +       default:
53755 +               return ALLOCATED_EXTENT;
53756 +       }
53757 +}
53758 +/* nonzero iff the extent that extent_by_coord(@item) yields is in UNALLOCATED_EXTENT state */
53759 +reiser4_internal int
53760 +extent_is_unallocated(const coord_t *item)
53761 +{
53762 +       assert("jmacd-5133", item_is_extent(item)); /* NOTE(review): extent_is_allocated() reuses this label */
53763 +
53764 +       return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
53765 +}
53766 +/* nonzero iff the extent that extent_by_coord(@item) yields is in ALLOCATED_EXTENT state */
53767 +reiser4_internal int
53768 +extent_is_allocated(const coord_t *item)
53769 +{
53770 +       assert("jmacd-5133", item_is_extent(item)); /* NOTE(review): extent_is_unallocated() reuses this label */
53771 +
53772 +       return state_of_extent(extent_by_coord(item)) == ALLOCATED_EXTENT;
53773 +}
53774 +
53775 +/* set extent's start and width; start goes first because extent_set_width() reads it in its assertion */
53776 +reiser4_internal void
53777 +set_extent(reiser4_extent *ext, reiser4_block_nr start, reiser4_block_nr width)
53778 +{
53779 +       extent_set_start(ext, start);
53780 +       extent_set_width(ext, width);
53781 +}
53782 +
53783 +/* used in split_allocated_extent, conv_extent, plug_hole to insert 1 or 2 extent units (@exts_to_add) after the one
53784 +   @un_extent is set to. @un_extent itself is changed to @replace. Returns 0 or the error of insert/zload */
53785 +reiser4_internal int
53786 +replace_extent(coord_t *un_extent, lock_handle *lh,
53787 +              reiser4_key *key, reiser4_item_data *exts_to_add, const reiser4_extent *replace, unsigned flags UNUSED_ARG,
53788 +              int return_inserted_position /* if it is 1 - un_extent and lh are returned set to first of newly inserted
53789 +                                              units, if it is 0 - un_extent and lh are returned set to unit which was
53790 +                                              replaced */)
53791 +{
53792 +       int result;
53793 +       coord_t coord_after;
53794 +       lock_handle lh_after;
53795 +       tap_t watch;
53796 +       znode *orig_znode;
53797 +       ON_DEBUG(reiser4_extent orig_ext);      /* this is for debugging */
53798 +
53799 +       assert("vs-990", coord_is_existing_unit(un_extent));
53800 +       assert("vs-1375", znode_is_write_locked(un_extent->node));
53801 +       assert("vs-1426", extent_get_width(replace) != 0);
53802 +       assert("vs-1427", extent_get_width((reiser4_extent *)exts_to_add->data) != 0);
53803 +
53804 +       coord_dup(&coord_after, un_extent); /* private copy of the position and of the lock, watched by the tap */
53805 +       init_lh(&lh_after);
53806 +       copy_lh(&lh_after, lh);
53807 +       tap_init(&watch, &coord_after, &lh_after, ZNODE_WRITE_LOCK);
53808 +       tap_monitor(&watch);
53809 +
53810 +       ON_DEBUG(orig_ext = *extent_by_coord(un_extent));
53811 +       orig_znode = un_extent->node; /* remembered to detect that the unit ended up in another node */
53812 +
53813 +       /* make sure that key is set properly */
53814 +       if (REISER4_DEBUG) {
53815 +               reiser4_key tmp;
53816 +
53817 +               unit_key_by_coord(un_extent, &tmp);
53818 +               set_key_offset(&tmp, get_key_offset(&tmp) + extent_get_width(replace) * current_blocksize);
53819 +               assert("vs-1080", keyeq(&tmp, key)); /* @key == key of the unit + byte size of @replace */
53820 +       }
53821 +
53822 +       DISABLE_NODE_CHECK;
53823 +
53824 +       /* set insert point after unit to be replaced */
53825 +       un_extent->between = AFTER_UNIT;
53826 +
53827 +       result = insert_into_item(un_extent,
53828 +                                 return_inserted_position ? lh : 0, /* 0 here is a null lock handle pointer */
53829 +                                 /*(flags == COPI_DONT_SHIFT_LEFT) ? 0 : lh,*/ key, exts_to_add, flags);
53830 +       if (!result) {
53831 +               /* now we have to replace the unit after which new units were inserted. Its position is tracked by
53832 +                  @watch */
53833 +               reiser4_extent *ext;
53834 +
53835 +               if (coord_after.node != orig_znode) {
53836 +                       coord_clear_iplug(&coord_after);
53837 +                       result = zload(coord_after.node); /* paired with the zrelse() below */
53838 +               }
53839 +
53840 +               if (likely(!result)) {
53841 +                       ext = extent_by_coord(&coord_after);
53842 +
53843 +                       assert("vs-987", znode_is_loaded(coord_after.node));
53844 +                       assert("vs-988", !memcmp(ext, &orig_ext, sizeof (*ext)));
53845 +
53846 +                       *ext = *replace;
53847 +                       znode_make_dirty(coord_after.node);
53848 +
53849 +                       if (coord_after.node != orig_znode)
53850 +                               zrelse(coord_after.node);
53851 +
53852 +                       if (return_inserted_position == 0) {
53853 +                               /* return un_extent and lh set to the same */
53854 +                               assert("vs-1662", WITH_DATA(coord_after.node, !memcmp(replace, extent_by_coord(&coord_after), sizeof(reiser4_extent))));
53855 +
53856 +                               *un_extent = coord_after;
53857 +                               done_lh(lh);
53858 +                               copy_lh(lh, &lh_after);
53859 +                       } else {
53860 +                               /* return un_extent and lh set to first of inserted units */
53861 +                               assert("vs-1663", WITH_DATA(un_extent->node, !memcmp(exts_to_add->data, extent_by_coord(un_extent), sizeof(reiser4_extent))));
53862 +                               assert("vs-1664", lh->node == un_extent->node);
53863 +                       }
53864 +               }
53865 +       }
53866 +       tap_done(&watch); /* reached on success and on failure alike */
53867 +
53868 +       ENABLE_NODE_CHECK;
53869 +       return result;
53870 +}
53871 +/* first entry of the lock owners list of @node, which must be write locked exactly once */
53872 +reiser4_internal lock_handle *
53873 +znode_lh(znode *node)
53874 +{
53875 +       assert("vs-1371", znode_is_write_locked(node));
53876 +       assert("vs-1372", znode_is_wlocked_once(node));
53877 +       return owners_list_front(&node->lock.owners);
53878 +}
53879 +
53880 +
53881 +/*
53882 +   Local variables:
53883 +   c-indentation-style: "K&R"
53884 +   mode-name: "LC"
53885 +   c-basic-offset: 8
53886 +   tab-width: 8
53887 +   fill-column: 120
53888 +   scroll-step: 1
53889 +   End:
53890 +*/
53891 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/extent.h linux-2.6.8-rc3-a/fs/reiser4/plugin/item/extent.h
53892 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/extent.h     1970-01-01 03:00:00.000000000 +0300
53893 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/extent.h   2004-08-05 21:20:53.168642504 +0400
53894 @@ -0,0 +1,176 @@
53895 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53896 +
53897 +#ifndef __REISER4_EXTENT_H__
53898 +#define __REISER4_EXTENT_H__
53899 +
53900 +/* on disk extent */
53901 +typedef struct {
53902 +       reiser4_dblock_nr start;
53903 +       reiser4_dblock_nr width;
53904 +} reiser4_extent;
53905 +
53906 +typedef struct extent_stat {
53907 +       int unallocated_units;
53908 +       int unallocated_blocks;
53909 +       int allocated_units;
53910 +       int allocated_blocks;
53911 +       int hole_units;
53912 +       int hole_blocks;
53913 +} extent_stat;
53914 +
53915 +/* extents in an extent item can be either holes, or unallocated or allocated
53916 +   extents */
53917 +typedef enum {
53918 +       HOLE_EXTENT,
53919 +       UNALLOCATED_EXTENT,
53920 +       ALLOCATED_EXTENT
53921 +} extent_state;
53922 +
53923 +#define HOLE_EXTENT_START 0
53924 +#define UNALLOCATED_EXTENT_START 1
53925 +#define UNALLOCATED_EXTENT_START2 2
53926 +
53927 +typedef struct { /* extent specific part of uf_coord_t: cached description of the unit the coord is set to */
53928 +       reiser4_block_nr pos_in_unit; /* block position inside the current unit; less than @width when valid */
53929 +       reiser4_block_nr width; /* width of current unit */
53930 +       pos_in_node_t nr_units; /* number of units */
53931 +       int ext_offset; /* offset from the beginning of zdata() */
53932 +       unsigned long expected_page; /* NOTE(review): not referenced in this part of the patch; purpose to be confirmed */
53933 +#if REISER4_DEBUG
53934 +       reiser4_extent extent; /* debug only copy of the current unit, compared with the node by ext_by_ext_coord() */
53935 +#endif
53936 +} extent_coord_extension_t;
53937 +
53938 +/* inline accessors for the fields of on-disk extent; values go through dblock_to_cpu() / cpu_to_dblock() */
53939 +static inline reiser4_block_nr
53940 +extent_get_start(const reiser4_extent * ext)
53941 +{
53942 +       return dblock_to_cpu(&ext->start); /* start field of @ext after dblock_to_cpu() conversion */
53943 +}
53944 +
53945 +static inline reiser4_block_nr
53946 +extent_get_width(const reiser4_extent * ext)
53947 +{
53948 +       return dblock_to_cpu(&ext->width); /* width field of @ext after dblock_to_cpu() conversion */
53949 +}
53950 +
53951 +extern __u64 reiser4_current_block_count(void);
53952 +
53953 +static inline void
53954 +extent_set_start(reiser4_extent * ext, reiser4_block_nr start)
53955 +{
53956 +       cassert(sizeof (ext->start) == 8); /* the on-disk start field is expected to be 8 bytes */
53957 +       assert("nikita-2510", ergo(start > 1, start < reiser4_current_block_count())); /* 0 and 1 are the hole/unallocated markers */
53958 +       cpu_to_dblock(start, &ext->start); /* store @start into the on-disk field */
53959 +}
53960 +
53961 +static inline void
53962 +extent_set_width(reiser4_extent *ext, reiser4_block_nr width)
53963 +{
53964 +       cassert(sizeof (ext->width) == 8); /* the on-disk width field is expected to be 8 bytes */
53965 +       cpu_to_dblock(width, &ext->width); /* store @width into the on-disk field */
53966 +       assert("nikita-2511", /* an extent whose start is above 1 must end within the current block count */
53967 +              ergo(extent_get_start(ext) > 1,
53968 +                   extent_get_start(ext) + width <= reiser4_current_block_count()));
53969 +}
53970 +
53971 +#define extent_item(coord) /* body of the item @coord is set to, as a pointer to reiser4_extent */ \
53972 +({                                                             \
53973 +       assert("nikita-3143", item_is_extent(coord));           \
53974 +       ((reiser4_extent *)item_body_by_coord (coord));         \
53975 +})
53976 +
53977 +#define extent_by_coord(coord) /* extent unit number (coord)->unit_pos of the item @coord is set to */ \
53978 +({                                                             \
53979 +       assert("nikita-3144", item_is_extent(coord));           \
53980 +       (extent_item (coord) + (coord)->unit_pos);              \
53981 +})
53982 +
53983 +#define width_by_coord(coord) /* width of the extent unit @coord is set to */ \
53984 +({                                                             \
53985 +       assert("nikita-3145", item_is_extent(coord));           \
53986 +       extent_get_width (extent_by_coord(coord));              \
53987 +})
53988 +
53989 +struct carry_cut_data;
53990 +struct carry_kill_data;
53991 +
53992 +/* plugin->u.item.b.* */
53993 +reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
53994 +int can_contain_key_extent(const coord_t * coord, const reiser4_key * key, const reiser4_item_data *);
53995 +int mergeable_extent(const coord_t * p1, const coord_t * p2);
53996 +pos_in_node_t nr_units_extent(const coord_t *);
53997 +lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
53998 +void init_coord_extent(coord_t *);
53999 +int init_extent(coord_t *, reiser4_item_data *);
54000 +int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
54001 +int can_shift_extent(unsigned free_space,
54002 +                    coord_t * source, znode * target, shift_direction, unsigned *size, unsigned want);
54003 +void copy_units_extent(coord_t * target,
54004 +                      coord_t * source,
54005 +                      unsigned from, unsigned count, shift_direction where_is_free_space, unsigned free_space);
54006 +int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count, struct carry_kill_data *);
54007 +int create_hook_extent(const coord_t * coord, void *arg);
54008 +int cut_units_extent(coord_t *coord, pos_in_node_t from, pos_in_node_t to,
54009 +                    struct carry_cut_data *, reiser4_key *smallest_removed, reiser4_key *new_first);
54010 +int kill_units_extent(coord_t *coord, pos_in_node_t from, pos_in_node_t to,
54011 +                     struct carry_kill_data *, reiser4_key *smallest_removed, reiser4_key *new_first);
54012 +reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
54013 +reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
54014 +void print_extent(const char *, coord_t *);
54015 +void show_extent(struct seq_file *m, coord_t *coord);
54016 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
54017 +int utmost_child_real_block_extent(const coord_t * coord, sideof side, reiser4_block_nr * block);
54018 +void item_stat_extent(const coord_t * coord, void *vp);
54019 +int check_extent(const coord_t * coord, const char **error);
54020 +
54021 +/* plugin->u.item.s.file.* */
54022 +int write_extent(struct inode *, flow_t *, hint_t *, int grabbed, write_mode_t);
54023 +int read_extent(struct file *, flow_t *, hint_t *);
54024 +int readpage_extent(void *, struct page *);
54025 +void readpages_extent(void *, struct address_space *, struct list_head *pages);
54026 +int capture_extent(reiser4_key *, uf_coord_t *, struct page *, write_mode_t);
54027 +reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
54028 +void init_coord_extension_extent(uf_coord_t *, loff_t offset);
54029 +int get_block_address_extent(const coord_t *, sector_t block, struct buffer_head *);
54030 +
54031 +
54032 +/* these are used in flush.c
54033 +   FIXME-VS: should they be somewhere in item_plugin? */
54034 +int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
54035 +int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos, reiser4_key * stop_key);
54036 +
54037 +int extent_is_unallocated(const coord_t * item);       /* True if this extent is unallocated (i.e., not a hole, not allocated). */
54038 +int extent_is_allocated(const coord_t *);
54039 +__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
54040 +__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
54041 +reiser4_block_nr extent_unit_start(const coord_t * item);      /* Starting block location of this unit. */
54042 +
54043 +/* plugin->u.item.f. */
54044 +int scan_extent (flush_scan * scan);
54045 +extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *);
54046 +
54047 +reiser4_item_data *init_new_extent(reiser4_item_data *data, void *ext_unit, int nr_extents);
54048 +reiser4_block_nr extent_size(const coord_t *coord, pos_in_node_t nr);
54049 +extent_state state_of_extent(reiser4_extent *ext);
54050 +void set_extent(reiser4_extent *ext, reiser4_block_nr start, reiser4_block_nr width);
54051 +int replace_extent(coord_t *un_extent, lock_handle *lh,
54052 +                  reiser4_key *key, reiser4_item_data *data, const reiser4_extent *new_ext, unsigned flags, int);
54053 +lock_handle *znode_lh(znode *);
54054 +
54055 +/* the reiser4 repacker support */
54056 +struct repacker_cursor;
54057 +extern int process_extent_backward_for_repacking (tap_t *, struct repacker_cursor *);
54058 +extern int mark_extent_for_repacking (tap_t *, int);
54059 +
54060 +/* __REISER4_EXTENT_H__ */
54061 +#endif
54062 +/*
54063 +   Local variables:
54064 +   c-indentation-style: "K&R"
54065 +   mode-name: "LC"
54066 +   c-basic-offset: 8
54067 +   tab-width: 8
54068 +   fill-column: 120
54069 +   End:
54070 +*/
54071 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/extent_file_ops.c linux-2.6.8-rc3-a/fs/reiser4/plugin/item/extent_file_ops.c
54072 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/extent_file_ops.c    1970-01-01 03:00:00.000000000 +0300
54073 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/extent_file_ops.c  2004-08-05 21:20:53.441584934 +0400
54074 @@ -0,0 +1,1401 @@
54075 +/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54076 +
54077 +#include "item.h"
54078 +#include "../../inode.h"
54079 +#include "../../page_cache.h"
54080 +#include "../../flush.h" /* just for jnode_tostring */
54081 +#include "../object.h"
54082 +
54083 +#include <linux/quotaops.h>
54084 +
54085 +static inline reiser4_extent *
54086 +ext_by_offset(const znode *node, int offset)
54087 +{
54088 +       /* an extent unit is addressed by its offset from the beginning of
54089 +          the data of @node (zdata()); coord extensions step this offset
54090 +          by sizeof(reiser4_extent) to move from unit to unit */
54091 +       return (reiser4_extent *)(zdata(node) + offset);
54092 +}
54093 +
54094 +static inline reiser4_extent *
54095 +ext_by_ext_coord(const uf_coord_t *uf_coord)
54096 +{
54097 +       reiser4_extent *ext;
54098 +       /* find the unit in the node through the offset cached in the coord extension */
54099 +       ext = ext_by_offset(uf_coord->base_coord.node, uf_coord->extension.extent.ext_offset);
54100 +       assert("vs-1650", extent_get_start(ext) == extent_get_start(&uf_coord->extension.extent.extent)); /* debug copy... */
54101 +       assert("vs-1651", extent_get_width(ext) == extent_get_width(&uf_coord->extension.extent.extent)); /* ...must match */
54102 +       return ext;
54103 +}
54104 +
54105 +#if REISER4_DEBUG
54106 +static int
54107 +coord_extension_is_ok(const uf_coord_t *uf_coord)
54108 +{
54109 +       const coord_t *coord;
54110 +       const extent_coord_extension_t *ext_coord;
54111 +       reiser4_extent *ext;
54112 +       /* debug check: non-zero when the cached extension of @uf_coord agrees with the extent item in the node */
54113 +       coord = &uf_coord->base_coord;
54114 +       ext_coord = &uf_coord->extension.extent;
54115 +       ext = ext_by_ext_coord(uf_coord);
54116 +       /* NOTE(review): WITH_DATA presumably evaluates the expression with the node data loaded; confirm */
54117 +       return WITH_DATA(coord->node, (uf_coord->valid == 1 &&
54118 +                                      coord_is_iplug_set(coord) &&
54119 +                                      item_is_extent(coord) &&
54120 +                                      ext_coord->nr_units == nr_units_extent(coord) &&
54121 +                                      ext == extent_by_coord(coord) &&
54122 +                                      ext_coord->width == extent_get_width(ext) &&
54123 +                                      coord->unit_pos < ext_coord->nr_units &&
54124 +                                      ext_coord->pos_in_unit < ext_coord->width &&
54125 +                                      extent_get_start(ext) == extent_get_start(&ext_coord->extent) &&
54126 +                                      extent_get_width(ext) == extent_get_width(&ext_coord->extent)));
54127 +}
54128 +
54129 +#endif
54130 +
54131 +/* Pad a file with a hole so that its extent item reaches the write position
54132 +   @key. @coord is set either to the end of last extent item of the file
54133 +   (coord->node is on the twig level) or to the place where the first item of
54134 +   the file has to be inserted (coord->node is a leaf). Returns 0 or the error
54135 +   of zload() or of the insertion. Original note: a too big hole is inserted only in part */
54136 +static int
54137 +add_hole(coord_t *coord, lock_handle *lh, const reiser4_key *key /* key of position in a file for write */)
54138 +{
54139 +       int result;
54140 +       znode *loaded;
54141 +       reiser4_extent *ext, new_ext;
54142 +       reiser4_block_nr hole_width;
54143 +       reiser4_item_data item;
54144 +       reiser4_key hole_key;
54145 +
54146 +       /*coord_clear_iplug(coord);*/
54147 +       result = zload(coord->node);
54148 +       if (result)
54149 +               return result;
54150 +       loaded = coord->node; /* remembered so that every exit path can zrelse() the node loaded here */
54151 +
54152 +       if (znode_get_level(coord->node) == LEAF_LEVEL) {
54153 +               /* there are no items of this file yet. First item will be
54154 +                  hole extent inserted here */
54155 +
54156 +               /* @coord must be set for inserting of new item */
54157 +               assert("vs-711", coord_is_between_items(coord));
54158 +
54159 +               hole_key = *key;
54160 +               set_key_offset(&hole_key, 0ull);
54161 +               /* the hole starts at offset 0 and covers everything before @key, rounded up to whole blocks */
54162 +               hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
54163 +                             current_blocksize_bits);
54164 +               assert("vs-710", hole_width > 0);
54165 +
54166 +               /* compose body of hole extent */
54167 +               set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
54168 +
54169 +               result = insert_extent_by_coord(coord, init_new_extent(&item, &new_ext, 1), &hole_key, lh);
54170 +               zrelse(loaded);
54171 +               return result;
54172 +       }
54173 +
54174 +       /* last item of file may have to be appended with hole */
54175 +       assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
54176 +       assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID);
54177 +
54178 +       /* make sure we are at proper item */
54179 +       assert("vs-918", keylt(key, max_key_inside_extent(coord, &hole_key)));
54180 +
54181 +       /* key of first byte which is not addressed by this extent */
54182 +       append_key_extent(coord, &hole_key);
54183 +
54184 +       if (keyle(key, &hole_key)) {
54185 +               /* there is already extent unit which contains position
54186 +                  specified by @key */
54187 +               zrelse(loaded);
54188 +               return 0;
54189 +       }
54190 +
54191 +       /* extent item has to be appended with hole. Calculate length of that
54192 +          hole */
54193 +       hole_width = ((get_key_offset(key) - get_key_offset(&hole_key) +
54194 +                      current_blocksize - 1) >> current_blocksize_bits);
54195 +       assert("vs-954", hole_width > 0);
54196 +
54197 +       /* set coord after last unit */
54198 +       coord_init_after_item_end(coord);
54199 +
54200 +       /* get last extent in the item */
54201 +       ext = extent_by_coord(coord);
54202 +       if (state_of_extent(ext) == HOLE_EXTENT) {
54203 +               /* last extent of a file is hole extent. Widen that extent by
54204 +                  @hole_width blocks. Note that we do not worry about
54205 +                  overflowing - extent width is 64 bits */
54206 +               set_extent(ext, HOLE_EXTENT_START, extent_get_width(ext) + hole_width);
54207 +               znode_make_dirty(coord->node);
54208 +               zrelse(loaded);
54209 +               return 0;
54210 +       }
54211 +
54212 +       /* append item with hole extent unit */
54213 +       assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT || state_of_extent(ext) == UNALLOCATED_EXTENT));
54214 +
54215 +       /* compose body of hole extent */
54216 +       set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
54217 +
54218 +       result = insert_into_item(coord, lh, &hole_key, init_new_extent(&item, &new_ext, 1), 0 /*flags */ );
54219 +       zrelse(loaded);
54220 +       return result;
54221 +}
54222 +
54223 +/* create the first extent item of a file: one unallocated extent of width 1
54224 +   is inserted at the place @uf_coord->base_coord is set to */
54225 +static int
54226 +insert_first_block(uf_coord_t *uf_coord, const reiser4_key *key, reiser4_block_nr *block)
54227 +{
54228 +       reiser4_extent first_ext;
54229 +       reiser4_item_data item;
54230 +       int err;
54231 +
54232 +       /* only a write to the very first block of a file ends up here */
54233 +       assert("vs-240", get_key_offset(key) == 0);
54234 +
54235 +       /* the insertion point has to be on the leaf level */
54236 +       assert("vs-719", znode_get_level(uf_coord->base_coord.node) == LEAF_LEVEL);
54237 +
54238 +       set_extent(&first_ext, UNALLOCATED_EXTENT_START, 1);
54239 +       err = insert_extent_by_coord(&uf_coord->base_coord, init_new_extent(&item, &first_ext, 1), key, uf_coord->lh);
54240 +       if (err == 0) {
54241 +               /* report a fake block number for the unallocated block */
54242 +               *block = fake_blocknr_unformatted();
54243 +
54244 +               /* write will continue on twig level, so the coordinate is
54245 +                  invalidated and research must be performed to continue */
54246 +               uf_coord->valid = 0;
54247 +       }
54248 +       /* FIXME-VITALY: this is grabbed at file_write time. */
54249 +       /* grabbed2free ((__u64)1); */
54250 +       return err;
54251 +}
54252 +
54253 +/* @coord is set to the end of extent item. Append it with pointer to one block - either by expanding last unallocated
54254 +   extent or by appending a new one of width 1. Returns 0 with a fake block number in *@block, or an error code */
54255 +static int
54256 +append_one_block(uf_coord_t *uf_coord, reiser4_key *key, reiser4_block_nr *block)
54257 +{
54258 +       int result;
54259 +       reiser4_extent new_ext;
54260 +       reiser4_item_data unit;
54261 +       coord_t *coord;
54262 +       extent_coord_extension_t *ext_coord;
54263 +       reiser4_extent *ext;
54264 +
54265 +       coord = &uf_coord->base_coord;
54266 +       ext_coord = &uf_coord->extension.extent;
54267 +       ext = ext_by_ext_coord(uf_coord);
54268 +
54269 +       /* check correctness of position in the item */
54270 +       assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
54271 +       assert("vs-1311", coord->between == AFTER_UNIT);
54272 +       assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
54273 +       assert("vs-883",
54274 +              ( {
54275 +                      reiser4_key next;
54276 +                      keyeq(key, append_key_extent(coord, &next));
54277 +              }));
54278 +
54279 +       switch (state_of_extent(ext)) {
54280 +       case UNALLOCATED_EXTENT:
54281 +               set_extent(ext, UNALLOCATED_EXTENT_START, extent_get_width(ext) + 1);
54282 +               znode_make_dirty(coord->node);
54283 +               /* update coord extension: the unit in the node is one block wider now */
54284 +               ext_coord->width ++;
54285 +               ON_DEBUG(extent_set_width(&uf_coord->extension.extent.extent, ext_coord->width));
54286 +               break;
54287 +
54288 +       case HOLE_EXTENT:
54289 +       case ALLOCATED_EXTENT:
54290 +               /* append one unallocated extent of width 1 */
54291 +               set_extent(&new_ext, UNALLOCATED_EXTENT_START, 1);
54292 +               result = insert_into_item(coord, uf_coord->lh, key, init_new_extent(&unit, &new_ext, 1), 0 /* flags */ );
54293 +               /* FIXME: for now */
54294 +               uf_coord->valid = 0;
54295 +               if (result)
54296 +                       return result;
54297 +               break;
54298 +       default: /* unknown extent state: fail like overwrite_one_block() does instead of reporting success */
54299 +               assert("", 0);
54300 +               return RETERR(-EIO);
54301 +       }
54302 +
54303 +       *block = fake_blocknr_unformatted();
54304 +       return 0;
54305 +}
54306 +
54307 +/* @uf_coord is set to a block inside a hole unit of an extent item. Turn that block into an unallocated extent of width
54308 +   1: widen an adjacent unallocated unit when there is one, else replace the hole unit, keeping hole units for what
54309 +   remains before and/or after the block. Returns 0 or the result of replace_extent(). @key is not used in the body */
54310 +static int
54311 +plug_hole(uf_coord_t *uf_coord, reiser4_key *key)
54312 +{
54313 +       reiser4_extent *ext, new_exts[2],       /* extents which will be added after original
54314 +                                                * hole one */
54315 +        replace;               /* extent original hole extent will be replaced
54316 +                                * with */
54317 +       reiser4_block_nr width, pos_in_unit;
54318 +       reiser4_item_data item;
54319 +       int count;
54320 +       coord_t *coord;
54321 +       extent_coord_extension_t *ext_coord;
54322 +       reiser4_key tmp_key;
54323 +       int return_inserted_position;
54324 +
54325 +       coord = &uf_coord->base_coord;
54326 +       ext_coord = &uf_coord->extension.extent;
54327 +       ext = ext_by_ext_coord(uf_coord);
54328 +
54329 +       width = ext_coord->width;
54330 +       pos_in_unit = ext_coord->pos_in_unit;
54331 +
54332 +       if (width == 1) { /* the hole is exactly this block: the unit is converted in place */
54333 +               set_extent(ext, UNALLOCATED_EXTENT_START, 1);
54334 +               znode_make_dirty(coord->node);
54335 +               return 0;
54336 +       } else if (pos_in_unit == 0) { /* first block of the hole */
54337 +               if (coord->unit_pos) {
54338 +                       if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
54339 +                               extent_set_width(ext - 1, extent_get_width(ext - 1) + 1);
54340 +                               extent_set_width(ext, width - 1);
54341 +                               znode_make_dirty(coord->node);
54342 +
54343 +                               /* update coord extension */
54344 +                               coord->unit_pos --;
54345 +                               ext_coord->width = extent_get_width(ext - 1);
54346 +                               ext_coord->pos_in_unit = ext_coord->width - 1;
54347 +                               ext_coord->ext_offset -= sizeof(reiser4_extent);
54348 +                               ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
54349 +                               return 0;
54350 +                       }
54351 +               }
54352 +               /* extent for replace */
54353 +               set_extent(&replace, UNALLOCATED_EXTENT_START, 1);
54354 +               /* extent to be inserted */
54355 +               set_extent(&new_exts[0], HOLE_EXTENT_START, width - 1);
54356 +
54357 +               /* have replace_extent to return with @coord and @uf_coord->lh set to unit which was replaced */
54358 +               return_inserted_position = 0;
54359 +               count = 1;
54360 +       } else if (pos_in_unit == width - 1) { /* last block of the hole */
54361 +               if (coord->unit_pos < nr_units_extent(coord) - 1) {
54362 +                       if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
54363 +                               extent_set_width(ext + 1, extent_get_width(ext + 1) + 1);
54364 +                               extent_set_width(ext, width - 1);
54365 +                               znode_make_dirty(coord->node);
54366 +
54367 +                               /* update coord extension */
54368 +                               coord->unit_pos ++;
54369 +                               ext_coord->width = extent_get_width(ext + 1);
54370 +                               ext_coord->pos_in_unit = 0;
54371 +                               ext_coord->ext_offset += sizeof(reiser4_extent);
54372 +                               ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
54373 +                               return 0;
54374 +                       }
54375 +               }
54376 +               /* extent for replace */
54377 +               set_extent(&replace, HOLE_EXTENT_START, width - 1);
54378 +               /* extent to be inserted */
54379 +               set_extent(&new_exts[0], UNALLOCATED_EXTENT_START, 1);
54380 +
54381 +               /* have replace_extent to return with @coord and @uf_coord->lh set to unit which was inserted */
54382 +               return_inserted_position = 1;
54383 +               count = 1;
54384 +       } else { /* block in the middle: the result is hole, unallocated block, hole */
54385 +               /* extent for replace */
54386 +               set_extent(&replace, HOLE_EXTENT_START, pos_in_unit);
54387 +               /* extents to be inserted */
54388 +               set_extent(&new_exts[0], UNALLOCATED_EXTENT_START, 1);
54389 +               set_extent(&new_exts[1], HOLE_EXTENT_START, width - pos_in_unit - 1);
54390 +
54391 +               /* have replace_extent to return with @coord and @uf_coord->lh set to first of units which were
54392 +                  inserted */
54393 +               return_inserted_position = 1;
54394 +               count = 2;
54395 +       }
54396 +
54397 +       /* insert_into_item will insert new units after the one @coord is set
54398 +          to. So, update key correspondingly */
54399 +       unit_key_by_coord(coord, &tmp_key);
54400 +       set_key_offset(&tmp_key, (get_key_offset(&tmp_key) + extent_get_width(&replace) * current_blocksize));
54401 +
54402 +       uf_coord->valid = 0;
54403 +       return replace_extent(coord, uf_coord->lh, &tmp_key, init_new_extent(&item, new_exts, count), &replace, 0 /* flags */, return_inserted_position);
54404 +}
54405 +
54406 +/* make unallocated node pointer in the position @uf_coord is set to. What happens depends on the state of the unit:
54407 +   ALLOCATED_EXTENT - *@block is set to the disk block of the position; UNALLOCATED_EXTENT - nothing is changed;
54408 +   HOLE_EXTENT - quota is charged to @inode (when not NULL), the hole is plugged, *@block is set to a fake block
54409 +   number and *@created to 1; the quota is given back when plugging fails.
54410 +   Returns 0, -EDQUOT, the error of plug_hole() or -EIO for an unit of unknown state */
54411 +static int
54412 +overwrite_one_block(uf_coord_t *uf_coord, reiser4_key *key, reiser4_block_nr *block, int *created,
54413 +                   struct inode *inode)
54414 +{
54415 +       int result;
54416 +       extent_coord_extension_t *ext_coord;
54417 +       reiser4_extent *ext;
54418 +
54419 +       assert("vs-1312", uf_coord->base_coord.between == AT_UNIT);
54420 +
54421 +       result = 0;
54422 +       *created = 0;
54423 +       ext_coord = &uf_coord->extension.extent;
54424 +       ext = ext_by_ext_coord(uf_coord);
54425 +
54426 +       switch (state_of_extent(ext)) {
54427 +       case ALLOCATED_EXTENT:
54428 +               *block = extent_get_start(ext) + ext_coord->pos_in_unit;
54429 +               break;
54430 +
54431 +       case HOLE_EXTENT:
54432 +               if (inode != NULL && DQUOT_ALLOC_BLOCK(inode, 1))
54433 +                       return RETERR(-EDQUOT);
54434 +               result = plug_hole(uf_coord, key);
54435 +               if (!result) {
54436 +                       *block = fake_blocknr_unformatted();
54437 +                       *created = 1;
54438 +               } else {
54439 +                       if (inode != NULL)
54440 +                               DQUOT_FREE_BLOCK(inode, 1);
54441 +               }
54442 +               break;
54443 +
54444 +       case UNALLOCATED_EXTENT:
54445 +               /* the position is unallocated already; *@block keeps the value the caller stored there */
54446 +               break;
54447 +
54448 +       default:
54449 +               impossible("vs-238", "extent of unknown type found");
54450 +               result = RETERR(-EIO);
54451 +               break;
54452 +       }
54453 +
54454 +       return result;
54455 +}
54456 +
54457 +#if REISER4_DEBUG
54458 +
54459 +/* after make extent uf_coord's lock handle must be set to node containing unit which was inserted/found */
54460 +static void
54461 +check_make_extent_result(int result, write_mode_t mode, const reiser4_key *key,
54462 +                        const lock_handle *lh, reiser4_block_nr block)
54463 +{
54464 +       coord_t coord;
54465 +
54466 +       if (result != 0)
54467 +               return;
54468 +       assert("vs-960", znode_is_write_locked(lh->node));
54469 +       if (zload(lh->node) != 0)
54470 +               return; /* node data could not be loaded: nothing can be checked and there is nothing to zrelse() */
54471 +       result = lh->node->nplug->lookup(lh->node, key, FIND_EXACT, &coord);
54472 +       assert("vs-1502", result == NS_FOUND);
54473 +       assert("vs-1656", coord_is_existing_unit(&coord));
54474 +
54475 +       if (blocknr_is_fake(&block)) {
54476 +               assert("vs-1657", state_of_extent(extent_by_coord(&coord)) == UNALLOCATED_EXTENT);
54477 +       } else if (block == 0) {
54478 +               assert("vs-1660", mode == OVERWRITE_ITEM);
54479 +               assert("vs-1657", state_of_extent(extent_by_coord(&coord)) == UNALLOCATED_EXTENT); /* NOTE(review): label reused */
54480 +       } else {
54481 +               reiser4_key tmp;
54482 +               reiser4_block_nr pos_in_unit;
54483 +
54484 +               assert("vs-1658", state_of_extent(extent_by_coord(&coord)) == ALLOCATED_EXTENT);
54485 +               unit_key_by_coord(&coord, &tmp);
54486 +               pos_in_unit = (get_key_offset(key) - get_key_offset(&tmp)) >> current_blocksize_bits;
54487 +               assert("vs-1659", block == extent_get_start(extent_by_coord(&coord)) + pos_in_unit);
54488 +       }
54489 +       zrelse(lh->node);
54490 +}
54491 +
54492 +#endif
54493 +
54494 +/* Make the extent item address the block at @key, in the way selected by @mode. *@block is set to the block number
54495 +   of that block (it stays 0 when an unallocated extent is overwritten), *@created to 1 when a new block is added to
54496 +   the file. When @inode is not NULL, alloc quota before updating extent item. Returns 0 or an error code */
54497 +static int
54498 +make_extent(reiser4_key *key, uf_coord_t *uf_coord, write_mode_t mode,
54499 +           reiser4_block_nr *block, int *created, struct inode *inode)
54500 +{
54501 +       int result;
54502 +
54503 +       assert("vs-960", znode_is_write_locked(uf_coord->base_coord.node));
54504 +       assert("vs-1334", znode_is_loaded(uf_coord->base_coord.node));
54505 +
54506 +       DISABLE_NODE_CHECK;
54507 +       *block = 0;
54508 +       switch (mode) {
54509 +       case FIRST_ITEM:
54510 +               /* charge quota for the new block; break (not return) keeps DISABLE/ENABLE_NODE_CHECK paired */
54511 +               if (inode != NULL && DQUOT_ALLOC_BLOCK(inode, 1)) {
54512 +                       result = RETERR(-EDQUOT);
54513 +                       break;
54514 +               }
54515 +
54516 +               /* create first item of the file */
54517 +               result = insert_first_block(uf_coord, key, block);
54518 +               if (result && inode != NULL)
54519 +                       DQUOT_FREE_BLOCK(inode, 1);
54520 +               *created = 1;
54521 +               break;
54522 +
54523 +       case APPEND_ITEM:
54524 +               /* charge quota for the new block; break (not return) keeps DISABLE/ENABLE_NODE_CHECK paired */
54525 +               if (inode != NULL && DQUOT_ALLOC_BLOCK(inode, 1)) {
54526 +                       result = RETERR(-EDQUOT);
54527 +                       break;
54528 +               }
54529 +
54530 +               /* FIXME: item plugin should be initialized
54531 +                  item_plugin_by_coord(&uf_coord->base_coord);*/
54532 +               assert("vs-1316", coord_extension_is_ok(uf_coord));
54533 +               result = append_one_block(uf_coord, key, block);
54534 +               if (result && inode != NULL)
54535 +                       DQUOT_FREE_BLOCK(inode, 1);
54536 +               *created = 1;
54537 +               break;
54538 +
54539 +       case OVERWRITE_ITEM:
54540 +               /* FIXME: item plugin should be initialized
54541 +                  item_plugin_by_coord(&uf_coord->base_coord);*/
54542 +               assert("vs-1316", coord_extension_is_ok(uf_coord));
54543 +               result = overwrite_one_block(uf_coord, key, block, created, inode);
54544 +               break;
54545 +
54546 +       default:
54547 +               assert("vs-1346", 0);
54548 +               result = RETERR(-E_REPEAT);
54549 +               break;
54550 +       }
54551 +
54552 +       ENABLE_NODE_CHECK;
54553 +
54554 +       ON_DEBUG(check_make_extent_result(result, mode, key, uf_coord->lh, *block));
54555 +
54556 +       return result;
54557 +}
54558 +
54559 +/* thin wrapper: passes @mapping, @f and @hint to item_balance_dirty_pages() with both extra arguments 0. Longterm znode
54560 +   lock is dropped before balance_dirty_pages, which may close the transaction, so stat data is updated if necessary */
54561 +static int
54562 +extent_balance_dirty_pages(struct address_space *mapping, const flow_t *f,
54563 +                          hint_t *hint)
54564 +{
54565 +       return item_balance_dirty_pages(mapping, f, hint, 0, 0/* do not set hint */);
54566 +}
54567 +
54568 +/* estimate and reserve the space one iteration of extent write (one page of
54569 +   file) may require. Returns what reiser4_grab_space() returns */
54570 +static int
54571 +reserve_extent_write_iteration(struct inode *inode, reiser4_tree *tree)
54572 +{
54573 +       grab_space_enable();
54574 +
54575 +       /* one unformatted node and one insertion into tree and one stat data
54576 +          update may be involved. Hans removed reservation for balancing
54577 +          here. If extent items will be ever used by plugins other than unix
54578 +          file plugin - estimate update should instead be taken by
54579 +          inode_file_plugin(inode)->estimate.update(inode) */
54580 +
54581 +       return reiser4_grab_space(1 + estimate_update_common(inode),
54582 +                                 0/* flags */);
54583 +}
54584 +
54585 +static void
54586 +write_move_coord(coord_t *coord, uf_coord_t *uf_coord, write_mode_t mode, int full_page)
54587 +{
54588 +       extent_coord_extension_t *ext_coord;
54589 +       /* advance the cached position after a write: always in append mode, otherwise only when @full_page is set */
54590 +       assert("vs-1339", ergo(mode == OVERWRITE_ITEM, coord->between == AT_UNIT));
54591 +       assert("vs-1341", ergo(mode == FIRST_ITEM, uf_coord->valid == 0));
54592 +
54593 +       if (uf_coord->valid == 0) /* the coord is not valid: there is nothing to advance */
54594 +               return;
54595 +
54596 +       ext_coord = &uf_coord->extension.extent;
54597 +
54598 +       if (mode == APPEND_ITEM) {
54599 +               assert("vs-1340", coord->between == AFTER_UNIT);
54600 +               assert("vs-1342", coord->unit_pos == ext_coord->nr_units - 1);
54601 +               assert("vs-1343", ext_coord->pos_in_unit == ext_coord->width - 2); /* unit was widened by one block */
54602 +               assert("vs-1344", state_of_extent(ext_by_ext_coord(uf_coord)) == UNALLOCATED_EXTENT);
54603 +               ON_DEBUG(ext_coord->extent = *ext_by_ext_coord(uf_coord));
54604 +               ext_coord->pos_in_unit ++; /* now at the last block of the unit */
54605 +               if (!full_page)
54606 +                       coord->between = AT_UNIT;
54607 +               return;
54608 +       }
54609 +
54610 +       assert("vs-1345", coord->between == AT_UNIT);
54611 +
54612 +       if (!full_page) /* the position is kept when the page was not written in full */
54613 +               return;
54614 +       if (ext_coord->pos_in_unit == ext_coord->width - 1) {
54615 +               /* last position in the unit */
54616 +               if (coord->unit_pos == ext_coord->nr_units - 1) {
54617 +                       /* last unit in the item */
54618 +                       uf_coord->valid = 0;
54619 +               } else {
54620 +                       /* move to the next unit */
54621 +                       coord->unit_pos ++;
54622 +                       ext_coord->ext_offset += sizeof(reiser4_extent);
54623 +                       ON_DEBUG(ext_coord->extent = *ext_by_offset(coord->node, ext_coord->ext_offset));
54624 +                       ext_coord->width = extent_get_width(ext_by_offset(coord->node, ext_coord->ext_offset));
54625 +                       ext_coord->pos_in_unit = 0;
54626 +               }
54627 +       } else
54628 +               ext_coord->pos_in_unit ++;
54629 +}
54630 +
54631 +/* call set_hint with flow's key if coord of @hint is valid, unset_hint otherwise; node is unlocked in either case */
54632 +static void
54633 +set_hint_unlock_node(hint_t *hint, flow_t *f, znode_lock_mode mode)
54634 +{
54635 +       if (!hint->coord.valid)
54636 +               unset_hint(hint);
54637 +       else
54638 +               set_hint(hint, &f->key, mode);
54639 +       longterm_unlock_znode(hint->coord.lh);
54640 +}
54641 +
54642 +/* return 0 if write covers whole block, or starts at page beginning and ends at or after i_size; 1 otherwise */
54643 +static int
54644 +write_is_partial(struct inode *inode, loff_t file_off, unsigned page_off, unsigned count)
54645 +{
54646 +       if (count == inode->i_sb->s_blocksize)
54647 +               return 0;
54648 +       /* partial unless write starts at page boundary and reaches current end of file */
54649 +       return page_off != 0 || file_off + count < inode->i_size;
54650 +}
54651 +
54652 +/* zero the parts of @page not covered by write: bytes before @from and bytes from @from + @count to the end of page */
54653 +static void
54654 +zero_around(struct page *page, int from, int count)
54655 +{
54656 +       char *kaddr = kmap_atomic(page, KM_USER0);
54657 +       char *tail = kaddr + from + count;
54658 +
54659 +       memset(kaddr, 0, from);
54660 +       memset(tail, 0, PAGE_CACHE_SIZE - from - count);
54661 +       flush_dcache_page(page);
54662 +       kunmap_atomic(kaddr, KM_USER0);
54663 +}
54664 +
54665 +/* write flow's data into file by pages. Each iteration of the loop handles one page: space is grabbed (unless
54666 +   @grabbed is set), extent pointer is made by make_extent, jnode and its page are obtained, user data are copied
54667 +   into the page, jnode is captured and made dirty, writer is throttled. Loop ends when flow is empty, when coord
54668 +   of @hint gets invalid or on the first non-zero result. Returns 0 on success, non-zero result of the failed step
54669 +   otherwise. Note that flow and hint are moved forward before user data are copied (see FIXME below) */
54670 +static int
54671 +extent_write_flow(struct inode *inode, flow_t *flow, hint_t *hint,
54672 +                 int grabbed, /* 0 if space for operation is not reserved yet, 1 - otherwise */
54673 +                 write_mode_t mode)
54674 +{
54675 +       int result;
54676 +       loff_t file_off;
54677 +       unsigned long page_nr;
54678 +       unsigned long page_off, count;
54679 +       struct page *page;
54680 +       jnode *j;
54681 +       uf_coord_t *uf_coord;
54682 +       coord_t *coord;
54683 +       oid_t oid;
54684 +       reiser4_tree *tree;
54685 +       reiser4_key page_key;
54686 +       reiser4_block_nr blocknr;
54687 +       int created;
54688 +
54689 +       assert("nikita-3139", !inode_get_flag(inode, REISER4_NO_SD));
54690 +       assert("vs-885", current_blocksize == PAGE_CACHE_SIZE);
54691 +       assert("vs-700", flow->user == 1);
54692 +       assert("vs-1352", flow->length > 0);
54693 +
54694 +       tree = tree_by_inode(inode);
54695 +       oid = get_inode_oid(inode);
54696 +       uf_coord = &hint->coord;
54697 +       coord = &uf_coord->base_coord;
54698 +
54699 +       /* position in a file to start write from */
54700 +       file_off = get_key_offset(&flow->key);
54701 +       /* index of page containing that offset */
54702 +       page_nr = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
54703 +       /* offset within the page */
54704 +       page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
54705 +
54706 +       /* key of first byte of page */
54707 +       page_key = flow->key;
54708 +       set_key_offset(&page_key, (loff_t)page_nr << PAGE_CACHE_SHIFT);
54709 +       do {
54710 +               if (!grabbed) {
54711 +                       result = reserve_extent_write_iteration(inode, tree);
54712 +                       if (result)
54713 +                               break;
54714 +               }
54715 +               /* number of bytes to be written to page */
54716 +               count = PAGE_CACHE_SIZE - page_off;
54717 +               if (count > flow->length)
54718 +                       count = flow->length;
54719 +
54720 +               write_page_log(inode->i_mapping, page_nr);
54721 +               result = make_extent(&page_key, uf_coord, mode, &blocknr, &created, inode/* check quota */);
54722 +               if (result)
54723 +                       goto exit1;
54724 +
54725 +               /* look for jnode and create it if it does not exist yet */
54726 +               j = find_get_jnode(tree, inode->i_mapping, oid, page_nr);
54727 +               if (IS_ERR(j)) {
54728 +                       result = PTR_ERR(j);
54729 +                       goto exit1;
54730 +               }
54731 +               LOCK_JNODE(j);
54732 +               if (created) {
54733 +                       /* extent corresponding to this jnode was just created */
54734 +                       assert("vs-1504", *jnode_get_block(j) == 0);
54735 +                       JF_SET(j, JNODE_CREATED);
54736 +                       /* new block is added to file. Update inode->i_blocks and inode->i_bytes. FIXME:
54737 +                          inode_set/get/add/sub_bytes is used to be called by quota macros */
54738 +                       /*inode_add_bytes(inode, PAGE_CACHE_SIZE);*/
54739 +               }
54740 +               if (*jnode_get_block(j) == 0) {
54741 +                       jnode_set_block(j, &blocknr);
54742 +               } else {
54743 +                       assert("vs-1508", !blocknr_is_fake(&blocknr));
54744 +                       assert("vs-1507", ergo(blocknr, *jnode_get_block(j) == blocknr));
54745 +               }
54746 +               UNLOCK_JNODE(j);
54747 +
54748 +               /* get page locked and attached to jnode */
54749 +               page = jnode_get_page_locked(j, GFP_KERNEL);
54750 +               if (IS_ERR(page)) {
54751 +                       result = PTR_ERR(page);
54752 +                       goto exit2;
54753 +               }
54754 +               page_cache_get(page);
54755 +
54756 +               if (!PageUptodate(page)) {
54757 +                       if (mode == OVERWRITE_ITEM) {
54758 +                               /* this page may be either an anonymous page (a page which was dirtied via mmap,
54759 +                                  writepage-ed and for which extent pointer was just created. In this case jnode is
54760 +                                  eflushed) or correspond to not page cached block (in which case created == 0). In
54761 +                                  either case we have to read this page if it is being overwritten partially */
54762 +                               if (write_is_partial(inode, file_off, page_off, count) &&
54763 +                                   (created == 0 || JF_ISSET(j, JNODE_EFLUSH))) {
54764 +                                       result = page_io(page, j, READ, GFP_KERNEL);
54765 +                                       if (result)
54766 +                                               goto exit3;
54767 +                                       lock_page(page);
54768 +                                       if (!PageUptodate(page)) {
54769 +                                               /* read did not make the page uptodate: fail the write instead of returning 0 */
54770 +                                               result = RETERR(-EIO);
54771 +                                               goto exit3;
54772 +                                       }
54773 +                               } else {
54774 +                                       zero_around(page, page_off, count);
54775 +                               }
54776 +                       } else {
54777 +                               /* new page added to the file. No need to carry about data it might contain. Zero
54778 +                                  content of new page around write area */
54779 +                               assert("vs-1681", !JF_ISSET(j, JNODE_EFLUSH));
54780 +                               zero_around(page, page_off, count);
54781 +                       }
54782 +               }
54783 +
54784 +               UNDER_SPIN_VOID(jnode, j, eflush_del(j, 1));
54785 +               move_flow_forward(flow, count);
54786 +               write_move_coord(coord, uf_coord, mode, page_off + count == PAGE_CACHE_SIZE);
54787 +               set_hint_unlock_node(hint, flow, ZNODE_WRITE_LOCK);
54788 +
54789 +               assert("vs-1503", UNDER_SPIN(jnode, j, (!JF_ISSET(j, JNODE_EFLUSH) && jnode_page(j) == page)));
54790 +               assert("nikita-3033", schedulable());
54791 +               if (!lock_stack_isclean(get_current_lock_stack()))
54792 +                       print_clog();
54793 +               assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
54794 +
54795 +               /* copy user data into page */
54796 +               result = __copy_from_user((char *)kmap(page) + page_off, flow->data - count, count);
54797 +               kunmap(page);
54798 +               if (unlikely(result)) {
54799 +                       /* FIXME: write(fd, 0, 10); to empty file will write no data but file will get increased
54800 +                          size. */
54801 +                       result = RETERR(-EFAULT);
54802 +                       goto exit3;
54803 +               }
54804 +
54805 +               set_page_dirty_internal(page, 0);
54806 +               SetPageUptodate(page);
54807 +               if (!PageReferenced(page))
54808 +                       SetPageReferenced(page);
54809 +               unlock_page(page);
54810 +               page_cache_release(page);
54811 +
54812 +               /* FIXME: possible optimization: if jnode is not dirty yet - it gets into clean list in try_capture and
54813 +                  then in jnode_mark_dirty gets moved to dirty list. So, it would be more optimal to put jnode directly
54814 +                  to dirty list */
54815 +               LOCK_JNODE(j);
54816 +               result = try_capture(j, ZNODE_WRITE_LOCK, 0, 1/* can_coc */);
54817 +               if (result) {
54818 +                       /* jnode spin lock taken above has to be dropped on this path as well */
54819 +                       UNLOCK_JNODE(j);
54820 +                       goto exit2;
54821 +               }
54822 +               jnode_make_dirty_locked(j);
54823 +               UNLOCK_JNODE(j);
54824 +               jput(j);
54825 +
54826 +               /* throttle the writer */
54827 +               result = extent_balance_dirty_pages(inode->i_mapping, flow, hint);
54828 +               if (!grabbed)
54829 +                       all_grabbed2free();
54830 +               if (result) {
54831 +                       reiser4_stat_inc(extent.bdp_caused_repeats);
54832 +                       break;
54833 +               }
54834 +
54835 +               page_off = 0;
54836 +               page_nr ++;
54837 +               file_off += count;
54838 +               set_key_offset(&page_key, (loff_t)page_nr << PAGE_CACHE_SHIFT);
54839 +               continue;
54840 +
54841 +       exit3:
54842 +               unlock_page(page);
54843 +               page_cache_release(page);
54844 +       exit2:
54845 +               if (created)
54846 +                       inode_sub_bytes(inode, PAGE_CACHE_SIZE);
54847 +               jput(j);
54848 +       exit1:
54849 +               if (!grabbed)
54850 +                       all_grabbed2free();
54851 +               break;
54852 +
54853 +               /* hint is unset by make_page_extent when first extent of a file was inserted: in that case we
54854 +                  can not use coord anymore because we are to continue on twig level but are still at leaf
54855 +                  level */
54856 +       } while (flow->length && uf_coord->valid == 1);
54857 +
54858 +       return result;
54859 +}
54860 +
54861 +/* reserve space which may be required for appending a hole stored in extent to a file; returns reiser4_grab_space result */
54862 +static int
54863 +extent_hole_reserve(reiser4_tree *tree)
54864 +{
54865 +       grab_space_enable();
54866 +       /* two insertions into item are accounted: hole unit added to extent item and stat data update */
54867 +       return reiser4_grab_space(2 * estimate_one_insert_into_item(tree), 0 /* flags */);
54868 +}
54869 +/* expanding truncate: add a hole so that file ends at flow's key offset + flow's length, then update i_size, times and stat data */
54870 +static int
54871 +extent_write_hole(struct inode *inode, flow_t *flow, hint_t *hint, int grabbed)
54872 +{
54873 +       int result;
54874 +       loff_t new_size;
54875 +       coord_t *coord;
54876 +
54877 +       coord = &hint->coord.base_coord;
54878 +       if (!grabbed) {
54879 +               result = extent_hole_reserve(znode_get_tree(coord->node));
54880 +               if (result)
54881 +                       return result;
54882 +       }
54883 +       /* new end of file; flow's key is moved there and the whole flow is consumed before add_hole is called */
54884 +       new_size = get_key_offset(&flow->key) + flow->length;
54885 +       set_key_offset(&flow->key, new_size);
54886 +       flow->length = 0;
54887 +       result = add_hole(coord, hint->coord.lh, &flow->key);
54888 +       hint->coord.valid = 0; /* coord is marked invalid whether add_hole succeeded or not */
54889 +       if (!result) {
54890 +               done_lh(hint->coord.lh); /* NOTE(review): done_lh is called on success only - confirm failure path is covered by add_hole or the caller */
54891 +               INODE_SET_FIELD(inode, i_size, new_size);
54892 +               inode->i_ctime = inode->i_mtime = CURRENT_TIME;
54893 +               result = reiser4_update_sd(inode);
54894 +       }
54895 +       if (!grabbed)
54896 +               all_grabbed2free(); /* called only when reservation was made here by extent_hole_reserve */
54897 +       return result;
54898 +}
54899 +
54900 +/*
54901 +  plugin->s.file.write
54902 +
54903 +  It is called in two modes, told apart by @flow->data:
54904 +  1. real write (@flow->data != 0) - data from flow are written to the file by extent_write_flow
54905 +  2. expanding truncate (@flow->data == 0) - hole is appended to the file by extent_write_hole
54906 +
54907 +  @grabbed: extent's write may be called from plain unix file write and from tail conversion. In first case
54908 +  (@grabbed == 0) space is not reserved forehand, so, it must be done here. When it is being called from tail
54909 +  conversion - space is reserved already for whole operation which may involve several calls to item write. In
54910 +  this case space reservation will not be done here
54911 +*/
54912 +reiser4_internal int
54913 +write_extent(struct inode *inode, flow_t *flow, hint_t *hint, int grabbed, write_mode_t mode)
54914 +{
54915 +       if (!flow->data)
54916 +               /* expanding truncate. add_hole requires flow->key to be set to new end of file */
54917 +               return extent_write_hole(inode, flow, hint, grabbed);
54918 +
54919 +       /* real write */
54920 +       return extent_write_flow(inode, flow, hint, grabbed, mode);
54921 +}
54922 +
54923 +/* move coord one page forward. Return 1 if there is no next page in the item (coord is not changed then), 0 otherwise */
54924 +static int
54925 +read_move_coord(coord_t *coord, extent_coord_extension_t *ext_coord)
54926 +{
54927 +       if (ext_coord->pos_in_unit != ext_coord->width - 1) {
54928 +               /* not the last position in the unit */
54929 +               ext_coord->pos_in_unit ++;
54930 +               return 0;
54931 +       }
54932 +       if (coord->unit_pos == ext_coord->nr_units - 1)
54933 +               /* last position in the last unit of the item */
54934 +               return 1;
54935 +
54936 +       /* move to the next unit */
54937 +       coord->unit_pos ++;
54938 +       ext_coord->ext_offset += sizeof(reiser4_extent);
54939 +       ON_DEBUG(ext_coord->extent = *ext_by_offset(coord->node, ext_coord->ext_offset));
54940 +       ext_coord->width = extent_get_width(ext_by_offset(coord->node, ext_coord->ext_offset));
54941 +       ext_coord->pos_in_unit = 0;
54942 +       return 0;
54943 +}
54944 +
54945 +/* call page_cache_readahead for @page_nr. While it runs, fsdata of @file refers to a copy of @uf_coord with expected_page set to @page_nr */
54946 +static void
54947 +call_page_cache_readahead(struct address_space *mapping, struct file *file, unsigned long page_nr,
54948 +                         const uf_coord_t *uf_coord)
54949 +{
54950 +       reiser4_file_fsdata *fsdata = reiser4_get_file_fsdata(file);
54951 +       uf_coord_t ra_coord = *uf_coord;
54952 +
54953 +       /* the copy lives on stack, so reference to it is dropped as soon as readahead returns */
54954 +       ra_coord.extension.extent.expected_page = page_nr;
54955 +       fsdata->reg.coord = &ra_coord;
54956 +
54957 +       page_cache_readahead(mapping, &file->f_ra, file, page_nr);
54958 +       fsdata->reg.coord = NULL;
54959 +}
54960 +
54961 +#if REISER4_TRACE
54962 +/* print @s, key of the item, number of units, current extent and position within it for @uf_coord */
54963 +static void
54964 +print_ext_coord(const char *s, uf_coord_t *uf_coord)
54965 +{
54966 +       reiser4_key item_key;
54967 +       reiser4_extent *unit;
54968 +       extent_coord_extension_t *ext_coord = &uf_coord->extension.extent;
54969 +
54970 +       item_key_by_coord(&uf_coord->base_coord, &item_key);
54971 +       unit = ext_by_ext_coord(uf_coord);
54972 +       printk("%s: item key [%llu, %llu], nr_units %d, cur extent [%llu, %llu], unit_pos %d, pos_in_unit %Lu\n",
54973 +              s,
54974 +              get_key_objectid(&item_key), get_key_offset(&item_key), ext_coord->nr_units,
54975 +              extent_get_start(unit), extent_get_width(unit),
54976 +              uf_coord->base_coord.unit_pos, ext_coord->pos_in_unit);
54977 +}
54978 +#endif
54979 +
54980 +#if REISER4_DEBUG
54981 +
54982 +/* return 1 if offset @off is inside of extent unit pointed to by @coord, 0 otherwise. Unit covers offsets from
54983 +   offset of unit's key up to (not including) that plus current_blocksize * width of extent. @coord is not changed */
54984 +static int
54985 +offset_is_in_unit(const coord_t *coord, loff_t off)
54986 +{
54987 +       reiser4_key unit_key;
54988 +       __u64 unit_off;
54989 +       reiser4_extent *ext;
54990 +
54991 +       ext = extent_by_coord(coord);
54992 +
54993 +       unit_key_extent(coord, &unit_key);
54994 +       unit_off = get_key_offset(&unit_key);
54995 +       if (off < unit_off) /* NOTE(review): @off is loff_t while unit_off is __u64 - presumably compared as unsigned, confirm */
54996 +               return 0;
54997 +       if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
54998 +               return 0;
54999 +       return 1;
55000 +}
55001 +/* assert that @key is not less than item key and is less than key set by append_key_extent; return offset_is_in_unit for @key's offset */
55002 +static int
55003 +coord_matches_key_extent(const coord_t *coord, const reiser4_key *key)
55004 +{
55005 +       reiser4_key item_key; /* scratch key, filled anew by each of the two asserts below */
55006 +
55007 +       assert("vs-771", coord_is_existing_unit(coord));
55008 +       assert("vs-1258", keylt(key, append_key_extent(coord, &item_key)));
55009 +       assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key)));
55010 +
55011 +       return offset_is_in_unit(coord, get_key_offset(key));
55012 +}
55013 +
55014 +#endif /* REISER4_DEBUG */
55015 +
55016 +/* Implements plugin->u.item.s.file.read operation for extent items. Data are copied to user space page by page
55017 +   until flow is empty, coord of @hint gets invalid or hint can not be validated. Returns 0, error of
55018 +   read_cache_page, -EIO if a page did not get uptodate, or -EFAULT if copying to user space failed */
55019 +reiser4_internal int
55020 +read_extent(struct file *file, flow_t *flow,  hint_t *hint)
55021 +{
55022 +       int result;
55023 +       struct page *page;
55024 +       unsigned long page_nr;
55025 +       unsigned long page_off, count;
55026 +       struct inode *inode;
55027 +       __u64 file_off;
55028 +       uf_coord_t *uf_coord;
55029 +       coord_t *coord;
55030 +       extent_coord_extension_t *ext_coord;
55031 +
55032 +       uf_coord = &hint->coord;
55033 +       assert("vs-1318", coord_extension_is_ok(uf_coord));
55034 +
55035 +       inode = file->f_dentry->d_inode;
55036 +       coord = &uf_coord->base_coord;
55037 +       ext_coord = &uf_coord->extension.extent;
55038 +
55039 +       ON_TRACE(TRACE_EXTENTS, "read_extent start: ino %llu, size %llu, offset %llu, count %lld\n",
55040 +                get_inode_oid(inode), inode->i_size, get_key_offset(&flow->key), flow->length);
55041 +       IF_TRACE(TRACE_EXTENTS, print_ext_coord("read_extent start", uf_coord));
55042 +
55043 +       assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE);
55044 +       assert("vs-572", flow->user == 1);
55045 +       assert("vs-1351", flow->length > 0);
55046 +       assert("vs-1119", znode_is_rlocked(coord->node));
55047 +       assert("vs-1120", znode_is_loaded(coord->node));
55048 +       assert("vs-1256", coord_matches_key_extent(coord, &flow->key));
55049 +
55050 +       /* offset in a file to start read from */
55051 +       file_off = get_key_offset(&flow->key);
55052 +       /* index of page containing that offset */
55053 +       page_nr = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
55054 +       /* offset within the page */
55055 +       page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
55056 +
55057 +       count = PAGE_CACHE_SIZE - page_off;
55058 +
55059 +       do {
55060 +               call_page_cache_readahead(inode->i_mapping, file, page_nr, uf_coord);
55061 +
55062 +               /* this will return page if it exists and is uptodate, otherwise it will allocate page and call
55063 +                  readpage_extent to fill it. readpage_extent takes its argument as uf_coord_t, so pass @uf_coord */
55064 +               page = read_cache_page(inode->i_mapping, page_nr, readpage_extent, uf_coord);
55065 +               if (IS_ERR(page))
55066 +                       return PTR_ERR(page);
55067 +
55068 +               /* number of bytes which can be read from the page */
55069 +               if (count > flow->length)
55070 +                       count = flow->length;
55071 +               move_flow_forward(flow, count);
55072 +               if (page_off + count == PAGE_CACHE_SIZE)
55073 +                       if (read_move_coord(coord, ext_coord))
55074 +                               uf_coord->valid = 0;
55075 +               set_hint_unlock_node(hint, flow, ZNODE_READ_LOCK);
55076 +
55077 +               wait_on_page_locked(page);
55078 +               if (!PageUptodate(page)) {
55079 +                       page_detach_jnode(page, inode->i_mapping, page_nr);
55080 +                       page_cache_release(page);
55081 +                       warning("jmacd-97178", "extent_read: page is not up to date");
55082 +                       return RETERR(-EIO);
55083 +               }
55084 +
55085 +               /* If users can be writing to this page using arbitrary virtual addresses, take care about potential
55086 +                  aliasing before reading the page on the kernel side.
55087 +               */
55088 +               if (mapping_writably_mapped(inode->i_mapping))
55089 +                       flush_dcache_page(page);
55090 +               assert("nikita-3034", schedulable());
55091 +
55092 +               /* AUDIT: We must page-in/prepare user area first to avoid deadlocks */
55093 +               result = __copy_to_user(flow->data - count, (char *)kmap(page) + page_off, count);
55094 +               kunmap(page);
55095 +               page_cache_release(page);
55096 +               if (unlikely(result))
55097 +                       return RETERR(-EFAULT);
55098 +
55099 +               /* if hint can not be validated the loop stops: 0 is returned and the rest of flow is left unread */
55100 +               result = hint_validate(hint, &flow->key, 0/* do not check key */, ZNODE_READ_LOCK);
55101 +               if (result)
55102 +                       break;
55103 +               assert("vs-1318", coord_extension_is_ok(uf_coord));
55104 +               assert("vs-1263", coord_matches_key_extent(coord, &flow->key));
55105 +               page_off = 0;
55106 +               page_nr ++;
55107 +               count = PAGE_CACHE_SIZE;
55108 +       } while (flow->length && uf_coord->valid == 1);
55109 +
55110 +       ON_TRACE(TRACE_EXTENTS, "read_extent done: left %lld\n", flow->length);
55111 +       IF_TRACE(TRACE_EXTENTS, print_ext_coord("read_extent done", uf_coord));
55112 +
55113 +       return 0;
55114 +}
55115 +/* move @coord and @ext_coord @count pages forward. Return 1 (coord->between set to AFTER_UNIT) if that is beyond the last unit, 0 otherwise */
55116 +static int
55117 +move_coord_pages(coord_t *coord, extent_coord_extension_t *ext_coord, unsigned count)
55118 +{
55119 +       reiser4_extent *ext;
55120 +
55121 +       ext_coord->expected_page += count; /* done unconditionally, even if 1 is returned below */
55122 +
55123 +       ext = ext_by_offset(coord->node, ext_coord->ext_offset);
55124 +
55125 +       do {
55126 +               if (ext_coord->pos_in_unit + count < ext_coord->width) {
55127 +                       ext_coord->pos_in_unit += count; /* target page is within current unit */
55128 +                       break;
55129 +               }
55130 +               /* target page is not in current unit: give up if there is no next unit */
55131 +               if (coord->unit_pos == ext_coord->nr_units - 1) {
55132 +                       coord->between = AFTER_UNIT;
55133 +                       return 1;
55134 +               }
55135 +
55136 +               /* shift to next unit */
55137 +               count -= (ext_coord->width - ext_coord->pos_in_unit); /* pages between current position and start of next unit */
55138 +               coord->unit_pos ++;
55139 +               ext_coord->pos_in_unit = 0;
55140 +               ext_coord->ext_offset += sizeof(reiser4_extent);
55141 +               ext ++;
55142 +               ON_DEBUG(ext_coord->extent = *ext);
55143 +               ext_coord->width = extent_get_width(ext);
55144 +       } while (1);
55145 +
55146 +       return 0;
55147 +}
55148 +
55149 +/* fill @page with zeros, mark it uptodate and unlock it */
55150 +static inline void
55151 +zero_page(struct page *page)
55152 +{
55153 +       char *data = kmap_atomic(page, KM_USER0);
55154 +       xmemset(data, 0, PAGE_CACHE_SIZE);
55155 +       flush_dcache_page(page);
55156 +       kunmap_atomic(data, KM_USER0);
55157 +       SetPageUptodate(page);
55158 +       unlock_page(page);
55159 +}
55160 +/* start read of @page addressed by extent unit @ext; @pos is position of page's block within the unit. Hole page without jnode is zeroed instead */
55161 +static int
55162 +do_readpage_extent(reiser4_extent *ext, reiser4_block_nr pos, struct page *page)
55163 +{
55164 +       jnode *j;
55165 +       struct address_space *mapping;
55166 +       unsigned long index;
55167 +       oid_t oid;
55168 +
55169 +       mapping = page->mapping;
55170 +       oid = get_inode_oid(mapping->host); /* NOTE(review): oid is not used below */
55171 +       index = page->index;
55172 +
55173 +       switch (state_of_extent(ext)) {
55174 +       case HOLE_EXTENT:
55175 +               /*
55176 +                * it is possible to have hole page with jnode, if page was
55177 +                * eflushed previously.
55178 +                */
55179 +               j = jfind(mapping, index);
55180 +               if (j == NULL) {
55181 +                       zero_page(page); /* zero_page also marks the page uptodate and unlocks it */
55182 +                       return 0;
55183 +               }
55184 +               LOCK_JNODE(j);
55185 +               if (!jnode_page(j)) {
55186 +                       jnode_attach_page(j, page);
55187 +               } else {
55188 +                       BUG_ON(jnode_page(j) != page);
55189 +                       assert("vs-1504", jnode_page(j) == page);
55190 +               }
55191 +
55192 +               UNLOCK_JNODE(j);
55193 +               break;
55194 +
55195 +       case ALLOCATED_EXTENT:
55196 +               j = jnode_of_page(page);
55197 +               if (IS_ERR(j))
55198 +                       return PTR_ERR(j);
55199 +               if (*jnode_get_block(j) == 0) {
55200 +                       reiser4_block_nr blocknr;
55201 +
55202 +                       blocknr = extent_get_start(ext) + pos; /* block number is start of the extent plus position within it */
55203 +                       jnode_set_block(j, &blocknr);
55204 +               } else
55205 +                       assert("vs-1403", j->blocknr == extent_get_start(ext) + pos);
55206 +               break;
55207 +
55208 +       case UNALLOCATED_EXTENT:
55209 +               j = jfind(mapping, index);
55210 +               assert("nikita-2688", j);
55211 +               assert("vs-1426", jnode_page(j) == NULL);
55212 +
55213 +               UNDER_SPIN_VOID(jnode, j, jnode_attach_page(j, page));
55214 +
55215 +               /* page is locked, it is safe to check JNODE_EFLUSH */
55216 +               assert("vs-1668", JF_ISSET(j, JNODE_EFLUSH));
55217 +               break;
55218 +
55219 +       default:
55220 +               warning("vs-957", "extent_readpage: wrong extent\n");
55221 +               return RETERR(-EIO);
55222 +       }
55223 +
55224 +       BUG_ON(j == 0);
55225 +       page_io(page, j, READ, GFP_NOIO); /* NOTE(review): result of page_io is ignored, 0 is returned regardless - confirm this is intended */
55226 +       jput(j);
55227 +       return 0;
55228 +}
55229 +/* filler passed to read_cache_pages by readpages_extent: @vp is uf_coord_t, its expected_page is compared with index of @page. @page is unlocked on -EINVAL */
55230 +static int
55231 +readahead_readpage_extent(void *vp, struct page *page)
55232 +{
55233 +       int result;
55234 +       uf_coord_t *uf_coord;
55235 +       coord_t *coord;
55236 +       extent_coord_extension_t *ext_coord;
55237 +
55238 +       uf_coord = vp;
55239 +       coord = &uf_coord->base_coord;
55240 +       /* coord which is not at unit (move_coord_pages sets AFTER_UNIT when it runs out of units) can not be used */
55241 +       if (coord->between != AT_UNIT) {
55242 +               unlock_page(page);
55243 +               return RETERR(-EINVAL);
55244 +       }
55245 +
55246 +       ext_coord = &uf_coord->extension.extent;
55247 +       if (ext_coord->expected_page != page->index) {
55248 +               /* read_cache_pages skipped few pages. Try to adjust coord to page */
55249 +               assert("vs-1269", page->index > ext_coord->expected_page);
55250 +               if (move_coord_pages(coord, ext_coord,  page->index - ext_coord->expected_page)) {
55251 +                       /* extent pointing to this page is not here */
55252 +                       unlock_page(page);
55253 +                       return RETERR(-EINVAL);
55254 +               }
55255 +
55256 +               assert("vs-1274", offset_is_in_unit(coord,
55257 +                                                   (loff_t)page->index << PAGE_CACHE_SHIFT));
55258 +               ext_coord->expected_page = page->index; /* move_coord_pages has already advanced expected_page by the same amount */
55259 +       }
55260 +
55261 +       assert("vs-1281", page->index == ext_coord->expected_page);
55262 +       result = do_readpage_extent(ext_by_ext_coord(uf_coord), ext_coord->pos_in_unit, page);
55263 +       if (!result)
55264 +               move_coord_pages(coord, ext_coord, 1); /* step to the next page for the following call; return value is not checked */
55265 +       return result;
55266 +}
55267 +
55268 +/* plugin->u.item.s.file.readpages: pass @pages to read_cache_pages with readahead_readpage_extent as filler and @vp as its argument. Nothing is done if @vp is 0 */
55269 +reiser4_internal void
55270 +readpages_extent(void *vp, struct address_space *mapping, struct list_head *pages)
55271 +{
55272 +       if (!vp)
55273 +               return;
55274 +
55275 +       read_cache_pages(mapping, pages, readahead_readpage_extent, vp);
55276 +}
55277 +
55278 +/*
55279 +   plugin->s.file.readpage
55280 +   reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage
55281 +   or
55282 +   filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_extent
55283 +   @vp is pointer to uf_coord_t. Result of do_readpage_extent is returned.
55284 +   At the beginning: coord->node is read locked, zloaded, page is locked and not uptodate,
55285 +   coord is set to existing unit inside of extent item (it is not necessary that coord matches to page->index)
55286 +*/
55287 +reiser4_internal int
55288 +readpage_extent(void *vp, struct page *page)
55289 +{
55290 +       uf_coord_t *uf_coord = vp;
55291 +       ON_DEBUG(coord_t *coord = &uf_coord->base_coord);
55292 +       ON_DEBUG(reiser4_key key);
55293 +
55294 +       assert("vs-1040", PageLocked(page));
55295 +       assert("vs-1050", !PageUptodate(page));
55296 +       assert("vs-757", !jprivate(page) && !PagePrivate(page));
55297 +       assert("vs-1039", page->mapping && page->mapping->host);
55298 +
55299 +       assert("vs-1044", znode_is_loaded(coord->node));
55300 +       assert("vs-758", item_is_extent(coord));
55301 +       assert("vs-1046", coord_is_existing_unit(coord));
55302 +       assert("vs-1045", znode_is_rlocked(coord->node));
55303 +       assert("vs-1047", page->mapping->host->i_ino == get_key_objectid(item_key_by_coord(coord, &key))); /* NOTE(review): i_ino is compared here while capture_extent compares get_inode_oid() - confirm */
55304 +       assert("vs-1320", coord_extension_is_ok(uf_coord));
55305 +       /* page is read at the position recorded in the coord extension */
55306 +       return do_readpage_extent(ext_by_ext_coord(uf_coord), uf_coord->extension.extent.pos_in_unit, page);
55307 +}
55308 +
55309 +/*
55310 +  plugin->s.file.capture
55311 +
55312 +  At the beginning: coord.node is write locked, zloaded, page is not locked, coord is set to an existing unit inside
55313 +  of the extent item. uf_coord->lh is released (done_lh) on every return path. Returns 0 on success or the error
55314 +  reported by make_extent() / jnode_of_page(); a failure of try_capture() ends in reiser4_panic(). */
55315 +reiser4_internal int
55316 +capture_extent(reiser4_key *key, uf_coord_t *uf_coord, struct page *page, write_mode_t mode)
55317 +{
55318 +       jnode *j;
55319 +       int result;
55320 +       reiser4_block_nr blocknr; /* filled in by make_extent() */
55321 +       int created; /* filled in by make_extent(); non-zero is treated below as "extent was just created" */
55322 +       int check_quota;
55323 +
55324 +       ON_TRACE(TRACE_EXTENTS, "WP: index %lu, count %d..", page->index, page_count(page));
55325 +
55326 +       assert("vs-1051", page->mapping && page->mapping->host);
55327 +       assert("nikita-3139", !inode_get_flag(page->mapping->host, REISER4_NO_SD));
55328 +       assert("vs-864", znode_is_wlocked(uf_coord->base_coord.node));
55329 +       assert("vs-1398", get_key_objectid(key) == get_inode_oid(page->mapping->host));
55330 +
55331 +       /* FIXME: assume for now that quota is only checked on write */
55332 +       check_quota = 0; /* so make_extent() below is always passed a NULL inode */
55333 +       result = make_extent(key, uf_coord, mode, &blocknr, &created, check_quota ? page->mapping->host : NULL);
55334 +       if (result) {
55335 +               done_lh(uf_coord->lh);
55336 +               return result;
55337 +       }
55338 +       /* get the jnode of @page and mark the page dirty while holding the page lock */
55339 +       lock_page(page);
55340 +       j = jnode_of_page(page);
55341 +       if (IS_ERR(j)) {
55342 +               unlock_page(page);
55343 +               done_lh(uf_coord->lh);
55344 +               return PTR_ERR(j);
55345 +       }
55346 +       set_page_dirty_internal(page, 0);
55347 +       unlock_page(page);
55348 +
55349 +       LOCK_JNODE(j);
55350 +       if (created) {
55351 +               /* extent corresponding to this jnode was just created */
55352 +               assert("vs-1504", *jnode_get_block(j) == 0);
55353 +               JF_SET(j, JNODE_CREATED);
55354 +               /* new block is added to file. Update inode->i_blocks and inode->i_bytes. FIXME:
55355 +                  inode_set/get/add/sub_bytes used to be called by quota macros */
55356 +               inode_add_bytes(page->mapping->host, PAGE_CACHE_SIZE);
55357 +       }
55358 +       /* a jnode without a block number takes @blocknr; otherwise only assert that the two agree */
55359 +       if (*jnode_get_block(j) == 0)
55360 +               jnode_set_block(j, &blocknr);
55361 +       else {
55362 +               assert("vs-1508", !blocknr_is_fake(&blocknr));
55363 +               assert("vs-1507", ergo(blocknr, *jnode_get_block(j) == blocknr));
55364 +       }
55365 +       UNLOCK_JNODE(j);
55366 +
55367 +       done_lh(uf_coord->lh); /* done with the coord: release its lock handle before capturing the jnode */
55368 +
55369 +       LOCK_JNODE(j);
55370 +       result = try_capture(j, ZNODE_WRITE_LOCK, 0, 1/* can_coc */);
55371 +       if (result != 0)
55372 +               reiser4_panic("nikita-3324", "Cannot capture jnode: %i", result);
55373 +       jnode_make_dirty_locked(j);
55374 +       UNLOCK_JNODE(j);
55375 +       jput(j);
55376 +
55377 +       if (created)
55378 +               reiser4_update_sd(page->mapping->host);
55379 +               /* return value is ignored: warning about failure of this is issued already */
55380 +
55381 +       ON_TRACE(TRACE_EXTENTS, "OK\n");
55382 +       return 0;
55383 +}
55384 +
55385 +/*
55386 +  plugin->u.item.s.file.get_block: set bh->b_blocknr to the disk block that holds file block @block, or to 0 when
55387 +  the extent unit @coord is set to is not an allocated extent. Always returns 0. */
55388 +reiser4_internal int
55389 +get_block_address_extent(const coord_t *coord, sector_t block, struct buffer_head *bh)
55390 +{
55391 +       reiser4_extent *ext;
55392 +
55393 +       assert("vs-1321", coord_is_existing_unit(coord));
55394 +
55395 +       ext = extent_by_coord(coord);
55396 +
55397 +       if (state_of_extent(ext) != ALLOCATED_EXTENT)
55398 +               /* FIXME: bad things may happen if it is unallocated extent */
55399 +               bh->b_blocknr = 0;
55400 +       else {
55401 +               reiser4_key key;
55402 +               /* offset of the unit's key shifted by current_blocksize_bits is the first file block of the unit */
55403 +               unit_key_by_coord(coord, &key);
55404 +               assert("vs-1645", block >= get_key_offset(&key) >> current_blocksize_bits);
55405 +               assert("vs-1646", block < (get_key_offset(&key) >> current_blocksize_bits) + extent_get_width(ext));
55406 +               bh->b_blocknr = extent_get_start(ext) + (block - (get_key_offset(&key) >> current_blocksize_bits));
55407 +       }
55408 +       return 0;
55409 +}
55410 +
55411 +/*
55412 +  plugin->u.item.s.file.append_key
55413 +  Set @key to the key of the first byte following the last byte addressed by this extent item, and return @key.
55414 +*/
55415 +reiser4_internal reiser4_key *
55416 +append_key_extent(const coord_t *coord, reiser4_key *key)
55417 +{
55418 +       item_key_by_coord(coord, key);
55419 +       set_key_offset(key, get_key_offset(key) + extent_size(coord, nr_units_extent(coord)));
55420 +       /* the resulting offset is expected to be non-zero and a multiple of current_blocksize */
55421 +       assert("vs-610", get_key_offset(key) && (get_key_offset(key) & (current_blocksize - 1)) == 0);
55422 +       return key;
55423 +}
55424 +
55425 +/* plugin->u.item.s.file.init_coord_extension: fill uf_coord->extension.extent; @lookuped is the looked-up file offset */
55426 +reiser4_internal void
55427 +init_coord_extension_extent(uf_coord_t *uf_coord, loff_t lookuped)
55428 +{
55429 +       coord_t *coord;
55430 +       extent_coord_extension_t *ext_coord;
55431 +       reiser4_key key;
55432 +       loff_t offset;
55433 +
55434 +       assert("vs-1295", uf_coord->valid == 0);
55435 +
55436 +       coord = &uf_coord->base_coord;
55437 +       assert("vs-1288", coord_is_iplug_set(coord));
55438 +       assert("vs-1327", znode_is_loaded(coord->node));
55439 +
55440 +       if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
55441 +               return; /* coord is neither at nor after a unit: leave @uf_coord not valid */
55442 +
55443 +       ext_coord = &uf_coord->extension.extent;
55444 +       ext_coord->nr_units = nr_units_extent(coord);
55445 +       ext_coord->ext_offset = (char *)extent_by_coord(coord) - zdata(coord->node); /* offset from zdata(coord->node) */
55446 +       ext_coord->width = extent_get_width(extent_by_coord(coord));
55447 +       ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
55448 +       uf_coord->valid = 1;
55449 +
55450 +       /* pos_in_unit is the only uninitialized field in extended coord */
55451 +       if (coord->between == AFTER_UNIT) {
55452 +               assert("vs-1330", coord->unit_pos == nr_units_extent(coord) - 1);
55453 +
55454 +               ext_coord->pos_in_unit = ext_coord->width - 1; /* last block of the last unit */
55455 +       } else {
55456 +               /* AT_UNIT: @lookuped must fall inside the unit; pos_in_unit is the block of the unit containing it */
55457 +               unit_key_by_coord(coord, &key);
55458 +               offset = get_key_offset(&key);
55459 +
55460 +               assert("vs-1328", offset <= lookuped);
55461 +               assert("vs-1329", lookuped < offset + ext_coord->width * current_blocksize);
55462 +               ext_coord->pos_in_unit = ((lookuped - offset) >> current_blocksize_bits);
55463 +       }
55464 +}
55465 +
55466 +/*
55467 +   Local variables:
55468 +   c-indentation-style: "K&R"
55469 +   mode-name: "LC"
55470 +   c-basic-offset: 8
55471 +   tab-width: 8
55472 +   fill-column: 120
55473 +   scroll-step: 1
55474 +   End:
55475 +*/
55476 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/extent_flush_ops.c linux-2.6.8-rc3-a/fs/reiser4/plugin/item/extent_flush_ops.c
55477 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/extent_flush_ops.c   1970-01-01 03:00:00.000000000 +0300
55478 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/extent_flush_ops.c 2004-08-05 21:20:52.918695224 +0400
55479 @@ -0,0 +1,1096 @@
55480 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55481 +
55482 +#include "item.h"
55483 +#include "../../tree.h"
55484 +#include "../../jnode.h"
55485 +#include "../../super.h"
55486 +#include "../../flush.h"
55487 +#include "../../carry.h"
55488 +#include "../object.h"
55489 +
55490 +#include <linux/pagemap.h>
55491 +
55492 +/* Return either the first or the last extent unit (depending on @side) of the
55493 +   item @coord is set to. Set *@pos_in_unit to the position of either the first
55494 +   (0) or the last (width - 1) block of that extent. */
55495 +static reiser4_extent *
55496 +extent_utmost_ext(const coord_t *coord, sideof side, reiser4_block_nr *pos_in_unit)
55497 +{
55498 +       reiser4_extent *ext;
55499 +
55500 +       if (side == LEFT_SIDE) {
55501 +               /* get first extent of item */
55502 +               ext = extent_item(coord);
55503 +               *pos_in_unit = 0;
55504 +       } else {
55505 +               /* get last extent of item and last position within it */
55506 +               assert("vs-363", side == RIGHT_SIDE);
55507 +               ext = extent_item(coord) + coord_last_unit_pos(coord);
55508 +               *pos_in_unit = extent_get_width(ext) - 1;
55509 +       }
55510 +
55511 +       return ext;
55512 +}
55513 +
55514 +/* item_plugin->f.utmost_child */
55515 +/* Coord is set to extent item. Look up (jlookup) the jnode of either the first or the last unformatted node
55516 +   pointed to by the item and store it in *@childp; NULL is stored without a lookup if the utmost extent is a hole. Always returns 0. */
55517 +reiser4_internal int
55518 +utmost_child_extent(const coord_t *coord, sideof side, jnode **childp)
55519 +{
55520 +       reiser4_extent *ext;
55521 +       reiser4_block_nr pos_in_unit; /* required by extent_utmost_ext(); its value is not read in this function */
55522 +
55523 +       ext = extent_utmost_ext(coord, side, &pos_in_unit);
55524 +
55525 +       switch (state_of_extent(ext)) {
55526 +       case HOLE_EXTENT:
55527 +               *childp = NULL;
55528 +               return 0;
55529 +       case ALLOCATED_EXTENT:
55530 +       case UNALLOCATED_EXTENT:
55531 +               break;
55532 +       default:
55533 +               /* this should never happen. NOTE(review): if assert() compiles to nothing, control continues with the lookup below - confirm */
55534 +               assert("vs-1417", 0);
55535 +       }
55536 +
55537 +       {
55538 +               reiser4_key key;
55539 +               reiser4_tree *tree;
55540 +               unsigned long index;
55541 +
55542 +               if (side == LEFT_SIDE) {
55543 +                       /* get key of first byte addressed by the extent */
55544 +                       item_key_by_coord(coord, &key);
55545 +               } else {
55546 +                       /* get key of the byte which is next after the last byte addressed by the extent */
55547 +                       append_key_extent(coord, &key);
55548 +               }
55549 +
55550 +               assert("vs-544", (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul);
55551 +               /* index of first or last (depending on @side) page addressed
55552 +                  by the extent */
55553 +               index = (unsigned long) (get_key_offset(&key) >> PAGE_CACHE_SHIFT);
55554 +               if (side == RIGHT_SIDE)
55555 +                       index --; /* the append key is one past the end: step back to the last page */
55556 +
55557 +               tree = coord->node->zjnode.tree;
55558 +               *childp = jlookup(tree, get_key_objectid(&key), index);
55559 +       }
55560 +
55561 +       return 0;
55562 +}
55563 +
55564 +/* item_plugin->f.utmost_child_real_block */
55565 +/* Store in *@block the child's block if allocated (last block for RIGHT_SIDE, first otherwise), else 0. Always returns 0. */
55566 +reiser4_internal int
55567 +utmost_child_real_block_extent(const coord_t *coord, sideof side, reiser4_block_nr *block)
55568 +{
55569 +       reiser4_extent *ext;
55570 +       /* NOTE(review): takes the unit @coord points at, not the item's utmost unit as utmost_child_extent() does - confirm */
55571 +       ext = extent_by_coord(coord);
55572 +
55573 +       switch (state_of_extent(ext)) {
55574 +       case ALLOCATED_EXTENT:
55575 +               *block = extent_get_start(ext);
55576 +               if (side == RIGHT_SIDE)
55577 +                       *block += extent_get_width(ext) - 1;
55578 +               break;
55579 +       case HOLE_EXTENT:
55580 +       case UNALLOCATED_EXTENT:
55581 +               *block = 0;
55582 +               break;
55583 +       default:
55584 +               /* this should never happen */
55585 +               assert("vs-1418", 0);
55586 +       }
55587 +
55588 +       return 0;
55589 +}
55590 +
55591 +/* item_plugin->f.scan */
55592 +/* Performs scanning (leftward or rightward, per scan->direction) starting from an unformatted node and
55593 +   its parent coordinate.  This scan continues, advancing the parent coordinate, until either it
55594 +   encounters a formatted child or it finishes scanning this node.
55595 +
55596 +   If unallocated, the entire extent must be dirty and in the same atom.  (Actually, I'm
55597 +   not sure this last property (same atom) is enforced, but it should be the case since
55598 +   one atom must write the parent and the others must read the parent, thus fusing?).  The
55599 +   assertion that checked this for unallocated extents is commented out below as racy.  Unallocated
55600 +   extents are thus optimized because we can skip to the endpoint when scanning.
55601 +
55602 +   It returns control to the caller of scan_extent, which is expected to handle these terminating
55603 +   conditions, e.g., by loading the next twig.  Returns 0 or the error returned by scan_set_current().
55604 +*/
55605 +reiser4_internal int scan_extent(flush_scan * scan)
55606 +{
55607 +       coord_t coord;
55608 +       jnode *neighbor;
55609 +       unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
55610 +       reiser4_block_nr unit_start;
55611 +       __u64 oid;
55612 +       reiser4_key key;
55613 +       int ret = 0, allocated, incr;
55614 +       reiser4_tree *tree;
55615 +
55616 +       if (!jnode_check_dirty(scan->node)) {
55617 +               scan->stop = 1;
55618 +               return 0; /* Race with truncate, this node is already
55619 +                          * truncated. */
55620 +       }
55621 +
55622 +       coord_dup(&coord, &scan->parent_coord);
55623 +
55624 +       assert("jmacd-1404", !scan_finished(scan));
55625 +       assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
55626 +       assert("jmacd-1406", jnode_is_unformatted(scan->node));
55627 +
55628 +       /* The scan_index variable corresponds to the current page index of the
55629 +          unformatted block scan position. */
55630 +       scan_index = index_jnode(scan->node);
55631 +
55632 +       assert("jmacd-7889", item_is_extent(&coord));
55633 +
55634 +       ON_TRACE(TRACE_FLUSH_VERB, "%s scan starts %lu: %s\n",
55635 +                (scanning_left(scan) ? "left" : "right"), scan_index, jnode_tostring(scan->node));
55636 +
55637 +repeat:
55638 +       /* objectid of file */
55639 +       oid = get_key_objectid(item_key_by_coord(&coord, &key));
55640 +
55641 +       ON_TRACE(TRACE_FLUSH_VERB, "%s scan index %lu: parent %p oid %llu\n",
55642 +                (scanning_left(scan) ? "left" : "right"), scan_index, coord.node, oid);
55643 +
55644 +       allocated = !extent_is_unallocated(&coord);
55645 +       /* Get the values of this extent unit: */
55646 +       unit_index = extent_unit_index(&coord);
55647 +       unit_width = extent_unit_width(&coord);
55648 +       unit_start = extent_unit_start(&coord);
55649 +
55650 +       assert("jmacd-7187", unit_width > 0);
55651 +       assert("jmacd-7188", scan_index >= unit_index);
55652 +       assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
55653 +
55654 +       /* Depending on the scan direction, we set different maximum values for scan_index
55655 +          (scan_max) and the number of nodes that would be passed if the scan goes the
55656 +          entire way (scan_dist).  Incr is an integer reflecting the incremental
55657 +          direction of scan_index. */
55658 +       if (scanning_left(scan)) {
55659 +               scan_max = unit_index;
55660 +               scan_dist = scan_index - unit_index;
55661 +               incr = -1;
55662 +       } else {
55663 +               scan_max = unit_index + unit_width - 1;
55664 +               scan_dist = scan_max - unit_index; /* NOTE(review): measured from unit start, not from scan_index as in the leftward case - confirm */
55665 +               incr = +1;
55666 +       }
55667 +
55668 +       tree = coord.node->zjnode.tree;
55669 +
55670 +       /* If the extent is allocated we have to check each of its blocks.  If the extent
55671 +          is unallocated we can skip to the scan_max. */
55672 +       if (allocated) {
55673 +               do {
55674 +                       neighbor = jlookup(tree, oid, scan_index);
55675 +                       if (neighbor == NULL)
55676 +                               goto stop_same_parent;
55677 +
55678 +                       ON_TRACE(TRACE_FLUSH_VERB, "alloc scan index %lu: %s\n",
55679 +                                scan_index, jnode_tostring(neighbor));
55680 +
55681 +                       if (scan->node != neighbor && !scan_goto(scan, neighbor)) {
55682 +                               /* @neighbor was jput() by scan_goto(). */
55683 +                               goto stop_same_parent;
55684 +                       }
55685 +
55686 +                       ret = scan_set_current(scan, neighbor, 1, &coord);
55687 +                       if (ret != 0) {
55688 +                               goto exit;
55689 +                       }
55690 +
55691 +                       /* reference to @neighbor is stored in @scan, no need
55692 +                          to jput(). */
55693 +                       scan_index += incr;
55694 +
55695 +               } while (incr + scan_max != scan_index); /* stop once scan_index has stepped one past scan_max */
55696 +
55697 +       } else {
55698 +               /* Optimized case for unallocated extents, skip to the end. */
55699 +               neighbor = jlookup(tree, oid, scan_max/*index*/);
55700 +               if (neighbor == NULL) {
55701 +                       /* Race with truncate */
55702 +                       scan->stop = 1;
55703 +                       ret = 0;
55704 +                       goto exit;
55705 +               }
55706 +
55707 +               assert ("zam-1043", blocknr_is_fake(jnode_get_block(neighbor)));
55708 +
55709 +               ON_TRACE(TRACE_FLUSH_VERB, "unalloc scan index %lu: %s\n", scan_index, jnode_tostring(neighbor));
55710 +
55711 +               /* XXX commented assertion out, because it is inherently
55712 +                * racy */
55713 +               /* assert("jmacd-3551", !jnode_check_flushprepped(neighbor)
55714 +                  && same_slum_check(neighbor, scan->node, 0, 0)); */
55715 +
55716 +               ret = scan_set_current(scan, neighbor, scan_dist, &coord);
55717 +               if (ret != 0) {
55718 +                       goto exit;
55719 +               }
55720 +       }
55721 +
55722 +       if (coord_sideof_unit(&coord, scan->direction) == 0 && item_is_extent(&coord)) {
55723 +               /* Continue as long as there are more extent units. */
55724 +
55725 +               scan_index =
55726 +                   extent_unit_index(&coord) + (scanning_left(scan) ? extent_unit_width(&coord) - 1 : 0);
55727 +               goto repeat;
55728 +       }
55729 +
55730 +       if (0) { /* this branch is entered only through the stop_same_parent label */
55731 +stop_same_parent:
55732 +
55733 +               /* If we are scanning left and we stop in the middle of an allocated
55734 +                  extent, we know the preceder immediately.. */
55735 +               /* middle of extent is (scan_index - unit_index) != 0. */
55736 +               if (scanning_left(scan) && (scan_index - unit_index) != 0) {
55737 +                       /* FIXME(B): Someone should step-through and verify that this preceder
55738 +                          calculation is indeed correct. */
55739 +                       /* @unit_start is starting block (number) of extent
55740 +                          unit. Flush stopped at the @scan_index block from
55741 +                          the beginning of the file, which is (scan_index -
55742 +                          unit_index) block within extent.
55743 +                       */
55744 +                       if (unit_start) {
55745 +                               /* skip preceder update when we are at hole */
55746 +                               scan->preceder_blk = unit_start + scan_index - unit_index;
55747 +                               check_preceder(scan->preceder_blk);
55748 +                       }
55749 +               }
55750 +
55751 +               /* In this case, we leave coord set to the parent of scan->node. */
55752 +               scan->stop = 1;
55753 +
55754 +       } else {
55755 +               /* In this case, we are still scanning, coord is set to the next item which is
55756 +                  either off-the-end of the node or not an extent. */
55757 +               assert("jmacd-8912", scan->stop == 0);
55758 +               assert("jmacd-7812", (coord_is_after_sideof_unit(&coord, scan->direction)
55759 +                                     || !item_is_extent(&coord)));
55760 +       }
55761 +
55762 +       ret = 0;
55763 +exit:
55764 +       return ret;
55765 +}
55766 +
55767 +/* ask block allocator for @wanted_count blocks; results come back through *@first_allocated and *@allocated */
55768 +static void
55769 +extent_allocate_blocks(reiser4_blocknr_hint *preceder,
55770 +                      reiser4_block_nr wanted_count, reiser4_block_nr *first_allocated, reiser4_block_nr *allocated, block_stage_t block_stage)
55771 +{
55772 +       *allocated = wanted_count; /* passed to reiser4_alloc_blocks() by pointer below */
55773 +       preceder->max_dist = 0; /* scan whole disk, if needed */
55774 +
55775 +       /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */
55776 +       preceder->block_stage = block_stage;
55777 +
55778 +       /* FIXME: we do not handle errors here now */
55779 +       check_me("vs-420", reiser4_alloc_blocks (preceder, first_allocated, allocated, BA_PERMANENT) == 0);
55780 +       /* update flush_pos's preceder to last allocated block number */
55781 +       preceder->blk = *first_allocated + *allocated - 1;
55782 +}
55783 +
55784 +/* at flush time, when an unallocated extent is to be replaced with an allocated one, it may happen that one unallocated
55785 +   extent has to be replaced with a set of allocated extents. In this case insert_into_item will be called which may have
55786 +   to add new nodes into tree. Space for that is taken from inviolable reserve (5%). Returns the pre-grab grabbed count. */
55787 +static reiser4_block_nr
55788 +reserve_replace(void)
55789 +{
55790 +       reiser4_block_nr grabbed, needed;
55791 +
55792 +       grabbed = get_current_context()->grabbed_blocks; /* remembered so free_replace_reserved() can undo the grab */
55793 +       needed = estimate_one_insert_into_item(current_tree);
55794 +       check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED));
55795 +       return grabbed;
55796 +}
55797 +
55798 +static void
55799 +free_replace_reserved(reiser4_block_nr grabbed)
55800 +{
55801 +       reiser4_context *ctx;
55802 +       /* pass the blocks grabbed beyond @grabbed (the count returned by reserve_replace()) to grabbed2free() */
55803 +       ctx = get_current_context();
55804 +       grabbed2free(ctx, get_super_private(ctx->super),
55805 +                    ctx->grabbed_blocks - grabbed);
55806 +}
55807 +
55808 +/* Block offset (in the file) of the first block addressed by the unit @item is set to */
55809 +reiser4_internal __u64
55810 +extent_unit_index(const coord_t *item)
55811 +{
55812 +       reiser4_key key;
55813 +
55814 +       assert("vs-648", coord_is_existing_unit(item));
55815 +       unit_key_by_coord(item, &key);
55816 +       return get_key_offset(&key) >> current_blocksize_bits; /* key offset converted to blocks */
55817 +}
55818 +
55819 +/* Width of the unit @item is set to, as returned by width_by_coord(). AUDIT shouldn't return value be of reiser4_block_nr type?
55820 +   Josh's answer: who knows?  Is a "number of blocks" the same type as "block offset"? */
55821 +reiser4_internal __u64
55822 +extent_unit_width(const coord_t *item)
55823 +{
55824 +       assert("vs-649", coord_is_existing_unit(item));
55825 +       return width_by_coord(item);
55826 +}
55827 +
55828 +/* Starting block location of this unit: extent_get_start() of the unit @item is set to */
55829 +reiser4_internal reiser4_block_nr
55830 +extent_unit_start(const coord_t *item)
55831 +{
55832 +       return extent_get_start(extent_by_coord(item));
55833 +}
55834 +
55835 +/* replace allocated extent with two: its first @pos_in_unit blocks and the rest. Returns the result of replace_extent() */
55836 +static int
55837 +split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit)
55838 +{
55839 +       int result;
55840 +       reiser4_extent *ext;
55841 +       reiser4_extent replace_ext; /* first part: same start, width @pos_in_unit */
55842 +       reiser4_extent append_ext; /* second part: starts @pos_in_unit blocks later, takes the remaining width */
55843 +       reiser4_key key;
55844 +       reiser4_item_data item;
55845 +       reiser4_block_nr grabbed;
55846 +
55847 +       ext = extent_by_coord(coord);
55848 +       assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT);
55849 +       assert("vs-1411", extent_get_width(ext) > pos_in_unit);
55850 +
55851 +       set_extent(&replace_ext, extent_get_start(ext), pos_in_unit);
55852 +       set_extent(&append_ext, extent_get_start(ext) + pos_in_unit, extent_get_width(ext) - pos_in_unit);
55853 +
55854 +       /* insert_into_item will insert new unit after the one @coord is set to. So, update key correspondingly */
55855 +       unit_key_by_coord(coord, &key);
55856 +       set_key_offset(&key, (get_key_offset(&key) + pos_in_unit * current_blocksize));
55857 +
55858 +       ON_TRACE(TRACE_EXTENT_ALLOC,
55859 +                "split [%llu %llu] -> [%llu %llu][%llu %llu]\n",
55860 +                extent_get_start(ext), extent_get_width(ext),
55861 +                extent_get_start(&replace_ext), extent_get_width(&replace_ext),
55862 +                extent_get_start(&append_ext), extent_get_width(&append_ext));
55863 +       /* the space reserved for the replacement is released again right after it, whatever the result */
55864 +       grabbed = reserve_replace();
55865 +       result = replace_extent(coord, znode_lh(coord->node), &key, init_new_extent(&item, &append_ext, 1),
55866 +                               &replace_ext, COPI_DONT_SHIFT_LEFT, 0/* return replaced position */);
55867 +       free_replace_reserved(grabbed);
55868 +       return result;
55869 +}
55870 +
55871 +/* clear JNODE_EPROTECTED, the bit preventing node from being written bypassing extent allocation procedure */
55872 +static inline void
55873 +junprotect (jnode * node)
55874 +{
55875 +       assert("zam-837", !JF_ISSET(node, JNODE_EFLUSH));
55876 +       assert("zam-838", JF_ISSET(node, JNODE_EPROTECTED));
55877 +
55878 +       JF_CLR(node, JNODE_EPROTECTED);
55879 +}
55880 +
55881 +/* unprotect @count nodes, walking back from the end of @protected_nodes, and move them to the atom's dirty list. Used for nodes
55882 +   protected before allocating which will not be allocated: the allocator gave fewer blocks than protected, or allocation failed */
55883 +static void
55884 +unprotect_extent_nodes(flush_pos_t *flush_pos, __u64 count, capture_list_head *protected_nodes)
55885 +{
55886 +       jnode *node, *tmp;
55887 +       capture_list_head unprotected_nodes;
55888 +       txn_atom *atom;
55889 +
55890 +       capture_list_init(&unprotected_nodes);
55891 +
55892 +       atom = atom_locked_by_fq(pos_fq(flush_pos)); /* presumably returned locked; it is unlocked at the end of this function */
55893 +       assert("vs-1468", atom);
55894 +
55895 +       assert("vs-1469", !capture_list_empty(protected_nodes));
55896 +       assert("vs-1474", count > 0);
55897 +       node = capture_list_back(protected_nodes);
55898 +       do {
55899 +               count --;
55900 +               junprotect(node);
55901 +               ON_DEBUG(
55902 +                       LOCK_JNODE(node);
55903 +                       count_jnode(atom, node, PROTECT_LIST, DIRTY_LIST, 0);
55904 +                       UNLOCK_JNODE(node);
55905 +                       );
55906 +               if (count == 0) {
55907 +                       break; /* @node is now the first (closest to the list head) of the unprotected nodes */
55908 +               }
55909 +               tmp = capture_list_prev(node);
55910 +               node = tmp;
55911 +               assert("vs-1470", !capture_list_end(protected_nodes, node));
55912 +       } while (1);
55913 +
55914 +       /* move back to dirty list */
55915 +       capture_list_split(protected_nodes, &unprotected_nodes, node);
55916 +       capture_list_splice(ATOM_DIRTY_LIST(atom, LEAF_LEVEL), &unprotected_nodes);
55917 +
55918 +       UNLOCK_ATOM(atom);
55919 +}
55920 +
55921 +extern int getjevent(void);
55922 +
55923 +/* set JNODE_EPROTECTED on @node, remove it from atom's list and put to the end of list @jnodes; atom and jnode locks are asserted */
55924 +static void
55925 +protect_reloc_node(capture_list_head *jnodes, jnode *node)
55926 +{
55927 +       assert("zam-836", !JF_ISSET(node, JNODE_EPROTECTED));
55928 +       assert("vs-1216", jnode_is_unformatted(node));
55929 +       assert("vs-1477", spin_atom_is_locked(node->atom));
55930 +       assert("nikita-3390", spin_jnode_is_locked(node));
55931 +
55932 +       JF_SET(node, JNODE_EPROTECTED);
55933 +       capture_list_remove_clean(node);
55934 +       capture_list_push_back(jnodes, node);
55935 +       ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, PROTECT_LIST, 0));
55936 +}
55937 +
55938 +#define JNODES_TO_UNFLUSH (16) /* size of buf[] in protect_extent_nodes(): limit of eflushed nodes taken per call */
55939 +
55940 +/* @count nodes of file (objectid @oid) starting from @index are going to be allocated. Protect those nodes from
55941 +   e-flushing. Nodes which are eflushed already will be un-eflushed. There will be not more than JNODES_TO_UNFLUSH
55942 +   un-eflushed nodes. If a node is not found or flushprepped - stop protecting. *@protected gets the number protected */
55943 +/* FIXME: it is likely that not flushprepped jnodes are on dirty capture list in sequential order.. */
55944 +static int
55945 +protect_extent_nodes(flush_pos_t *flush_pos, oid_t oid, unsigned long index, reiser4_block_nr count,
55946 +                    reiser4_block_nr *protected, reiser4_extent *ext,
55947 +                    capture_list_head *protected_nodes)
55948 +{
55949 +       __u64           i;
55950 +       __u64           j;
55951 +       int             result;
55952 +       reiser4_tree   *tree;
55953 +       int             eflushed;
55954 +       jnode          *buf[JNODES_TO_UNFLUSH];
55955 +               txn_atom       *atom;
55956 +
55957 +       assert("nikita-3394", capture_list_empty(protected_nodes));
55958 +
55959 +       tree = current_tree;
55960 +
55961 +       atom = atom_locked_by_fq(pos_fq(flush_pos));
55962 +       assert("vs-1468", atom);
55963 +       /* NOTE(review): assertion ids "vs-1468" and "vs-1470" are also used in unprotect_extent_nodes() - confirm ids may repeat */
55964 +       assert("vs-1470", extent_get_width(ext) == count);
55965 +       eflushed = 0;
55966 +       *protected = 0;
55967 +       for (i = 0; i < count; ++i, ++index) {
55968 +               jnode  *node;
55969 +
55970 +               node = jlookup(tree, oid, index);
55971 +               if (!node)
55972 +                       break;
55973 +
55974 +               if (jnode_check_flushprepped(node)) {
55975 +                       atomic_dec(&node->x_count);
55976 +                       break;
55977 +               }
55978 +
55979 +               LOCK_JNODE(node);
55980 +               assert("vs-1476", atomic_read(&node->x_count) > 1);
55981 +               assert("nikita-3393", !JF_ISSET(node, JNODE_EPROTECTED));
55982 +
55983 +               if (JF_ISSET(node, JNODE_EFLUSH)) {
55984 +                       if (eflushed == JNODES_TO_UNFLUSH) {
55985 +                               UNLOCK_JNODE(node);
55986 +                               atomic_dec(&node->x_count);
55987 +                               break;
55988 +                       }
55989 +                       buf[eflushed] = node; /* x_count is not decremented here; jput() is called after the unflush below */
55990 +                       eflushed ++;
55991 +                       protect_reloc_node(protected_nodes, node);
55992 +                       UNLOCK_JNODE(node);
55993 +               } else {
55994 +                       assert("nikita-3384", node->atom == atom);
55995 +                       protect_reloc_node(protected_nodes, node);
55996 +                       assert("nikita-3383", !JF_ISSET(node, JNODE_EFLUSH));
55997 +                       UNLOCK_JNODE(node);
55998 +                       atomic_dec(&node->x_count);
55999 +               }
56000 +
56001 +               (*protected) ++;
56002 +       }
56003 +       UNLOCK_ATOM(atom);
56004 +
56005 +       /* start io for eflushed nodes */
56006 +       for (j = 0; j < eflushed; ++ j)
56007 +               jstartio(buf[j]);
56008 +
56009 +       result = 0;
56010 +       for (j = 0; j < eflushed; ++ j) {
56011 +               if (result == 0) { /* after the first failure the remaining nodes are only jput() */
56012 +                       result = emergency_unflush(buf[j]);
56013 +                       if (result != 0) {
56014 +                               warning("nikita-3179",
56015 +                                       "unflush failed: %i", result);
56016 +                               print_jnode("node", buf[j]);
56017 +                       }
56018 +               }
56019 +               jput(buf[j]);
56020 +       }
56021 +       if (result != 0) {
56022 +               /* unprotect all the jnodes we have protected so far (i equals *protected here) */
56023 +               unprotect_extent_nodes(flush_pos, i, protected_nodes);
56024 +       }
56025 +       return result;
56026 +}
56027 +
56028 +/* extent @ext is to be replaced by extent @replace. Try to merge @replace with previous extent of the item (if there is
56029 +   one). Return 1 if it succeeded (@coord is then moved back by one unit and the node is made dirty), 0 - otherwise */
56030 +static int
56031 +try_to_merge_with_left(coord_t *coord, reiser4_extent *ext, reiser4_extent *replace)
56032 +{
56033 +       assert("vs-1415", extent_by_coord(coord) == ext);
56034 +
56035 +       if (coord->unit_pos == 0 || state_of_extent(ext - 1) != ALLOCATED_EXTENT)
56036 +               /* previous extent either does not exist or is not an allocated extent */
56037 +               return 0;
56038 +       if (extent_get_start(ext - 1) + extent_get_width(ext - 1) != extent_get_start(replace))
56039 +               return 0; /* @replace does not start right after the blocks of the previous extent */
56040 +
56041 +       /* we can glue, widen previous unit */
56042 +       ON_TRACE(TRACE_EXTENT_ALLOC,
56043 +                "wide previous [%llu %llu] ->",
56044 +                extent_get_start(ext - 1), extent_get_width(ext - 1));
56045 +
56046 +       extent_set_width(ext - 1, extent_get_width(ext - 1) + extent_get_width(replace));
56047 +
56048 +       ON_TRACE(TRACE_EXTENT_ALLOC, " [%llu %llu] -> ", extent_get_start(ext - 1), extent_get_width(ext - 1));
56049 +
56050 +       if (extent_get_width(ext) != extent_get_width(replace)) {
56051 +               /* make current extent narrower */
56052 +               ON_TRACE(TRACE_EXTENT_ALLOC, "narrow [%llu %llu] -> ", extent_get_start(ext), extent_get_width(ext));
56053 +
56054 +               if (state_of_extent(ext) == ALLOCATED_EXTENT)
56055 +                       extent_set_start(ext, extent_get_start(ext) + extent_get_width(replace));
56056 +               extent_set_width(ext, extent_get_width(ext) - extent_get_width(replace));
56057 +
56058 +               ON_TRACE(TRACE_EXTENT_ALLOC, "[%llu %llu]\n", extent_get_start(ext), extent_get_width(ext));
56059 +       } else {
56060 +               /* current extent completely glued with its left neighbor, remove it */
56061 +               coord_t from, to;
56062 +
56063 +               ON_TRACE(TRACE_EXTENT_ALLOC, "delete [%llu %llu]\n", extent_get_start(ext), extent_get_width(ext));
56064 +
56065 +               coord_dup(&from, coord);
56066 +               from.unit_pos = nr_units_extent(coord) - 1;
56067 +               coord_dup(&to, &from);
56068 +
56069 +               /* currently cut from extent can cut either from the beginning or from the end. Move place which got
56070 +                  freed after unit removal to end of item */
56071 +               xmemmove(ext, ext + 1, (from.unit_pos - coord->unit_pos) * sizeof(reiser4_extent));
56072 +               /* wipe part of item which is going to be cut, so that node_check will not be confused */
56073 +               ON_DEBUG(xmemset(extent_item(coord) + from.unit_pos, 0, sizeof (reiser4_extent)));
56074 +               cut_node_content(&from, &to, NULL, NULL, NULL);
56075 +       }
56076 +       znode_make_dirty(coord->node);
56077 +       /* move coord back */
56078 +       coord->unit_pos --;
56079 +       return 1;
56080 +}
56081 +
56082 +/* replace extent unit (unallocated or allocated) @coord is set to with extent @replace (allocated). If @replace is
56083 +   narrower than that unit - rest of the unit is kept as padding extent. Returns 0 or result of replace_extent */
56084 +static int
56085 +conv_extent(coord_t *coord, reiser4_extent *replace)
56086 +{
56087 +       int result;
56088 +       reiser4_extent *ext;
56089 +       reiser4_extent padd_ext;
56090 +       reiser4_block_nr start, width, new_width;
56091 +       reiser4_block_nr grabbed;
56092 +       reiser4_item_data item;
56093 +       reiser4_key key;
56094 +       extent_state state;
56095 +
56096 +       ext = extent_by_coord(coord);
56097 +       state = state_of_extent(ext);
56098 +       start = extent_get_start(ext);
56099 +       width = extent_get_width(ext);
56100 +       new_width = extent_get_width(replace);
56101 +
56102 +       assert("vs-1458", state == UNALLOCATED_EXTENT || state == ALLOCATED_EXTENT);
56103 +       assert("vs-1459", width >= new_width);
56104 +
56105 +       if (try_to_merge_with_left(coord, ext, replace)) {
56106 +               /* merged @replace with left neighbor. Current unit is either removed or narrowed */
56107 +               assert("nikita-3563", znode_at_read(coord->node)); /* NOTE(review): this label is used three times here */
56108 +               return 0;
56109 +       }
56110 +
56111 +       if (width == new_width) {
56112 +               /* widths are equal: overwrite current extent with @replace in place */
56113 +               ON_TRACE(TRACE_EXTENT_ALLOC, "replace: [%llu %llu]->[%llu %llu]\n",
56114 +                      start, width,
56115 +                      extent_get_start(replace), extent_get_width(replace));
56116 +
56117 +               *ext = *replace;
56118 +               znode_make_dirty(coord->node);
56119 +               assert("nikita-3563", znode_at_read(coord->node));
56120 +               return 0;
56121 +       }
56122 +
56123 +       /* replace @ext with @replace and padding extent. Padding extent is width - new_width blocks wide. It starts
              new_width blocks after start of @ext if @ext is allocated and gets UNALLOCATED_EXTENT_START otherwise */
56124 +       set_extent(&padd_ext, state == ALLOCATED_EXTENT ? (start + new_width) : UNALLOCATED_EXTENT_START,
56125 +                  width - new_width);
56126 +
56127 +       /* insert_into_item will insert new units after the one @coord is set to. So, update key correspondingly */
56128 +       unit_key_by_coord(coord, &key);
56129 +       set_key_offset(&key, (get_key_offset(&key) + new_width * current_blocksize));
56130 +
56131 +       ON_TRACE(TRACE_EXTENT_ALLOC,
56132 +                "replace: [%llu %llu]->[%llu %llu][%llu %llu]\n",
56133 +                start, width,
56134 +                extent_get_start(replace), extent_get_width(replace),
56135 +                extent_get_start(&padd_ext), extent_get_width(&padd_ext));
56136 +
56137 +       grabbed = reserve_replace(); /* handed back to free_replace_reserved below whatever replace_extent returns */
56138 +       result = replace_extent(coord, znode_lh(coord->node), &key, init_new_extent(&item, &padd_ext, 1),
56139 +                               replace, COPI_DONT_SHIFT_LEFT, 0/* return replaced position */);
56140 +
56141 +       assert("nikita-3563", znode_at_read(coord->node));
56142 +       free_replace_reserved(grabbed);
56143 +       return result;
56144 +}
56145 +
56146 +/* assign consecutive block numbers starting from @first to jnodes of @protected_nodes list, make every node relocate
56147 +   and unprotect it. Splice whole list into ATOM_FQ_LIST of @flush_pos's flush queue. Atom is unlocked on return */
56148 +static void
56149 +assign_real_blocknrs(flush_pos_t *flush_pos, reiser4_block_nr first, reiser4_block_nr count,
56150 +                    extent_state state, capture_list_head *protected_nodes)
56151 +{
56152 +       jnode *node;
56153 +       txn_atom *atom;
56154 +       flush_queue_t *fq;
56155 +       int i;
56156 +
56157 +       fq = pos_fq(flush_pos);
56158 +       atom = atom_locked_by_fq(fq);
56159 +       assert("vs-1468", atom);
56160 +
56161 +       i = 0; /* counts nodes of the list, it is checked against @count below */
56162 +       for_all_type_safe_list(capture, protected_nodes, node) {
56163 +               LOCK_JNODE(node);
56164 +               assert("vs-1132", ergo(state == UNALLOCATED_EXTENT, blocknr_is_fake(jnode_get_block(node))));
56165 +               assert("vs-1475", node->atom == atom);
56166 +               assert("vs-1476", atomic_read(&node->x_count) > 0);
56167 +               JF_CLR(node, JNODE_FLUSH_RESERVED);
56168 +               jnode_set_block(node, &first);
56169 +               unformatted_make_reloc(node, fq);
56170 +               /*XXXX*/ON_DEBUG(count_jnode(node->atom, node, PROTECT_LIST, FQ_LIST, 0));
56171 +               junprotect(node);
56172 +               assert("", NODE_LIST(node) == FQ_LIST); /* NOTE(review): assertion label is empty */
56173 +               UNLOCK_JNODE(node);
56174 +               first ++; /* next node of the list gets next block number */
56175 +               i ++;
56176 +       }
56177 +
56178 +       capture_list_splice(ATOM_FQ_LIST(fq), protected_nodes);
56179 +       /*XXX*/
56180 +       assert("vs-1687", count == i);
56181 +       if (state == UNALLOCATED_EXTENT)
56182 +               dec_unalloc_unfm_ptrs(count);
56183 +       UNLOCK_ATOM(atom);
56184 +}
56185 +
56186 +static void
56187 +make_node_ovrwr(capture_list_head *jnodes, jnode *node)
56188 +{      /* set JNODE_OVRWR on @node and move it from the capture list it is on to the tail of @jnodes, under jnode lock */
56189 +       LOCK_JNODE(node);
56190 +
56191 +       assert ("zam-917", !JF_ISSET(node, JNODE_RELOC)); /* @node must be neither relocate nor overwrite yet */
56192 +       assert ("zam-918", !JF_ISSET(node, JNODE_OVRWR));
56193 +
56194 +       JF_SET(node, JNODE_OVRWR);
56195 +       capture_list_remove_clean(node);
56196 +       capture_list_push_back(jnodes, node);
56197 +       ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
56198 +
56199 +       UNLOCK_JNODE(node);
56200 +}
56201 +
56202 +/* put nodes of one extent (file objectid @oid, extent width @width) to overwrite set, starting from the one with index
56203 +   @index. Loop counter starts at flush_pos->pos_in_unit. If end of slum is detected (node is not found or is
56204 +   flushprepped) - stop iterating and set flush position's state to POS_INVALID. Atom is unlocked on return */
56205 +static void
56206 +mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid, unsigned long index, reiser4_block_nr width)
56207 +{
56208 +       unsigned long i;
56209 +       reiser4_tree *tree;
56210 +       jnode *node;
56211 +       txn_atom *atom;
56212 +       capture_list_head jnodes;
56213 +
56214 +       capture_list_init(&jnodes); /* nodes are collected on this local list first */
56215 +
56216 +       tree = current_tree;
56217 +
56218 +       atom = atom_locked_by_fq(pos_fq(flush_pos));
56219 +       assert("vs-1478", atom);
56220 +
56221 +       for (i = flush_pos->pos_in_unit; i < width; i ++, index ++) {
56222 +               node = jlookup(tree, oid, index);
56223 +               if (!node) {
56224 +                       flush_pos->state = POS_INVALID;
56225 +
56226 +                       ON_TRACE(TRACE_EXTENT_ALLOC, "node not found: (oid %llu, index %lu)\n", oid, index);
56227 +
56228 +                       break;
56229 +               }
56230 +               if (jnode_check_flushprepped(node)) {
56231 +                       flush_pos->state = POS_INVALID;
56232 +                       atomic_dec(&node->x_count); /* presumably drops the reference obtained by jlookup - TODO confirm */
56233 +
56234 +                       ON_TRACE(TRACE_EXTENT_ALLOC, "flushprepped: (oid %llu, index %lu)\n", oid, index);
56235 +
56236 +                       break;
56237 +               }
56238 +               make_node_ovrwr(&jnodes, node);
56239 +               atomic_dec(&node->x_count);
56240 +       }
56241 +
56242 +       capture_list_splice(ATOM_OVRWR_LIST(atom), &jnodes); /* hand all collected nodes over to atom's overwrite list */
56243 +       UNLOCK_ATOM(atom);
56244 +}
56245 +
56246 +/* this is called by handle_pos_on_twig to process the extent unit flush_pos->coord is set to. It prepares for flushing
56247 +   a sequence of not flushprepped nodes (slum) which is supposed to start at flush_pos->pos_in_unit position within the
56248 +   extent. Slum gets to relocate set if flush_pos->leaf_relocate is set or extent is unallocated, to overwrite set otherwise */
56249 +reiser4_internal int
56250 +alloc_extent(flush_pos_t *flush_pos)
56251 +{
56252 +       coord_t *coord;
56253 +       reiser4_extent *ext;
56254 +       reiser4_extent replace_ext;
56255 +       oid_t oid;
56256 +       reiser4_block_nr protected;
56257 +       reiser4_block_nr start;
56258 +       __u64 index;
56259 +       __u64 width;
56260 +       extent_state state;
56261 +       int result;
56262 +       reiser4_block_nr first_allocated;
56263 +       __u64 allocated;
56264 +       reiser4_key key;
56265 +       block_stage_t block_stage;
56266 +
56267 +       assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
56268 +       assert("vs-1469", coord_is_existing_unit(&flush_pos->coord) && item_is_extent(&flush_pos->coord));
56269 +
56270 +       coord = &flush_pos->coord;
56271 +
56272 +       check_pos(flush_pos);
56273 +
56274 +       ext = extent_by_coord(coord);
56275 +       state = state_of_extent(ext);
56276 +       if (state == HOLE_EXTENT) { /* hole: invalidate flush position and return */
56277 +               flush_pos->state = POS_INVALID;
56278 +               return 0;
56279 +       }
56280 +
56281 +       item_key_by_coord(coord, &key);
56282 +       oid = get_key_objectid(&key);
56283 +       index = extent_unit_index(coord) + flush_pos->pos_in_unit; /* index of the node slum is supposed to start at */
56284 +       start = extent_get_start(ext);
56285 +       width = extent_get_width(ext);
56286 +
56287 +       assert("vs-1457", width > flush_pos->pos_in_unit);
56288 +
56289 +       if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
56290 +               protected_jnodes jnodes;
56291 +
56292 +               /* relocate */
56293 +               if (flush_pos->pos_in_unit) {
56294 +                       /* slum starts inside of the unit: split extent unit into two and return, nothing is allocated on this call */
56295 +                       result = split_allocated_extent(coord, flush_pos->pos_in_unit);
56296 +                       check_pos(flush_pos);
56297 +                       flush_pos->pos_in_unit = 0;
56298 +                       return result;
56299 +               }
56300 +               ON_TRACE(TRACE_EXTENT_ALLOC,
56301 +                        "ALLOC: relocate: (oid %llu, index %llu) [%llu %llu] - ",
56302 +                        oid, index, start, width);
56303 +
56304 +               /* Prevent nodes from being e-flushed before disk space is allocated for them. Nodes which were eflushed
56305 +                  will be read from their temporary locations (but not more than certain limit: JNODES_TO_UNFLUSH) and
56306 +                  that disk space will be freed. */
56307 +
56308 +               protected_jnodes_init(&jnodes);
56309 +
56310 +               result = protect_extent_nodes(flush_pos, oid, index, width, &protected, ext, &jnodes.nodes);
56311 +               check_pos(flush_pos);
56312 +               if (result) {
56313 +                       warning("vs-1469", "Failed to protect extent. Should not happen\n"); /* NOTE(review): label "vs-1469" is also used by an assertion above */
56314 +                       protected_jnodes_done(&jnodes);
56315 +                       return result;
56316 +               }
56317 +               if (protected == 0) {
56318 +                       ON_TRACE(TRACE_EXTENT_ALLOC, "nothing todo\n");
56319 +                       flush_pos->state = POS_INVALID;
56320 +                       flush_pos->pos_in_unit = 0;
56321 +                       protected_jnodes_done(&jnodes);
56322 +                       return 0;
56323 +               }
56324 +
56325 +               if (state == ALLOCATED_EXTENT)
56326 +                       /* all protected nodes are not flushprepped, therefore
56327 +                        * they are counted as flush_reserved */
56328 +                       block_stage = BLOCK_FLUSH_RESERVED;
56329 +               else
56330 +                       block_stage = BLOCK_UNALLOCATED;
56331 +
56332 +               /* allocate new block numbers for protected nodes. @allocated may come out different from @protected */
56333 +               extent_allocate_blocks(pos_hint(flush_pos), protected, &first_allocated, &allocated, block_stage);
56334 +               check_pos(flush_pos);
56335 +
56336 +               ON_TRACE(TRACE_EXTENT_ALLOC, "allocated: (first %llu, cound %llu) - ", first_allocated, allocated); /* NOTE(review): "cound" is presumably a typo for "count" */
56337 +
56338 +               if (allocated != protected)
56339 +                       /* unprotect nodes which will not be
56340 +                        * allocated/relocated on this iteration */
56341 +                       unprotect_extent_nodes(flush_pos, protected - allocated,
56342 +                                              &jnodes.nodes);
56343 +               check_pos(flush_pos);
56344 +               if (state == ALLOCATED_EXTENT) {
56345 +                       /* on relocating - free first @allocated blocks of the old
56346 +                        * extent (with BA_DEFER), they are going to be relocated */
56347 +                       reiser4_dealloc_blocks(&start, &allocated, BLOCK_ALLOCATED, BA_DEFER);
56348 +               }
56349 +
56350 +               check_pos(flush_pos);
56351 +               /* assign new block numbers to protected nodes */
56352 +               assign_real_blocknrs(flush_pos, first_allocated, allocated, state, &jnodes.nodes);
56353 +
56354 +               check_pos(flush_pos);
56355 +               protected_jnodes_done(&jnodes);
56356 +
56357 +               /* send to log information about which blocks were allocated for what */
56358 +               write_current_logf(ALLOC_EXTENT_LOG,
56359 +                                  "alloc: oid: %llu, index: %llu, state %d, width: %llu. "
56360 +                                  "prot: %llu. got [%llu %llu]",
56361 +                                  oid, index, state, width, protected, first_allocated, allocated);
56362 +
56363 +               /* prepare extent which will replace current one */
56364 +               set_extent(&replace_ext, first_allocated, allocated);
56365 +
56366 +               /* adjust extent item */
56367 +               result = conv_extent(coord, &replace_ext);
56368 +               check_pos(flush_pos);
56369 +               if (result != 0 && result != -ENOMEM) { /* NOTE(review): -ENOMEM is not returned, control falls through to "return 0" - confirm this is intended */
56370 +                       warning("vs-1461", "Failed to allocate extent. Should not happen\n");
56371 +                       return result;
56372 +               }
56373 +       } else {
56374 +               /* overwrite: extent item is not changed, nodes are only moved to overwrite set */
56375 +               ON_TRACE(TRACE_EXTENT_ALLOC,
56376 +                        "ALLOC: overwrite: (oid %llu, index %llu) [%llu %llu]\n",
56377 +                        oid, index, start, width);
56378 +               mark_jnodes_overwrite(flush_pos, oid, index, width);
56379 +       }
56380 +       flush_pos->pos_in_unit = 0;
56381 +       check_pos(flush_pos);
56382 +       return 0;
56383 +}
56384 +
56385 +/* return 0 if @coord is set to extent item and @key is the key that item can be appended with (@key is glueable to the item), 1 - otherwise */
56386 +static int
56387 +must_insert(const coord_t *coord, const reiser4_key *key)
56388 +{
56389 +       reiser4_key last;
56390 +
56391 +       if (item_id_by_coord(coord) == EXTENT_POINTER_ID && keyeq(append_key_extent(coord, &last), key))
56392 +               return 0;
56393 +       return 1;
56394 +}
56395 +
56396 +/* copy extent @copy_ext to the end of @node. It may have to either insert new item after the last one, or append last
56397 +   item, or modify last unit of last item to have greater width. Returns 0 or -E_NODE_FULL (see assertion below) */
56398 +static int
56399 +put_unit_to_end(znode *node, const reiser4_key *key, reiser4_extent *copy_ext)
56400 +{
56401 +       int result;
56402 +       coord_t coord;
56403 +       cop_insert_flag flags;
56404 +       reiser4_extent *last_ext;
56405 +       reiser4_item_data data;
56406 +
56407 +       /* set coord after last unit in an item */
56408 +       coord_init_last_unit(&coord, node);
56409 +       coord.between = AFTER_UNIT;
56410 +
56411 +       flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
56412 +       if (must_insert(&coord, key)) {
56413 +               result = insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1), key, 0 /*lh */ , flags);
56414 +
56415 +       } else {
56416 +               /* try to glue with last unit. NOTE(review): state is tested for being non-zero, not for ALLOCATED_EXTENT - confirm */
56417 +               last_ext = extent_by_coord(&coord);
56418 +               if (state_of_extent(last_ext) &&
56419 +                   extent_get_start(last_ext) + extent_get_width(last_ext) == extent_get_start(copy_ext)) {
56420 +                       /* @copy_ext starts where last unit ends: widen last unit of node, nothing is inserted */
56421 +                       extent_set_width(last_ext, extent_get_width(last_ext) + extent_get_width(copy_ext));
56422 +                       znode_make_dirty(node);
56423 +                       return 0;
56424 +               }
56425 +
56426 +               /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */
56427 +               result = insert_into_item(&coord, 0 /*lh */ , key, init_new_extent(&data, copy_ext, 1), flags);
56428 +       }
56429 +
56430 +       assert("vs-438", result == 0 || result == -E_NODE_FULL);
56431 +       return result;
56432 +}
56433 +
56434 +/* @coord is set to leftmost extent unit of an item. Copy that unit (newly allocated or as it is) to the end of @left */
56435 +reiser4_internal squeeze_result
56436 +squalloc_extent(znode *left, const coord_t *coord, flush_pos_t *flush_pos, reiser4_key *stop_key)
56437 +{      /* SQUEEZE_CONTINUE: unit was copied, @stop_key is set. SQUEEZE_TARGET_FULL: @left is full, @stop_key is not touched */
56438 +       reiser4_extent *ext;
56439 +       __u64 index;
56440 +       __u64 width;
56441 +       reiser4_block_nr start;
56442 +       extent_state state;
56443 +       oid_t oid;
56444 +       reiser4_block_nr first_allocated;
56445 +       __u64 allocated;
56446 +       __u64 protected;
56447 +       reiser4_extent copy_extent;
56448 +       reiser4_key key;
56449 +       int result;
56450 +       block_stage_t block_stage;
56451 +
56452 +       assert("vs-1457", flush_pos->pos_in_unit == 0);
56453 +       assert("vs-1467", coord_is_leftmost_unit(coord));
56454 +       assert("vs-1467", item_is_extent(coord)); /* NOTE(review): label "vs-1467" is used twice */
56455 +
56456 +       ext = extent_by_coord(coord);
56457 +       index = extent_unit_index(coord);
56458 +       start = extent_get_start(ext);
56459 +       width = extent_get_width(ext);
56460 +       state = state_of_extent(ext);
56461 +       unit_key_by_coord(coord, &key);
56462 +       oid = get_key_objectid(&key);
56463 +
56464 +       if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
56465 +               protected_jnodes jnodes;
56466 +
56467 +               ON_TRACE(TRACE_EXTENT_ALLOC, "SQUALLOC: relocate: (oid %llu, index %llu) [%llu %llu] - ",
56468 +                        oid, index, start, width);
56469 +
56470 +               /* relocate */
56471 +               protected_jnodes_init(&jnodes);
56472 +               result = protect_extent_nodes(flush_pos, oid, index, width, &protected, ext, &jnodes.nodes);
56473 +               if (result) {
56474 +                       warning("vs-1469", "Failed to protect extent. Should not happen\n");
56475 +                       protected_jnodes_done(&jnodes);
56476 +                       return result; /* NOTE(review): int error code is returned through squeeze_result */
56477 +               }
56478 +               if (protected == 0) {
56479 +                       flush_pos->state = POS_INVALID;
56480 +                       protected_jnodes_done(&jnodes);
56481 +                       return 0; /* NOTE(review): plain 0 is returned here, not a squeeze_result constant */
56482 +               }
56483 +
56484 +               if (state == ALLOCATED_EXTENT)
56485 +                       /* all protected nodes are not flushprepped, therefore
56486 +                        * they are counted as flush_reserved */
56487 +                       block_stage = BLOCK_FLUSH_RESERVED;
56488 +               else
56489 +                       block_stage = BLOCK_UNALLOCATED;
56490 +
56491 +               /* allocate new block numbers for protected nodes. @allocated may come out different from @protected */
56492 +               extent_allocate_blocks(pos_hint(flush_pos), protected, &first_allocated, &allocated, block_stage);
56493 +               ON_TRACE(TRACE_EXTENT_ALLOC, "allocated: (first %llu, cound %llu) - ", first_allocated, allocated); /* NOTE(review): "cound" is presumably a typo for "count" */
56494 +               if (allocated != protected)
56495 +                       unprotect_extent_nodes(flush_pos, protected - allocated,
56496 +                                              &jnodes.nodes);
56497 +
56498 +               /* prepare extent which will be copied to left */
56499 +               set_extent(&copy_extent, first_allocated, allocated);
56500 +
56501 +               result = put_unit_to_end(left, &key, &copy_extent);
56502 +               if (result == -E_NODE_FULL) {
56503 +                       int target_block_stage;
56504 +
56505 +                       /* free blocks which were just allocated and unprotect the nodes they were meant for */
56506 +                       ON_TRACE(TRACE_EXTENT_ALLOC,
56507 +                                "left is full, free (first %llu, count %llu)\n",
56508 +                                first_allocated, allocated);
56509 +                       target_block_stage = (state == ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED : BLOCK_UNALLOCATED;
56510 +                       reiser4_dealloc_blocks(&first_allocated, &allocated, target_block_stage, BA_PERMANENT);
56511 +                       unprotect_extent_nodes(flush_pos, allocated, &jnodes.nodes);
56512 +
56513 +                       /* rewind the preceder. */
56514 +                       flush_pos->preceder.blk = first_allocated;
56515 +                       check_preceder(flush_pos->preceder.blk);
56516 +
56517 +                       protected_jnodes_done(&jnodes);
56518 +                       return SQUEEZE_TARGET_FULL;
56519 +               }
56520 +
56521 +               if (state == ALLOCATED_EXTENT) {
56522 +                       /* free first @allocated blocks of old extent (with BA_DEFER): nodes which occupied them are relocated */
56523 +                       reiser4_dealloc_blocks(&start, &allocated, BLOCK_ALLOCATED, BA_DEFER);
56524 +               }
56525 +
56526 +               /* assign new block numbers to protected nodes */
56527 +               assign_real_blocknrs(flush_pos, first_allocated, allocated, state, &jnodes.nodes);
56528 +               protected_jnodes_done(&jnodes);
56529 +
56530 +               set_key_offset(&key, get_key_offset(&key) + (allocated << current_blocksize_bits)); /* advance key by number of bytes in @allocated blocks */
56531 +               ON_TRACE(TRACE_EXTENT_ALLOC,
56532 +                        "copied to left: [%llu %llu]\n", first_allocated, allocated);
56533 +
56534 +               /* send to log information about which blocks were allocated for what */
56535 +               write_current_logf(ALLOC_EXTENT_LOG,
56536 +                                  "sqalloc: oid: %llu, index: %llu, state %d, width: %llu. "
56537 +                                  "prot: %llu. got [%llu %llu]",
56538 +                                  oid, index, state, width, protected, first_allocated, allocated);
56539 +       } else {
56540 +               /* overwrite */
56541 +               ON_TRACE(TRACE_EXTENT_ALLOC,
56542 +                        "SQUALLOC: overwrite: (oid %llu, index %llu) [%llu %llu] - ", oid, index, start, width);
56543 +
56544 +               /* overwrite: try to copy unit as it is to left neighbor and make all first not flushprepped nodes
56545 +                  overwrite nodes */
56546 +               set_extent(&copy_extent, start, width);
56547 +               result = put_unit_to_end(left, &key, &copy_extent);
56548 +               if (result == -E_NODE_FULL) {
56549 +                       ON_TRACE(TRACE_EXTENT_ALLOC, "left is full\n");
56550 +                       return SQUEEZE_TARGET_FULL;
56551 +               }
56552 +               mark_jnodes_overwrite(flush_pos, oid, index, width);
56553 +               set_key_offset(&key, get_key_offset(&key) + (width << current_blocksize_bits)); /* advance key by number of bytes in whole unit */
56554 +               ON_TRACE(TRACE_EXTENT_ALLOC, "copied to left\n");
56555 +       }
56556 +       *stop_key = key;
56557 +       return SQUEEZE_CONTINUE;
56558 +}
56559 +
56560 +reiser4_internal int
56561 +key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key *key)
56562 +{      /* thin wrapper: passes @inode, @off and @key to key_by_inode_and_offset_common and returns its result */
56563 +       return key_by_inode_and_offset_common(inode, off, key);
56564 +}
56565 +
56566 +/*
56567 +   Local variables:
56568 +   c-indentation-style: "K&R"
56569 +   mode-name: "LC"
56570 +   c-basic-offset: 8
56571 +   tab-width: 8
56572 +   fill-column: 120
56573 +   scroll-step: 1
56574 +   End:
56575 +*/
56576 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/extent_item_ops.c linux-2.6.8-rc3-a/fs/reiser4/plugin/item/extent_item_ops.c
56577 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/extent_item_ops.c    1970-01-01 03:00:00.000000000 +0300
56578 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/extent_item_ops.c  2004-08-05 21:20:52.971684047 +0400
56579 @@ -0,0 +1,875 @@
56580 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
56581 +
56582 +#include "item.h"
56583 +#include "../../inode.h"
56584 +#include "../../tree_walk.h" /* check_sibling_list() */
56585 +#include "../../page_cache.h"
56586 +#include "../../carry.h"
56587 +
56588 +#include <linux/quotaops.h>
56589 +
56590 +/* item_plugin->b.max_key_inside: store in @key key of item @coord is set to with offset replaced by offset of max_key(). Returns @key */
56591 +reiser4_internal reiser4_key *
56592 +max_key_inside_extent(const coord_t *coord, reiser4_key *key)
56593 +{
56594 +       item_key_by_coord(coord, key);
56595 +       set_key_offset(key, get_key_offset(max_key()));
56596 +       return key;
56597 +}
56598 +
56599 +/* item_plugin->b.can_contain_key
56600 +   return 1 if item @coord is set to has item plugin of @data and its key has locality, objectid and ordering of @key. Return 0 otherwise */
56601 +reiser4_internal int
56602 +can_contain_key_extent(const coord_t *coord, const reiser4_key *key, const reiser4_item_data *data)
56603 +{
56604 +       reiser4_key item_key;
56605 +
56606 +       if (item_plugin_by_coord(coord) != data->iplug)
56607 +               return 0;
56608 +
56609 +       item_key_by_coord(coord, &item_key);
56610 +       if (get_key_locality(key) != get_key_locality(&item_key) ||
56611 +           get_key_objectid(key) != get_key_objectid(&item_key) ||
56612 +           get_key_ordering(key) != get_key_ordering(&item_key)) return 0;
56613 +
56614 +       return 1; /* key offset and key type are not compared here */
56615 +}
56616 +
56617 +/* item_plugin->b.mergeable
56618 +   first item (@p1) is of extent type. Return 1 if @p2 is extent item which continues @p1 (see checks below), 0 - otherwise */
56619 +/* Audited by: green(2002.06.13) */
56620 +reiser4_internal int
56621 +mergeable_extent(const coord_t *p1, const coord_t *p2)
56622 +{
56623 +       reiser4_key key1, key2;
56624 +
56625 +       assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
56626 +       /* FIXME-VS: Which is it? Assert or return 0 */
56627 +       if (item_id_by_coord(p2) != EXTENT_POINTER_ID) {
56628 +               return 0;
56629 +       }
56630 +
56631 +       item_key_by_coord(p1, &key1);
56632 +       item_key_by_coord(p2, &key2);
56633 +       if (get_key_locality(&key1) != get_key_locality(&key2) ||
56634 +           get_key_objectid(&key1) != get_key_objectid(&key2) ||
56635 +           get_key_ordering(&key1) != get_key_ordering(&key2) ||
56636 +           get_key_type(&key1) != get_key_type(&key2))
56637 +               return 0;
56638 +       if (get_key_offset(&key1) + extent_size(p1, nr_units_extent(p1)) != get_key_offset(&key2))
56639 +               return 0; /* @p2 does not start at the offset which follows all units of @p1 */
56640 +       return 1;
56641 +}
56642 +
56643 +/* item_plugin->b.show: print start and width of extent unit @coord is set to into seq_file @m */
56644 +reiser4_internal void
56645 +show_extent(struct seq_file *m, coord_t *coord)
56646 +{
56647 +       reiser4_extent *ext;
56648 +       ext = extent_by_coord(coord);
56649 +       seq_printf(m, "%Lu %Lu", extent_get_start(ext), extent_get_width(ext));
56650 +}
56651 +
56652 +
56653 +#if REISER4_DEBUG_OUTPUT
56654 +
56655 +/* map extent state @state to short printable label. Audited by: green(2002.06.13) */
56656 +static const char *
56657 +state2label(extent_state state)
56658 +{
56659 +       const char *label;
56660 +
56661 +       label = 0; /* stays 0 if @state matches none of the cases below, assertion at the end catches that */
56662 +       switch (state) {
56663 +       case HOLE_EXTENT:
56664 +               label = "hole";
56665 +               break;
56666 +
56667 +       case UNALLOCATED_EXTENT:
56668 +               label = "unalloc";
56669 +               break;
56670 +
56671 +       case ALLOCATED_EXTENT:
56672 +               label = "alloc";
56673 +               break;
56674 +       }
56675 +       assert("vs-376", label);
56676 +       return label;
56677 +}
56678 +
56679 +/* item_plugin->b.print: printk @prefix (if any), number of units and "[start (width) state]" for every unit of the item */
56680 +reiser4_internal void
56681 +print_extent(const char *prefix, coord_t *coord)
56682 +{
56683 +       reiser4_extent *ext;
56684 +       unsigned i, nr;
56685 +
56686 +       if (prefix)
56687 +               printk("%s:", prefix);
56688 +
56689 +       nr = nr_units_extent(coord);
56690 +       ext = (reiser4_extent *) item_body_by_coord(coord); /* first unit of the item */
56691 +
56692 +       printk("%u: ", nr);
56693 +       for (i = 0; i < nr; i++, ext++) {
56694 +               printk("[%Lu (%Lu) %s]", extent_get_start(ext), extent_get_width(ext), state2label(state_of_extent(ext)));
56695 +       }
56696 +       printk("\n");
56697 +}
56698 +
56699 +/* item_plugin->b.item_stat: add units and blocks of item @coord is set to into counters of struct extent_stat @vp points to */
56700 +reiser4_internal void
56701 +item_stat_extent(const coord_t *coord, void *vp)
56702 +{
56703 +       reiser4_extent *ext;
56704 +       struct extent_stat *ex_stat;
56705 +       unsigned i, nr_units;
56706 +
56707 +       ex_stat = (struct extent_stat *) vp;
56708 +
56709 +       ext = extent_item(coord);
56710 +       nr_units = nr_units_extent(coord);
56711 +
56712 +       for (i = 0; i < nr_units; i++) { /* every unit counts as one unit and as many blocks as its width is */
56713 +               switch (state_of_extent(ext + i)) {
56714 +               case ALLOCATED_EXTENT:
56715 +                       ex_stat->allocated_units++;
56716 +                       ex_stat->allocated_blocks += extent_get_width(ext + i);
56717 +                       break;
56718 +               case UNALLOCATED_EXTENT:
56719 +                       ex_stat->unallocated_units++;
56720 +                       ex_stat->unallocated_blocks += extent_get_width(ext + i);
56721 +                       break;
56722 +               case HOLE_EXTENT:
56723 +                       ex_stat->hole_units++;
56724 +                       ex_stat->hole_blocks += extent_get_width(ext + i);
56725 +                       break;
56726 +               default:
56727 +                       assert("vs-1419", 0); /* extent state is none of the three known ones */
56728 +               }
56729 +       }
56730 +}
56731 +
56732 +#endif /* REISER4_DEBUG_OUTPUT */
56733 +
56734 +/* item_plugin->b.nr_units: number of units is length of item @coord is set to divided by size of one extent */
56735 +reiser4_internal pos_in_node_t
56736 +nr_units_extent(const coord_t *coord)
56737 +{
56738 +       /* length of extent item has to be multiple of extent size */
56739 +       assert("vs-1424", (item_length_by_coord(coord) % sizeof (reiser4_extent)) == 0);
56740 +       return item_length_by_coord(coord) / sizeof (reiser4_extent);
56741 +}
56742 +
56743 +/* item_plugin->b.lookup: set @coord to the unit which addresses offset of @key, or after last unit. Always returns CBK_COORD_FOUND */
56744 +reiser4_internal lookup_result
56745 +lookup_extent(const reiser4_key *key, lookup_bias bias UNUSED_ARG, coord_t *coord)
56746 +{                              /* znode and item_pos of @coord
56747 +                                  are set to an extent item to
56748 +                                  look through */
56749 +       reiser4_key item_key;
56750 +       reiser4_block_nr lookuped, offset;
56751 +       unsigned i, nr_units;
56752 +       reiser4_extent *ext;
56753 +       unsigned blocksize;
56754 +       unsigned char blocksize_bits;
56755 +
56756 +       item_key_by_coord(coord, &item_key);
56757 +       offset = get_key_offset(&item_key);
56758 +
56759 +       /* key we are looking for must be greater than key of item @coord */
56760 +       assert("vs-414", keygt(key, &item_key));
56761 +
56762 +       assert("umka-99945",
56763 +               !keygt(key, max_key_inside_extent(coord, &item_key))); /* note: this overwrites item_key, offset was saved above */
56764 +
56765 +       ext = extent_item(coord);
56766 +       assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
56767 +
56768 +       blocksize = current_blocksize; /* NOTE(review): blocksize is assigned but not used in this function */
56769 +       blocksize_bits = current_blocksize_bits;
56770 +
56771 +       /* offset we are looking for */
56772 +       lookuped = get_key_offset(key);
56773 +
56774 +       nr_units = nr_units_extent(coord);
56775 +       /* go through all extents until the one which addresses given offset. @offset is offset of the end of current unit */
56776 +       for (i = 0; i < nr_units; i++, ext++) {
56777 +               offset += (extent_get_width(ext) << blocksize_bits);
56778 +               if (offset > lookuped) {
56779 +                       /* desired byte is somewhere in this extent */
56780 +                       coord->unit_pos = i;
56781 +                       coord->between = AT_UNIT;
56782 +                       return CBK_COORD_FOUND;
56783 +               }
56784 +       }
56785 +
56786 +       /* no unit addresses the offset: set coord after last unit. CBK_COORD_FOUND is returned in this case as well */
56787 +       coord->unit_pos = nr_units - 1;
56788 +       coord->between = AFTER_UNIT;
56789 +       return CBK_COORD_FOUND;
56790 +}
56791 +
56792 +/* item_plugin->b.paste
56793 +   The item @coord is set to has already been grown by @data->length bytes of
56794 +   free space. data->data contains the units to be pasted into the item at position
56795 +   @coord->unit_pos. They must fit into that free space. @info is not used.
56796 +   @coord must be set between units. On return @coord is at the first pasted unit; always returns 0.
56797 +*/
56798 +reiser4_internal int
56799 +paste_extent(coord_t *coord, reiser4_item_data *data, carry_plugin_info *info UNUSED_ARG)
56800 +{
56801 +       unsigned old_nr_units;
56802 +       reiser4_extent *ext;
56803 +       int item_length;
56804 +
56805 +       ext = extent_item(coord);
56806 +       item_length = item_length_by_coord(coord);
56807 +       old_nr_units = (item_length - data->length) / sizeof (reiser4_extent);  /* units present before the item was grown */
56808 +
56809 +       /* this is also used to copy extent into newly created item, so
56810 +          old_nr_units could be 0 */
56811 +       assert("vs-260", item_length >= data->length);
56812 +
56813 +       /* make sure that coord is set properly */
56814 +       assert("vs-35", ((!coord_is_existing_unit(coord)) || (!old_nr_units && !coord->unit_pos)));
56815 +
56816 +       /* first unit to be moved */
56817 +       switch (coord->between) {
56818 +       case AFTER_UNIT:
56819 +               coord->unit_pos++;      /* no break: falls through to BEFORE_UNIT */
56820 +       case BEFORE_UNIT:
56821 +               coord->between = AT_UNIT;
56822 +               break;
56823 +       case AT_UNIT:
56824 +               assert("vs-331", !old_nr_units && !coord->unit_pos);
56825 +               break;
56826 +       default:
56827 +               impossible("vs-330", "coord is set improperly");
56828 +       }
56829 +
56830 +       /* prepare space for new units: move the old units at and after unit_pos up by data->length bytes */
56831 +       xmemmove(ext + coord->unit_pos + data->length / sizeof (reiser4_extent),
56832 +                ext + coord->unit_pos, (old_nr_units - coord->unit_pos) * sizeof (reiser4_extent));
56833 +
56834 +       /* copy new data from kernel space */
56835 +       assert("vs-556", data->user == 0);
56836 +       xmemcpy(ext + coord->unit_pos, data->data, (unsigned) data->length);
56837 +
56838 +       /* after paste @coord is set to first of pasted units */
56839 +       assert("vs-332", coord_is_existing_unit(coord));
56840 +       assert("vs-333", !memcmp(data->data, extent_by_coord(coord), (unsigned) data->length));
56841 +       return 0;
56842 +}
56843 +
56844 +/* item_plugin->b.can_shift: how many whole units of item @source can be shifted. @free_space caps the result in bytes, @want in units; *@size receives the byte count and the unit count is returned. @target and @pend are not used */
56845 +reiser4_internal int
56846 +can_shift_extent(unsigned free_space, coord_t *source,
56847 +                znode *target UNUSED_ARG, shift_direction pend UNUSED_ARG, unsigned *size, unsigned want)
56848 +{
56849 +       *size = item_length_by_coord(source);
56850 +       if (*size > free_space)
56851 +               /* never split a unit of extent item */
56852 +               *size = free_space - free_space % sizeof (reiser4_extent);
56853 +
56854 +       /* we can shift *size bytes, calculate how many do we want to shift */
56855 +       if (*size > want * sizeof (reiser4_extent))
56856 +               *size = want * sizeof (reiser4_extent);
56857 +       /* sizeof yields size_t; cast it so both variadic arguments match their %u conversions */
56858 +       if (*size % sizeof (reiser4_extent) != 0)
56859 +               impossible("vs-119", "Wrong extent size: %u %u", *size, (unsigned) sizeof (reiser4_extent));
56860 +       return *size / sizeof (reiser4_extent);
56861 +
56862 +}
56863 +
56864 +/* item_plugin->b.copy_units: copy @free_space bytes (@count units) of item @source into item @target. With SHIFT_LEFT the first units of @source go to the end of @target; otherwise the last units of @source go to the start of @target and the key of @target is updated */
56865 +reiser4_internal void
56866 +copy_units_extent(coord_t *target, coord_t *source,
56867 +                 unsigned from, unsigned count, shift_direction where_is_free_space, unsigned free_space)
56868 +{
56869 +       char *from_ext, *to_ext;
56870 +
56871 +       assert("vs-217", free_space == count * sizeof (reiser4_extent));
56872 +
56873 +       from_ext = item_body_by_coord(source);
56874 +       to_ext = item_body_by_coord(target);
56875 +
56876 +       if (where_is_free_space == SHIFT_LEFT) {
56877 +               assert("vs-215", from == 0);
56878 +
56879 +               /* At this moment, item length was already updated in the item
56880 +                  header by shifting code, hence nr_units_extent() will
56881 +                  return "new" number of units---one we obtain after copying
56882 +                  units.
56883 +               */
56884 +               to_ext += (nr_units_extent(target) - count) * sizeof (reiser4_extent);
56885 +       } else {
56886 +               reiser4_key key;
56887 +               coord_t coord;
56888 +
56889 +               assert("vs-216", from + count == coord_last_unit_pos(source) + 1);
56890 +
56891 +               from_ext += item_length_by_coord(source) - free_space;  /* start of the last @count units of @source */
56892 +
56893 +               /* new units are inserted before first unit in an item,
56894 +                  therefore, we have to update item key */
56895 +               coord = *source;
56896 +               coord.unit_pos = from;
56897 +               unit_key_extent(&coord, &key);  /* key of the first copied unit becomes the key of @target */
56898 +
56899 +               node_plugin_by_node(target->node)->update_item_key(target, &key, 0/*info */);
56900 +       }
56901 +
56902 +       xmemcpy(to_ext, from_ext, free_space);
56903 +}
56904 +
56905 +/* item_plugin->b.create_hook
56906 +   @arg, when not NULL, is used as a coord_t * whose node is asserted to be on the leaf level; with NULL @arg nothing is done. Always returns 0 */
56907 +reiser4_internal int
56908 +create_hook_extent(const coord_t *coord, void *arg)
56909 +{
56910 +       coord_t *child_coord;
56911 +       znode *node;
56912 +       reiser4_key key;
56913 +       reiser4_tree *tree;
56914 +
56915 +       if (!arg)
56916 +               return 0;
56917 +
56918 +       child_coord = arg;
56919 +       tree = znode_get_tree(coord->node);
56920 +
56921 +       assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
56922 +
56923 +       WLOCK_DK(tree);
56924 +       WLOCK_TREE(tree);
56925 +       /* find the node for which right delimiting key has to
56926 +          be updated: child_coord->node itself or its left neighbor */
56927 +       if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
56928 +               assert("vs-411", znode_is_left_connected(child_coord->node));
56929 +               node = child_coord->node->left; /* may be NULL, hence the check below */
56930 +       } else {
56931 +               assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
56932 +               node = child_coord->node;
56933 +               assert("nikita-3314", node != NULL);
56934 +       }
56935 +
56936 +       if (node != NULL) {
56937 +               znode_set_rd_key(node, item_key_by_coord(coord, &key)); /* new right delimiting key is the key of the created extent item */
56938 +
56939 +               assert("nikita-3282", check_sibling_list(node));
56940 +               /* break sibling links */
56941 +               if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
56942 +                       node->right->left = NULL;
56943 +                       node->right = NULL;
56944 +               }
56945 +       }
56946 +       WUNLOCK_TREE(tree);
56947 +       WUNLOCK_DK(tree);
56948 +       return 0;
56949 +}
56950 +
56951 +
56952 +#define ITEM_TAIL_KILLED 0     /* kill_hook_extent() result: tail of item is to be removed */
56953 +#define ITEM_HEAD_KILLED 1     /* kill_hook_extent() result: head of item is to be removed */
56954 +#define ITEM_KILLED 2  /* kill_hook_extent() result: item is to be removed completely */
56955 +
56956 +/* item_plugin->b.kill_hook
56957 +   this is called when @count units starting from @from-th one are going to be removed. Returns ITEM_KILLED,
56958 +   ITEM_TAIL_KILLED or ITEM_HEAD_KILLED depending on which part of the item the kill range covers */
56959 +reiser4_internal int
56960 +kill_hook_extent(const coord_t *coord, pos_in_node_t from, pos_in_node_t count, struct carry_kill_data *kdata)
56961 +{
56962 +       reiser4_extent *ext;
56963 +       reiser4_block_nr start, length;
56964 +       reiser4_key min_item_key, max_item_key;
56965 +       reiser4_key from_key, to_key;
56966 +       const reiser4_key *pfrom_key, *pto_key;
56967 +       struct inode *inode;
56968 +       reiser4_tree *tree;
56969 +       pgoff_t from_off, to_off, offset, skip;
56970 +       int retval;
56971 +
56972 +       assert ("zam-811", znode_is_write_locked(coord->node));
56973 +       assert("nikita-3315", kdata != NULL);
56974 +
56975 +       item_key_by_coord(coord, &min_item_key);
56976 +       max_item_key_by_coord(coord, &max_item_key);
56977 +
56978 +       if (kdata->params.from_key) {
56979 +               pfrom_key = kdata->params.from_key;
56980 +               pto_key = kdata->params.to_key;
56981 +       } else {
56982 +               coord_t dup;
56983 +               /* no explicit key range: derive it from units @from .. @from + @count - 1 */
56984 +               assert("vs-1549", from == coord->unit_pos);
56985 +               unit_key_by_coord(coord, &from_key);
56986 +               pfrom_key = &from_key;
56987 +
56988 +               coord_dup(&dup, coord);
56989 +               dup.unit_pos = from + count - 1;
56990 +               max_unit_key_by_coord(&dup, &to_key);
56991 +               pto_key = &to_key;
56992 +       }
56993 +
56994 +       if (!keylt(pto_key, &max_item_key)) {   /* kill range reaches the last key of the item */
56995 +               if (!keygt(pfrom_key, &min_item_key)) { /* ... and starts at or before its first key */
56996 +                       znode *left, *right;
56997 +
56998 +                       /* item is to be removed completely */
56999 +                       assert("nikita-3316", kdata->left != NULL && kdata->right != NULL);
57000 +
57001 +                       left = kdata->left->node;
57002 +                       right = kdata->right->node;
57003 +
57004 +                       tree = current_tree;
57005 +                       /* we have to do two things:
57006 +                        *
57007 +                        *     1. link left and right formatted neighbors of
57008 +                        *        extent being removed, and
57009 +                        *
57010 +                        *     2. update their delimiting keys.
57011 +                        *
57012 +                        * atomicity of these operations is protected by
57013 +                        * taking dk-lock and tree-lock.
57014 +                        */
57015 +                       WLOCK_DK(tree);
57016 +                       /* if neighbors of item being removed are znodes -
57017 +                        * link them */
57018 +                       UNDER_RW_VOID(tree, tree,
57019 +                                     write, link_left_and_right(left, right));
57020 +
57021 +                       if (left) {
57022 +                               /* update right delimiting key of left
57023 +                                * neighbor of extent item */
57024 +                               coord_t next;
57025 +                               reiser4_key key;
57026 +
57027 +                               coord_dup(&next, coord);
57028 +
57029 +                               if (coord_next_item(&next))     /* non-zero: no next item is available, use rd key of this node */
57030 +                                       key = *znode_get_rd_key(coord->node);
57031 +                               else
57032 +                                       item_key_by_coord(&next, &key);
57033 +                               znode_set_rd_key(left, &key);
57034 +                       }
57035 +                       WUNLOCK_DK(tree);
57036 +
57037 +                       from_off = get_key_offset(&min_item_key) >> PAGE_CACHE_SHIFT;
57038 +                       to_off = (get_key_offset(&max_item_key) + 1) >> PAGE_CACHE_SHIFT;
57039 +                       retval = ITEM_KILLED;
57040 +               } else {
57041 +                       /* tail of item is to be removed */
57042 +                       from_off = (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;       /* rounded up to a page boundary */
57043 +                       to_off = (get_key_offset(&max_item_key) + 1) >> PAGE_CACHE_SHIFT;
57044 +                       retval = ITEM_TAIL_KILLED;
57045 +               }
57046 +       } else {
57047 +               /* head of item is to be removed */
57048 +               assert("vs-1571", keyeq(pfrom_key, &min_item_key));
57049 +               assert("vs-1572", (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
57050 +               assert("vs-1573", ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE - 1)) == 0);
57051 +               /* NOTE(review): kdata->left is dereferenced here without the NULL assertion used in the whole-item branch */
57052 +               if (kdata->left->node) {
57053 +                       /* update right delimiting key of left neighbor of extent item */
57054 +                       reiser4_key key;
57055 +
57056 +                       key = *pto_key;
57057 +                       set_key_offset(&key, get_key_offset(pto_key) + 1);      /* first key that survives the kill */
57058 +
57059 +                       UNDER_RW_VOID(dk, current_tree, write, znode_set_rd_key(kdata->left->node, &key));
57060 +               }
57061 +
57062 +               from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT;
57063 +               to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT;
57064 +               retval = ITEM_HEAD_KILLED;
57065 +       }
57066 +
57067 +       inode = kdata->inode;
57068 +       assert("vs-1545", inode != NULL);
57069 +       if (inode != NULL)
57070 +               /* take care of pages and jnodes corresponding to part of item being killed */
57071 +               reiser4_invalidate_pages(inode->i_mapping, from_off, to_off - from_off);
57072 +
57073 +       ext = extent_item(coord) + from;
57074 +       offset = (get_key_offset(&min_item_key) + extent_size(coord, from)) >> PAGE_CACHE_SHIFT;
57075 +       /* NOTE(review): page indices are compared with extent widths below - presumably block size equals page size; confirm */
57076 +       assert("vs-1551", from_off >= offset);
57077 +       assert("vs-1552", from_off - offset <= extent_get_width(ext));
57078 +       skip = from_off - offset;       /* part of unit @from that lies before the killed range */
57079 +       offset = from_off;
57080 +       /* walk the units from @from, handling [offset, to_off) one unit at a time; skip is non-zero for the first unit only */
57081 +       while (offset < to_off) {
57082 +               length = extent_get_width(ext) - skip;
57083 +               if (state_of_extent(ext) == HOLE_EXTENT) {
57084 +                       skip = 0;
57085 +                       offset += length;
57086 +                       ext ++;
57087 +                       continue;
57088 +               }
57089 +
57090 +               if (offset + length > to_off) {
57091 +                       length = to_off - offset;
57092 +               }
57093 +               /* NOTE(review): inode is used here without the NULL check applied before reiser4_invalidate_pages() above */
57094 +               DQUOT_FREE_BLOCK(inode, length);
57095 +
57096 +               if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
57097 +                       /* some jnodes corresponding to this unallocated extent */
57098 +                       fake_allocated2free(length,
57099 +                                           0 /* unformatted */);
57100 +
57101 +                       skip = 0;
57102 +                       offset += length;
57103 +                       ext ++;
57104 +                       continue;
57105 +               }
57106 +
57107 +               assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
57108 +
57109 +               if (length != 0) {
57110 +                       start = extent_get_start(ext) + skip;
57111 +
57112 +                       /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed
57113 +                          immediately */
57114 +                       reiser4_dealloc_blocks(&start, &length, 0 /* not used */,
57115 +                                              BA_DEFER/* unformatted with defer */);
57116 +               }
57117 +               skip = 0;
57118 +               offset += length;
57119 +               ext ++;
57120 +       }
57121 +       return retval;
57122 +}
57123 +
57124 +/* item_plugin->b.kill_units: kill units @from..@to of the extent item at @coord (or the key range given in @kdata->params), shrinking a partially covered boundary unit in place. Returns the size in bytes of the units removed as a whole */
57125 +reiser4_internal int
57126 +kill_units_extent(coord_t *coord, pos_in_node_t from, pos_in_node_t to, struct carry_kill_data *kdata,
57127 +                 reiser4_key *smallest_removed, reiser4_key *new_first)
57128 +{
57129 +       reiser4_extent *ext;
57130 +       reiser4_key item_key;
57131 +        pos_in_node_t count;
57132 +       reiser4_key from_key, to_key;
57133 +       const reiser4_key *pfrom_key, *pto_key;
57134 +       loff_t off;
57135 +       int result;
57136 +
57137 +       assert("vs-1541", ((kdata->params.from_key == NULL && kdata->params.to_key == NULL) ||
57138 +                          (kdata->params.from_key != NULL && kdata->params.to_key != NULL)));
57139 +
57140 +       if (kdata->params.from_key) {
57141 +               pfrom_key = kdata->params.from_key;
57142 +               pto_key = kdata->params.to_key;
57143 +       } else {
57144 +               coord_t dup;
57145 +
57146 +               /* calculate key range of kill */
57147 +               assert("vs-1549", from == coord->unit_pos);
57148 +               unit_key_by_coord(coord, &from_key);
57149 +               pfrom_key = &from_key;
57150 +
57151 +               coord_dup(&dup, coord);
57152 +               dup.unit_pos = to;
57153 +               max_unit_key_by_coord(&dup, &to_key);
57154 +               pto_key = &to_key;
57155 +       }
57156 +
57157 +       item_key_by_coord(coord, &item_key);
57158 +
57159 +#if REISER4_DEBUG
57160 +       {
57161 +               reiser4_key max_item_key;
57162 +
57163 +               max_item_key_by_coord(coord, &max_item_key);
57164 +
57165 +               if (new_first) {
57166 +                       /* head of item is to be cut */
57167 +                       assert("vs-1542", keyeq(pfrom_key, &item_key));
57168 +                       assert("vs-1538", keylt(pto_key, &max_item_key));
57169 +               } else {
57170 +                       /* tail of item is to be cut */
57171 +                       assert("vs-1540", keygt(pfrom_key, &item_key));
57172 +                       assert("vs-1543", !keylt(pto_key, &max_item_key));
57173 +               }
57174 +       }
57175 +#endif
57176 +
57177 +       if (smallest_removed)
57178 +               *smallest_removed = *pfrom_key;
57179 +
57180 +       if (new_first) {
57181 +               /* item head is cut. Item key will change. This new key is calculated here */
57182 +               assert("vs-1556", (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) == (PAGE_CACHE_SIZE - 1));
57183 +               *new_first = *pto_key;
57184 +               set_key_offset(new_first, get_key_offset(new_first) + 1);
57185 +       }
57186 +
57187 +       count = to - from + 1;
57188 +       result = kill_hook_extent(coord, from, count, kdata);   /* the hook reports which end of the item is killed */
57189 +       if (result == ITEM_TAIL_KILLED) {
57190 +               assert("vs-1553", get_key_offset(pfrom_key) >= get_key_offset(&item_key) + extent_size(coord, from));
57191 +               off = get_key_offset(pfrom_key) - (get_key_offset(&item_key) + extent_size(coord, from));       /* bytes of unit @from that stay */
57192 +               if (off) {
57193 +                       /* unit @from is to be cut partially. Its width decreases */
57194 +                       ext = extent_item(coord) + from;
57195 +                       extent_set_width(ext, (off + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT);
57196 +                       count --;
57197 +               }
57198 +       } else {
57199 +               __u64 max_to_offset;
57200 +               __u64 rest;
57201 +
57202 +               assert("vs-1575", result == ITEM_HEAD_KILLED);
57203 +               assert("", from == 0);
57204 +               assert("", ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE - 1)) == 0);
57205 +               assert("", get_key_offset(pto_key) + 1 > get_key_offset(&item_key) + extent_size(coord, to));
57206 +               max_to_offset = get_key_offset(&item_key) + extent_size(coord, to + 1) - 1;
57207 +               assert("", get_key_offset(pto_key) <= max_to_offset);
57208 +
57209 +               rest = (max_to_offset - get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT;   /* becomes the new width of unit @to */
57210 +               if (rest) {
57211 +                       /* unit @to is to be cut partially */
57212 +                       ext = extent_item(coord) + to;
57213 +
57214 +                       assert("", extent_get_width(ext) > rest);
57215 +
57216 +                       if (state_of_extent(ext) == ALLOCATED_EXTENT)
57217 +                               extent_set_start(ext, extent_get_start(ext) + (extent_get_width(ext) - rest));
57218 +
57219 +                       extent_set_width(ext, rest);
57220 +                       count --;
57221 +               }
57222 +       }
57223 +       return count * sizeof(reiser4_extent);
57224 +}
57225 +
57226 +/* item_plugin->b.cut_units
57227 +   this is too similar to kill_units_extent, but kill_hook_extent() is not called here. Returns the size in bytes of the units removed as a whole */
57228 +reiser4_internal int
57229 +cut_units_extent(coord_t *coord, pos_in_node_t from, pos_in_node_t to, struct carry_cut_data *cdata,
57230 +                reiser4_key *smallest_removed, reiser4_key *new_first)
57231 +{
57232 +       reiser4_extent *ext;
57233 +       reiser4_key item_key;
57234 +        pos_in_node_t count;
57235 +       reiser4_key from_key, to_key;
57236 +       const reiser4_key *pfrom_key, *pto_key;
57237 +       loff_t off;
57238 +
57239 +       assert("vs-1541", ((cdata->params.from_key == NULL && cdata->params.to_key == NULL) ||
57240 +                          (cdata->params.from_key != NULL && cdata->params.to_key != NULL)));
57241 +
57242 +       if (cdata->params.from_key) {
57243 +               pfrom_key = cdata->params.from_key;
57244 +               pto_key = cdata->params.to_key;
57245 +       } else {
57246 +               coord_t dup;
57247 +
57248 +               /* calculate key range of cut */
57249 +               coord_dup(&dup, coord);
57250 +               dup.unit_pos = from;
57251 +               unit_key_by_coord(&dup, &from_key);
57252 +
57253 +               dup.unit_pos = to;
57254 +               max_unit_key_by_coord(&dup, &to_key);
57255 +
57256 +               pfrom_key = &from_key;
57257 +               pto_key = &to_key;
57258 +       }
57259 +       /* the cut range must start on a page boundary and end on the last byte of a page */
57260 +       assert("vs-1555", (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
57261 +       assert("vs-1556", (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) == (PAGE_CACHE_SIZE - 1));
57262 +
57263 +       item_key_by_coord(coord, &item_key);
57264 +
57265 +#if REISER4_DEBUG
57266 +       {
57267 +               reiser4_key max_item_key;
57268 +
57269 +               assert("vs-1584", get_key_locality(pfrom_key) ==  get_key_locality(&item_key));
57270 +               assert("vs-1585", get_key_type(pfrom_key) ==  get_key_type(&item_key));
57271 +               assert("vs-1586", get_key_objectid(pfrom_key) ==  get_key_objectid(&item_key));
57272 +               assert("vs-1587", get_key_ordering(pfrom_key) ==  get_key_ordering(&item_key));
57273 +
57274 +               max_item_key_by_coord(coord, &max_item_key);
57275 +
57276 +               if (new_first != NULL) {
57277 +                       /* head of item is to be cut */
57278 +                       assert("vs-1542", keyeq(pfrom_key, &item_key));
57279 +                       assert("vs-1538", keylt(pto_key, &max_item_key));
57280 +               } else {
57281 +                       /* tail of item is to be cut */
57282 +                       assert("vs-1540", keygt(pfrom_key, &item_key));
57283 +                       assert("vs-1543", keyeq(pto_key, &max_item_key));
57284 +               }
57285 +       }
57286 +#endif
57287 +
57288 +       if (smallest_removed)
57289 +               *smallest_removed = *pfrom_key;
57290 +
57291 +       if (new_first) {
57292 +               /* item head is cut. Item key will change. This new key is calculated here */
57293 +               *new_first = *pto_key;
57294 +               set_key_offset(new_first, get_key_offset(new_first) + 1);
57295 +       }
57296 +
57297 +       count = to - from + 1;
57298 +
57299 +       assert("vs-1553", get_key_offset(pfrom_key) >= get_key_offset(&item_key) + extent_size(coord, from));
57300 +       off = get_key_offset(pfrom_key) - (get_key_offset(&item_key) + extent_size(coord, from));       /* bytes of unit @from that stay */
57301 +       if (off) {
57302 +               /* tail of unit @from is to be cut partially. Its width decreases */
57303 +               assert("vs-1582", new_first == NULL);
57304 +               ext = extent_item(coord) + from;
57305 +               extent_set_width(ext, off >> PAGE_CACHE_SHIFT);
57306 +               count --;
57307 +       }
57308 +
57309 +       assert("vs-1554", get_key_offset(pto_key) <= get_key_offset(&item_key) + extent_size(coord, to + 1) - 1);
57310 +       off = (get_key_offset(&item_key) + extent_size(coord, to + 1) - 1) - get_key_offset(pto_key);
57311 +       if (off) {
57312 +               /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased
57313 +                  (allocated extents only) and width decreased. */
57314 +               assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0);
57315 +               ext = extent_item(coord) + to;
57316 +               if (state_of_extent(ext) == ALLOCATED_EXTENT)
57317 +                       extent_set_start(ext, extent_get_start(ext) + (extent_get_width(ext) - (off >> PAGE_CACHE_SHIFT)));
57318 +
57319 +               extent_set_width(ext, (off >> PAGE_CACHE_SHIFT));
57320 +               count --;
57321 +       }
57322 +       return count * sizeof(reiser4_extent);
57323 +}
57324 +
57325 +/* item_plugin->b.unit_key: store in @key the key of the unit @coord is set to, i.e. the item key with its offset increased by extent_size(coord, coord->unit_pos). Returns @key */
57326 +reiser4_internal reiser4_key *
57327 +unit_key_extent(const coord_t *coord, reiser4_key *key)
57328 +{
57329 +       assert("vs-300", coord_is_existing_unit(coord));
57330 +
57331 +       item_key_by_coord(coord, key);
57332 +       set_key_offset(key, (get_key_offset(key) + extent_size(coord, coord->unit_pos)));
57333 +
57334 +       return key;
57335 +}
57336 +
57337 +/* item_plugin->b.max_unit_key: store in @key the item key with its offset increased by extent_size(coord, coord->unit_pos + 1) - 1, i.e. one less than the offset unit_key_extent() computes for the next unit position. Returns @key */
57338 +reiser4_internal reiser4_key *
57339 +max_unit_key_extent(const coord_t *coord, reiser4_key *key)
57340 +{
57341 +       assert("vs-300", coord_is_existing_unit(coord));
57342 +
57343 +       item_key_by_coord(coord, key);
57344 +       set_key_offset(key, (get_key_offset(key) + extent_size(coord, coord->unit_pos + 1) - 1));
57345 +       return key;
57346 +}
57347 +
57348 +/* item_plugin->b.estimate
57349 +   item_plugin->b.item_data_by_flow */
57350 +
57351 +#if REISER4_DEBUG
57352 +
57353 +/* item_plugin->b.check
57354 +   used for debugging, every item should have here the most complete
57355 +   possible check of the consistency of the item that the inventor can
57356 +   construct. Returns 0 if the item passes, -1 with *@error set to a message otherwise
57357 +*/
57358 +int
57359 +check_extent(const coord_t *coord /* coord of item to check */ ,
57360 +            const char **error /* where to store error message */ )
57361 +{
57362 +       reiser4_extent *ext, *first;
57363 +       unsigned i, j;
57364 +       reiser4_block_nr start, width, blk_cnt;
57365 +       unsigned num_units;
57366 +       reiser4_tree *tree;
57367 +       oid_t oid;
57368 +       reiser4_key key;
57369 +       coord_t scan;
57370 +
57371 +       assert("vs-933", REISER4_DEBUG);
57372 +
57373 +       if (znode_get_level(coord->node) != TWIG_LEVEL) {
57374 +               *error = "Extent on the wrong level";
57375 +               return -1;
57376 +       }
57377 +       if (item_length_by_coord(coord) % sizeof (reiser4_extent) != 0) {
57378 +               *error = "Wrong item size";
57379 +               return -1;
57380 +       }
57381 +       ext = first = extent_item(coord);
57382 +       blk_cnt = reiser4_block_count(reiser4_get_current_sb());
57383 +       num_units = coord_num_units(coord);
57384 +       tree = znode_get_tree(coord->node);     /* tree and oid are read only by the disabled (#if 0) check below */
57385 +       item_key_by_coord(coord, &key);
57386 +       oid = get_key_objectid(&key);
57387 +       coord_dup(&scan, coord);
57388 +
57389 +       for (i = 0; i < num_units; ++i, ++ext) {
57390 +               __u64 index;
57391 +
57392 +               scan.unit_pos = i;
57393 +               index = extent_unit_index(&scan);
57394 +
57395 +#if 0
57396 +               /* check that all jnodes are present for the unallocated
57397 +                * extent */
57398 +               if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
57399 +                       for (j = 0; j < extent_get_width(ext); j ++) {
57400 +                               jnode *node;
57401 +
57402 +                               node = jlookup(tree, oid, index + j);
57403 +                               if (node == NULL) {
57404 +                                       print_coord("scan", &scan, 0);
57405 +                                       *error = "Jnode missing";
57406 +                                       return -1;
57407 +                               }
57408 +                               jput(node);
57409 +                       }
57410 +               }
57411 +#endif
57412 +
57413 +               start = extent_get_start(ext);
57414 +               if (start < 2)  /* NOTE(review): start values 0 and 1 presumably mark hole and unallocated extents - confirm */
57415 +                       continue;
57416 +               /* extent is allocated one */
57417 +               width = extent_get_width(ext);
57418 +               if (start >= blk_cnt) {
57419 +                       *error = "Start too large";
57420 +                       return -1;
57421 +               }
57422 +               if (start + width > blk_cnt) {
57423 +                       *error = "End too large";
57424 +                       return -1;
57425 +               }
57426 +               /* make sure that this extent does not overlap with other
57427 +                  allocated extents that precede it in this item */
57428 +               for (j = 0; j < i; j++) {
57429 +                       if (state_of_extent(first + j) != ALLOCATED_EXTENT)
57430 +                               continue;
57431 +                       if (!((extent_get_start(ext) >= extent_get_start(first + j) + extent_get_width(first + j))
57432 +                             || (extent_get_start(ext) + extent_get_width(ext) <= extent_get_start(first + j)))) {
57433 +                               *error = "Extent overlaps with others";
57434 +                               return -1;
57435 +                       }
57436 +               }
57437 +
57438 +       }
57439 +
57440 +       return 0;
57441 +}
57442 +
57443 +#endif /* REISER4_DEBUG */
57444 +
57445 +/*
57446 +   Local variables:
57447 +   c-indentation-style: "K&R"
57448 +   mode-name: "LC"
57449 +   c-basic-offset: 8
57450 +   tab-width: 8
57451 +   fill-column: 120
57452 +   scroll-step: 1
57453 +   End:
57454 +*/
57455 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/extent_repack_ops.c linux-2.6.8-rc3-a/fs/reiser4/plugin/item/extent_repack_ops.c
57456 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/extent_repack_ops.c  1970-01-01 03:00:00.000000000 +0300
57457 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/extent_repack_ops.c        2004-08-05 21:20:52.964685523 +0400
57458 @@ -0,0 +1,446 @@
57459 +/* Copyright 2003 by Hans Reiser. */
57460 +
57461 +#include "item.h"
57462 +#include "../../key.h"
57463 +#include "../../super.h"
57464 +#include "../../carry.h"
57465 +#include "../../inode.h"
57466 +#include "../../page_cache.h"
57467 +#include "../../emergency_flush.h"
57468 +#include "../../prof.h"
57469 +#include "../../flush.h"
57470 +#include "../../tap.h"
57471 +#include "../object.h"
57472 +
57473 +#include "../../repacker.h"
57474 +#include "extent.h"
57475 +/* Take the key of the unit at @tap->coord and pass it, with @result, to get_reiser4_inode_by_key(); returns its result. */
57476 +static int get_reiser4_inode_by_tap (struct inode ** result, tap_t * tap)
57477 +{
57478 +       reiser4_key unit_key;
57479 +
57480 +       unit_key_by_coord(tap->coord, &unit_key);
57481 +       return get_reiser4_inode_by_key(result, &unit_key);
57482 +}
57483 +/* Grab the page at @index of @inode's mapping and return jnode_of_page() for it, or ERR_PTR(-ENOMEM) without a page. */
57484 +static jnode * get_jnode_by_mapping (struct inode * inode, long index)
57485 +{
57486 +       jnode * result;
57487 +       struct page * pg = grab_cache_page(inode->i_mapping, index);
57488 +
57489 +       if (!pg)
57490 +               return ERR_PTR(-ENOMEM);
57491 +       result = jnode_of_page(pg);
57492 +       /* the page was only needed to reach the jnode: unlock and release it before returning */
57493 +       unlock_page(pg);
57494 +       page_cache_release(pg);
57495 +       return result;
57496 +}
57497 +/* Capture @node, make it dirty, flag it JNODE_REPACK and dirty its page. Returns 0 or the try_capture()/jload() error. */
57498 +static int mark_jnode_for_repacking (jnode * node)
57499 +{
57500 +       int ret = 0;
57501 +
57502 +       LOCK_JNODE(node);
57503 +       ret = try_capture(node, ZNODE_WRITE_LOCK, 0, 0/* no can_coc */);
57504 +       if (ret) {
57505 +               UNLOCK_JNODE(node);
57506 +               return ret;
57507 +       }
57508 +       /* capture succeeded: dirty the jnode while its lock is still held */
57509 +       jnode_make_dirty_locked(node);
57510 +       UNLOCK_JNODE(node);
57511 +       JF_SET(node, JNODE_REPACK);
57512 +       /* jload()/jrelse() bracket the access to the jnode's page, which is dirtied under the page lock */
57513 +       ret = jload(node);
57514 +       if (ret == 0) {
57515 +               struct page * page;
57516 +
57517 +               page = jnode_page(node);
57518 +               lock_page(page);
57519 +               set_page_dirty_internal(page, 0);
57520 +               unlock_page(page);
57521 +               jrelse(node);
57522 +       }
57523 +       /* NOTE(review): when jload() fails the node has already been captured, dirtied and flagged JNODE_REPACK */
57524 +       return ret;
57525 +}
57526 +
57527 +/*
57528 +   Mark jnodes of the extent unit at @tap->coord for repacking.
57529 +   @tap : lock, coord and load status for the tree traversal position,
57530 +   @max_nr_marked: a maximum number of nodes which can be marked for repacking,
57531 +   @return: error code if < 0, number of marked nodes otherwise.
57532 +
57533 +   A hole extent is skipped (0 is returned). Nodes which already carry JNODE_REPACK are
57534 +   passed over without being counted. Marking stops at the first error, and that error
57535 +   is returned even if some nodes were marked before it.
57536 +   The inode obtained for the lookup is released with iput() before returning.
57537 +*/
57538 +reiser4_internal int mark_extent_for_repacking (tap_t * tap, int max_nr_marked)
57539 +{
57540 +       coord_t * coord = tap->coord;
57541 +       reiser4_extent * unit = extent_by_coord(coord);
57542 +       reiser4_block_nr unit_width, unit_start;
57543 +       unsigned long first_index, pos;
57544 +       struct inode * inode;
57545 +       int marked = 0;
57546 +       int ret;
57547 +
57548 +       /* nothing is marked for a hole */
57549 +       if (state_of_extent(unit) == HOLE_EXTENT)
57550 +               return 0;
57551 +
57552 +       unit_width = extent_get_width(unit);
57553 +       unit_start = extent_get_start(unit);
57554 +       first_index = extent_unit_index(coord);
57555 +
57556 +       /* the jnodes are reached through the page mapping of this inode */
57557 +       ret = get_reiser4_inode_by_tap(&inode, tap);
57558 +       if (ret)
57559 +               return ret;
57560 +
57561 +       /* ret is 0 here; the loop condition re-checks it after every node */
57562 +       for (pos = 0; ret == 0 && marked < max_nr_marked && pos < unit_width; pos ++)
57563 +       {
57564 +               jnode * node = get_jnode_by_mapping(inode, first_index + pos);
57565 +
57566 +               if (IS_ERR(node)) {
57567 +                       /* there is no jnode to jput() in this case */
57568 +                       ret = PTR_ERR(node);
57569 +                       break;
57570 +               }
57571 +
57572 +               /* Freshly created jnode has no block number set. */
57573 +               if (node->blocknr == 0) {
57574 +                       reiser4_block_nr blk = unit_start + pos;
57575 +
57576 +                       jnode_set_block(node, &blk);
57577 +                       node->parent_item_id = EXTENT_POINTER_ID;
57578 +               }
57579 +
57580 +               /* nodes which already carry JNODE_REPACK are left alone */
57581 +               if (!JF_ISSET(node, JNODE_REPACK)) {
57582 +                       /* Check whether the node is already read. */
57583 +                       if (!JF_ISSET(node, JNODE_PARSED))
57584 +                               ret = jstartio(node);
57585 +                       if (ret == 0)
57586 +                               ret = mark_jnode_for_repacking(node);
57587 +                       if (ret == 0)
57588 +                               marked ++;
57589 +               }
57590 +               /* the jnode is released whether or not it was marked */
57591 +               jput(node);
57592 +       }
57593 +
57594 +       /* done with the inode obtained above */
57595 +       iput(inode);
57596 +       if (ret)
57597 +               return ret;
57598 +       return marked;
57599 +}
57600 +
57601 +/* A node may be relocated by the repacker only while neither JNODE_OVRWR nor JNODE_RELOC is set on it. */
57602 +static int relocatable (jnode * check)
57603 +{
57604 +       return !(JF_ISSET(check, JNODE_OVRWR) || JF_ISSET(check, JNODE_RELOC));
57605 +}
57606 +/* Make the last @end_part_width blocks of the extent unit at @coord point to the blocks starting at @end_part_start. */
57607 +static int replace_end_of_extent (coord_t * coord, reiser4_block_nr end_part_start,
57608 +                                 reiser4_block_nr end_part_width, int * all_replaced)
57609 +{
57610 +       reiser4_extent * ext;
57611 +       reiser4_block_nr ext_start;
57612 +       reiser4_block_nr ext_width;
57613 +
57614 +       reiser4_item_data item;
57615 +       reiser4_extent new_ext, replace_ext;
57616 +       reiser4_block_nr replace_ext_width;
57617 +       reiser4_key key;
57618 +
57619 +       int ret;
57620 +
57621 +       assert ("zam-959", item_is_extent(coord));
57622 +
57623 +       ext = extent_by_coord(coord);
57624 +       ext_start = extent_get_start(ext);
57625 +       ext_width = extent_get_width(ext);
57626 +
57627 +       assert ("zam-960", end_part_width <= ext_width);
57628 +       /* replace_ext_width: number of leading blocks of the unit which keep their current start */
57629 +       replace_ext_width = ext_width - end_part_width;
57630 +       if (replace_ext_width == 0) {
57631 +               set_extent(ext, end_part_start, end_part_width);
57632 +               znode_make_dirty(coord->node);
57633 +               /* End part of extent is equal to the whole extent: it was rewritten in place, nothing is inserted. */
57634 +               * all_replaced = 1;
57635 +               return 0;
57636 +       }
57637 +       /* otherwise @replace_ext describes the shortened head and @new_ext the end part with its new start */
57638 +       set_extent(&replace_ext, ext_start, replace_ext_width);
57639 +       set_extent(&new_ext, end_part_start, end_part_width);
57640 +       /* key for the end part: key of the unit with its offset advanced by replace_ext_width * current_blocksize */
57641 +       unit_key_by_coord(coord, &key);
57642 +       set_key_offset(&key, get_key_offset(&key) + replace_ext_width * current_blocksize);
57643 +
57644 +       {
57645 +               reiser4_context * ctx = get_current_context();
57646 +               reiser4_super_info_data * sinfo = get_super_private(ctx->super);
57647 +               __u64 estimated;
57648 +               __u64 were_grabbed;
57649 +               /* remember the grabbed counter so that only its growth is handed to grabbed2free() afterwards */
57650 +               were_grabbed = ctx->grabbed_blocks;
57651 +               estimated = estimate_one_insert_item(&get_super_private(ctx->super)->tree);
57652 +
57653 +               /* grab space for operations on internal levels. */
57654 +               ret = reiser4_grab_space(
57655 +                       estimated, BA_FORCE | BA_RESERVED | BA_PERMANENT | BA_FORMATTED);
57656 +               if (ret)
57657 +                       return ret;
57658 +               /* NOTE(review): *all_replaced is not written on this path; it keeps whatever the caller stored in it */
57659 +               ret =  replace_extent(
57660 +                       coord, znode_lh(coord->node), &key,
57661 +                       init_new_extent(&item, &new_ext, 1), &replace_ext,
57662 +                       COPI_DONT_SHIFT_LEFT, 0);
57663 +
57664 +               /* release grabbed space if it was not used. */
57665 +               assert ("zam-988", ctx->grabbed_blocks >= were_grabbed);
57666 +               grabbed2free(ctx, sinfo, ctx->grabbed_blocks - were_grabbed);
57667 +       }
57668 +
57669 +       return ret;
57670 +}
57671 +/* Cut the last @width blocks of the extent unit at @coord off into a unit of their own via replace_end_of_extent(). */
57672 +static int make_new_extent_at_end (coord_t * coord, reiser4_block_nr width, int * all_replaced)
57673 +{
57674 +       reiser4_extent * unit;
57675 +       reiser4_block_nr ext_width;
57676 +       reiser4_block_nr tail_start;
57677 +
57678 +       assert ("zam-961", item_is_extent(coord));
57679 +
57680 +       unit = extent_by_coord(coord);
57681 +       ext_width = extent_get_width(unit);
57682 +
57683 +       assert ("zam-962", width < ext_width);
57684 +
57685 +       /* Only an allocated unit gets the start of its tail advanced by the width of the
57686 +          remaining head; for any other state the start stored in the unit is passed on
57687 +          as is. */
57688 +       tail_start = extent_get_start(unit);
57689 +       if (state_of_extent(unit) == ALLOCATED_EXTENT)
57690 +               tail_start += ext_width - width;
57691 +
57692 +       return replace_end_of_extent(coord, tail_start, width, all_replaced);
57693 +}
57694 +/* Fetch start block, width and extent_unit_index() of the extent unit at @coord into @start, @width and @ind. */
57695 +static void parse_extent(coord_t * coord, reiser4_block_nr * start, reiser4_block_nr * width, long * ind)
57696 +{
57697 +       reiser4_extent * unit = extent_by_coord(coord);
57698 +
57699 +       /* the index is derived from the coord, start and width are read from the unit */
57700 +       *ind = extent_unit_index(coord);
57701 +       *start = extent_get_start(unit);
57702 +       *width = extent_get_width(unit);
57703 +}
57704 +/* Split the not relocatable end off the extent unit at @coord; sets *@done when no node of the unit is relocatable. */
57705 +static int skip_not_relocatable_extent(struct inode * inode, coord_t * coord, int * done)
57706 +{
57707 +       reiser4_block_nr ext_width, ext_start;
57708 +       long ext_index, pos;
57709 +
57710 +       assert("zam-985", state_of_extent(extent_by_coord(coord)));
57711 +       parse_extent(coord, &ext_start, &ext_width, &ext_index);
57712 +
57713 +       for (pos = ext_width - 1; pos >= 0; pos --) {
57714 +               int movable;
57715 +               jnode * node = get_jnode_by_mapping(inode, pos + ext_index);
57716 +
57717 +               if (IS_ERR(node))
57718 +                       return PTR_ERR(node);
57719 +
57720 +               if (node->blocknr == 0) {
57721 +                       reiser4_block_nr blk = ext_start + pos;
57722 +
57723 +                       jnode_set_block(node, &blk);
57724 +                       node->parent_item_id = EXTENT_POINTER_ID;
57725 +               }
57726 +
57727 +               movable = relocatable(node);
57728 +               jput(node);
57729 +               if (!movable)
57730 +                       continue;
57731 +               /* first relocatable node seen from the end: split off what lies behind it, if anything */
57732 +               if (pos < ext_width - 1)
57733 +                       return make_new_extent_at_end(coord, ext_width - pos - 1, done);
57734 +               return 0;
57735 +       }
57736 +       *done = 1;
57737 +       return 0;
57738 +}
57739 +
57740 +/* Allocate up to *@len new blocks for the end of the extent unit at @coord, repoint its jnodes and rewrite the unit. */
57741 +static int relocate_extent (struct inode * inode, coord_t * coord, reiser4_blocknr_hint * hint,
57742 +                           int *done, reiser4_block_nr * len)
57743 +{
57744 +       reiser4_block_nr ext_width, ext_start;
57745 +       long ext_index, reloc_ind;
57746 +       reiser4_block_nr new_ext_width, new_ext_start, new_block;
57747 +       int unallocated_flg;
57748 +       int ret = 0;
57749 +
57750 +       parse_extent(coord, &ext_start, &ext_width, &ext_index);
57751 +       assert("zam-974", *len != 0);
57752 +
57753 +       unallocated_flg = (state_of_extent(extent_by_coord(coord)) == UNALLOCATED_EXTENT);
57754 +       hint->block_stage = unallocated_flg ? BLOCK_UNALLOCATED : BLOCK_FLUSH_RESERVED;
57755 +       /* *@len is the requested width; new_ext_width is passed by pointer and is what gets used and reported below */
57756 +       new_ext_width = *len;
57757 +       ret = reiser4_alloc_blocks(hint, &new_ext_start, &new_ext_width, BA_PERMANENT);
57758 +       if (ret)
57759 +               return ret;
57760 +
57761 +       hint->blk = new_ext_start;
57762 +       if (!unallocated_flg) {
57763 +               reiser4_block_nr dealloc_ext_start;
57764 +
57765 +               dealloc_ext_start = ext_start + ext_width - new_ext_width;
57766 +               ret = reiser4_dealloc_blocks(&dealloc_ext_start, &new_ext_width, 0,
57767 +                                            BA_DEFER | BA_PERMANENT);
57768 +               if (ret)
57769 +                       return ret;
57770 +       }
57771 +       /* NOTE(review): the error returns above and below do not give the newly allocated blocks back here */
57772 +       new_block = new_ext_start;
57773 +       for (reloc_ind = ext_width - new_ext_width; reloc_ind < ext_width; reloc_ind ++)
57774 +       {
57775 +               jnode * check;
57776 +
57777 +               check = get_jnode_by_mapping(inode, ext_index + reloc_ind);
57778 +               if (IS_ERR(check))
57779 +                       return PTR_ERR(check);
57780 +
57781 +               assert("zam-975", relocatable(check));
57782 +               assert("zam-986", check->blocknr != 0);
57783 +               /* consecutive nodes of the end part get consecutive blocks of the new range */
57784 +               jnode_set_block(check, &new_block);
57785 +               check->parent_item_id = EXTENT_POINTER_ID;
57786 +               new_block ++;
57787 +
57788 +               JF_SET(check, JNODE_RELOC);
57789 +               JF_SET(check, JNODE_REPACK);
57790 +
57791 +               jput(check);
57792 +       }
57793 +       /* finally make the unit itself describe the new blocks; *@len reports the width that was relocated */
57794 +       ret = replace_end_of_extent(coord, new_ext_start, new_ext_width, done);
57795 +       *len = new_ext_width;
57796 +       return ret;
57797 +}
57798 +/* Count in *@len and mark for repacking the relocatable nodes at the end of the unit at @coord, using up *@nr_reserved. */
57799 +static int find_relocatable_extent (struct inode * inode, coord_t * coord,
57800 +                                   int * nr_reserved, reiser4_block_nr * len)
57801 +{
57802 +       reiser4_block_nr ext_width, ext_start;
57803 +       long ext_index, reloc_end;
57804 +       jnode * check;
57805 +       int ret = 0;
57806 +
57807 +       *len = 0;
57808 +       parse_extent(coord, &ext_start, &ext_width, &ext_index);
57809 +
57810 +       for (reloc_end = ext_width - 1;
57811 +            reloc_end >= 0 && *nr_reserved > 0; reloc_end --)
57812 +       {
57813 +               assert("zam-980", get_current_context()->grabbed_blocks >= *nr_reserved);
57814 +
57815 +               check = get_jnode_by_mapping(inode, reloc_end + ext_index);
57816 +               if (IS_ERR(check))
57817 +                       return PTR_ERR(check);
57818 +
57819 +               /* a jnode without block number gets block and parent item id, as in the other loops of this file */
57820 +               if (check->blocknr == 0) {
57821 +                       reiser4_block_nr block;
57822 +                       block = ext_start + reloc_end;
57823 +                       jnode_set_block(check, &block);
57824 +
57825 +                       check->parent_item_id = EXTENT_POINTER_ID;
57826 +               }
57827 +
57828 +               if (!relocatable(check)) {
57829 +                       assert("zam-973", reloc_end < ext_width - 1);
57830 +                       jput(check);
57831 +                       break;
57832 +               }
57833 +               /* add node to transaction. */
57834 +               ret = mark_jnode_for_repacking(check);
57835 +               jput(check);
57836 +               if (ret)
57837 +                       break;
57838 +
57839 +               (*len) ++;
57840 +               (*nr_reserved) --;
57841 +       }
57842 +       return ret;
57843 +}
57844 +/* One relocation step on the unit at @coord: split off the not relocatable end, find a relocatable end part, relocate it. */
57845 +static int find_and_relocate_end_of_extent (
57846 +       struct inode * inode, coord_t * coord,
57847 +       struct repacker_cursor * cursor, int * done)
57848 +{
57849 +       reiser4_block_nr nr_found;
57850 +       int result;
57851 +
57852 +       result = skip_not_relocatable_extent(inode, coord, done);
57853 +       if (result != 0 || *done != 0)
57854 +               return result;
57855 +
57856 +       /* @cursor->count is decremented for every node found; nr_found receives their number */
57857 +       result = find_relocatable_extent(inode, coord, &cursor->count, &nr_found);
57858 +       if (result != 0)
57859 +               return result;
57860 +       if (nr_found == 0) {
57861 +               *done = 1;
57862 +               return 0;
57863 +       }
57864 +
57865 +       result = relocate_extent(inode, coord, &cursor->hint, done, &nr_found);
57866 +       if (result == 0)
57867 +               cursor->stats.jnodes_dirtied += (long)nr_found;
57868 +       return result;
57869 +}
57870 +
57871 +/* process (relocate) unformatted nodes in backward direction: from the end of the extent to its start. */
57872 +reiser4_internal int
57873 +process_extent_backward_for_repacking (tap_t * tap, struct repacker_cursor * cursor)
57874 +{
57875 +       coord_t * coord = tap->coord;
57876 +       struct inode * inode = NULL;
57877 +       int done = 0;
57878 +       int ret;
57879 +
57880 +       assert("zam-985", cursor->count > 0);
57881 +       if (state_of_extent(extent_by_coord(coord)) == HOLE_EXTENT)
57882 +               return 0;
57883 +
57884 +       /* if no inode was obtained there is nothing to iput(): return at once, as mark_extent_for_repacking() does */
57885 +       ret = get_reiser4_inode_by_tap(&inode, tap);
57886 +       if (ret)
57887 +               return ret;
57888 +
57889 +       while (!ret && !done)
57890 +               ret = find_and_relocate_end_of_extent(inode, coord, cursor, &done);
57891 +       iput(inode);
57892 +       return ret;
57893 +}
57894 +
57895 +/*
57896 +   Local variables:
57897 +   c-indentation-style: "K&R"
57898 +   mode-name: "LC"
57899 +   c-basic-offset: 8
57900 +   tab-width: 8
57901 +   fill-column: 120
57902 +   scroll-step: 1
57903 +   End:
57904 +*/
57905 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/internal.c linux-2.6.8-rc3-a/fs/reiser4/plugin/item/internal.c
57906 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/internal.c   1970-01-01 03:00:00.000000000 +0300
57907 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/internal.c 2004-08-05 21:20:53.476577553 +0400
57908 @@ -0,0 +1,411 @@
57909 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
57910 +
57911 +/* Implementation of internal-item plugin methods. */
57912 +
57913 +#include "../../forward.h"
57914 +#include "../../debug.h"
57915 +#include "../../dformat.h"
57916 +#include "../../key.h"
57917 +#include "../../coord.h"
57918 +#include "internal.h"
57919 +#include "item.h"
57920 +#include "../node/node.h"
57921 +#include "../plugin.h"
57922 +#include "../../jnode.h"
57923 +#include "../../znode.h"
57924 +#include "../../tree_walk.h"
57925 +#include "../../tree_mod.h"
57926 +#include "../../tree.h"
57927 +#include "../../super.h"
57928 +#include "../../block_alloc.h"
57929 +
57930 +/* see internal.h for explanation */
57931 +
57932 +/* plugin->u.item.b.mergeable: answers 0 for every pair of items, neither argument is looked at */
57933 +reiser4_internal int
57934 +mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
57935 +                  const coord_t * p2 UNUSED_ARG /* second item */ )
57936 +{
57937 +       /* internal items are not mergeable */
57938 +       return 0;
57939 +}
57940 +
57941 +/* ->lookup() method for internal items: compares the key of the unit at @coord with @key */
57942 +reiser4_internal lookup_result
57943 +lookup_internal(const reiser4_key * key /* key to look up */ ,
57944 +               lookup_bias bias UNUSED_ARG /* lookup bias */ ,
57945 +               coord_t * coord /* coord of item */ )
57946 +{
57947 +       reiser4_key ukey;
57948 +
57949 +       switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
57950 +       default:
57951 +               impossible("", "keycmp()?!"); /* no break here: control continues into the LESS_THAN case */
57952 +       case LESS_THAN:
57953 +               /* FIXME-VS: AFTER_ITEM used to be here. But with new coord
57954 +                  item plugin can not be taken using coord set this way */
57955 +               assert("vs-681", coord->unit_pos == 0);
57956 +               coord->between = AFTER_UNIT; /* no break here either: LESS_THAN is reported as found, too */
57957 +       case EQUAL_TO:
57958 +               return CBK_COORD_FOUND;
57959 +       case GREATER_THAN:
57960 +               return CBK_COORD_NOTFOUND;
57961 +       }
57962 +}
57963 +
57964 +/* return body of internal item at @coord as internal_item_layout; asserts that the item's plugin is NODE_POINTER_ID */
57965 +static internal_item_layout *
57966 +internal_at(const coord_t * coord      /* coord of
57967 +                                          * item */ )
57968 +{
57969 +       assert("nikita-607", coord != NULL);
57970 +       assert("nikita-1650", item_plugin_by_coord(coord) == item_plugin_by_id(NODE_POINTER_ID));
57971 +       return (internal_item_layout *) item_body_by_coord(coord);
57972 +}
57973 +/* store block number *@blocknr into the pointer field of the internal item at @coord, using cpu_to_dblock() */
57974 +reiser4_internal void
57975 +update_internal(const coord_t * coord, const reiser4_block_nr * blocknr)
57976 +{
57977 +       internal_item_layout *item = internal_at(coord);
57978 +       assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
57979 +
57980 +       cpu_to_dblock(*blocknr, &item->pointer);
57981 +}
57982 +
57983 +/* return child block number stored in the pointer field of the internal item at @coord, read with dblock_to_cpu() */
57984 +static reiser4_block_nr
57985 +pointer_at(const coord_t * coord /* coord of item */ )
57986 +{
57987 +       assert("nikita-608", coord != NULL);
57988 +       return dblock_to_cpu(&internal_at(coord)->pointer);
57989 +}
57990 +
57991 +/* get znode pointed to by internal @item: child_znode() with its last two arguments fixed to 1 and 0 */
57992 +static znode *
57993 +znode_at(const coord_t * item /* coord of item */ ,
57994 +        znode * parent /* parent node */)
57995 +{
57996 +       return child_znode(item, parent, 1, 0);
57997 +}
57998 +
57999 +/* store pointer from internal item into "block". Implementation of
58000 +    ->down_link() method. @key is only looked at by an assertion. */
58001 +reiser4_internal void
58002 +down_link_internal(const coord_t * coord /* coord of item */ ,
58003 +                  const reiser4_key * key UNUSED_ARG   /* key to get
58004 +                                                        * pointer for */ ,
58005 +                  reiser4_block_nr * block /* resulting block number */ )
58006 +{
58007 +       ON_DEBUG(reiser4_key item_key);
58008 +       /* item_key is declared through ON_DEBUG() and is used by assertion nikita-612 only */
58009 +       assert("nikita-609", coord != NULL);
58010 +       assert("nikita-611", block != NULL);
58011 +       assert("nikita-612", (key == NULL) ||
58012 +              /* twig horrors */
58013 +              (znode_get_level(coord->node) == TWIG_LEVEL) || keyle(item_key_by_coord(coord, &item_key), key));
58014 +       /* the result is the block number stored in the item, whatever @key is */
58015 +       *block = pointer_at(coord);
58016 +       assert("nikita-2960", reiser4_blocknr_is_sane(block));
58017 +}
58018 +
58019 +/* Store the block number kept in the internal item at @coord into @block, or 0 if blocknr_is_fake() holds for it. */
58020 +reiser4_internal int
58021 +utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG, reiser4_block_nr * block)
58022 +{
58023 +       assert("jmacd-2059", coord != NULL);
58024 +
58025 +       *block = pointer_at(coord);
58026 +       assert("nikita-2961", reiser4_blocknr_is_sane(block));
58027 +
58028 +       /* a fake block number is reported to the caller as 0 */
58029 +       if (blocknr_is_fake(block))
58030 +               *block = 0;
58031 +
58032 +       return 0;
58033 +}
58034 +
58035 +/* Look the child znode up with zlook() by the block number stored at @coord and return its jnode in *@childp. */
58036 +reiser4_internal int
58037 +utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG, jnode ** childp)
58038 +{
58039 +       reiser4_block_nr block = pointer_at(coord);
58040 +       znode *found;
58041 +
58042 +       assert("jmacd-2059", childp != NULL);
58043 +       assert("nikita-2962", reiser4_blocknr_is_sane(&block));
58044 +
58045 +       found = zlook(znode_get_tree(coord->node), &block);
58046 +       if (IS_ERR(found))
58047 +               return PTR_ERR(found);
58048 +
58049 +       /* NOTE(review): a NULL result of zlook() is not treated as an error here; ZJNODE() is applied
58050 +          to it as is -- presumably that yields a NULL jnode, confirm against the callers */
58051 +       *childp = ZJNODE(found);
58052 +
58053 +       return 0;
58054 +}
58055 +/* debugging helper: walk ->right pointers from @left towards @right, asserting that the sibling links agree */
58056 +static void check_link(znode *left, znode *right)
58057 +{
58058 +       znode *scan;
58059 +       /* the walk ends at @right, at a node flagged JNODE_RIP, or where the right link is not connected or NULL */
58060 +       for (scan = left; scan != right; scan = scan->right) {
58061 +               if (ZF_ISSET(scan, JNODE_RIP))
58062 +                       break;
58063 +               if (znode_is_right_connected(scan) && scan->right != NULL) {
58064 +                       if (ZF_ISSET(scan->right, JNODE_RIP))
58065 +                               break;
58066 +                       assert("nikita-3285",
58067 +                              znode_is_left_connected(scan->right));
58068 +                       assert("nikita-3265",
58069 +                              ergo(scan != left,
58070 +                                   ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
58071 +                       assert("nikita-3284", scan->right->left == scan);
58072 +               } else
58073 +                       break;
58074 +       }
58075 +}
58076 +
58077 +reiser4_internal int check__internal(const coord_t * coord, const char **error)
58078 +{
58079 +       reiser4_block_nr blk;
58080 +       znode *child;
58081 +       coord_t cpy;
58082 +
58083 +       blk = pointer_at(coord);
58084 +       if (!reiser4_blocknr_is_sane(&blk)) {
58085 +               *error = "Invalid pointer";
58086 +               return -1;
58087 +       }
58088 +       coord_dup(&cpy, coord);
58089 +       child = znode_at(&cpy, cpy.node);
58090 +       if (child != NULL) {
58091 +               znode *left_child;
58092 +               znode *right_child;
58093 +
58094 +               left_child = right_child = NULL;
58095 +
58096 +               assert("nikita-3256", znode_invariant(child));
58097 +               if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
58098 +                       left_child = znode_at(&cpy, cpy.node);
58099 +                       RLOCK_TREE(znode_get_tree(child));
58100 +                       if (left_child != NULL)
58101 +                               check_link(left_child, child);
58102 +                       RUNLOCK_TREE(znode_get_tree(child));
58103 +                       if (left_child != NULL)
58104 +                               zput(left_child);
58105 +               }
58106 +               coord_dup(&cpy, coord);
58107 +               if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
58108 +                       right_child = znode_at(&cpy, cpy.node);
58109 +                       RLOCK_TREE(znode_get_tree(child));
58110 +                       if (right_child != NULL)
58111 +                               check_link(child, right_child);
58112 +                       RUNLOCK_TREE(znode_get_tree(child));
58113 +                       if (right_child != NULL)
58114 +                               zput(right_child);
58115 +               }
58116 +               zput(child);
58117 +       }
58118 +       return 0;
58119 +}
58120 +
58121 +#if REISER4_DEBUG_OUTPUT
58122 +/* debugging aid: printk() @prefix and the block number stored in the internal item at
58123 +   @coord, formatted by sprint_address(). Only compiled under REISER4_DEBUG_OUTPUT. */
58124 +reiser4_internal void
58125 +print_internal(const char *prefix /* prefix to print */ ,
58126 +              coord_t * coord /* coord of item to print  */ )
58127 +{
58128 +       reiser4_block_nr blk;
58129 +
58130 +       blk = pointer_at(coord);
58131 +       assert("nikita-2963", reiser4_blocknr_is_sane(&blk));
58132 +       printk("%s: internal: %s\n", prefix, sprint_address(&blk));
58133 +}
58134 +#endif
58135 +
58136 +/* return true only if the block number stored in the item at @coord equals *@block */
58137 +/* Audited by: green(2002.06.14) */
58138 +reiser4_internal int
58139 +has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
58140 +                       const reiser4_block_nr * block  /* block number to
58141 +                                                        * check */ )
58142 +{
58143 +       assert("nikita-613", coord != NULL);
58144 +       assert("nikita-614", block != NULL);
58145 +
58146 +       return pointer_at(coord) == *block;
58147 +}
58148 +
58149 +/* hook called by ->create_item() method of node plugin after new internal
58150 +   item was just created.
58151 +
58152 +   This is point where pointer to new node is inserted into tree. Initialize
58153 +   parent pointer in child znode, insert child into sibling list and slum.
58154 +   Returns 0, or PTR_ERR() of what znode_at() returned for the child.
58155 +*/
58156 +reiser4_internal int
58157 +create_hook_internal(const coord_t * item /* coord of item */ ,
58158 +                    void *arg /* child's left neighbor, if any */ )
58159 +{
58160 +       znode *child;
58161 +
58162 +       assert("nikita-1252", item != NULL);
58163 +       assert("nikita-1253", item->node != NULL);
58164 +       assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
58165 +       assert("nikita-1450", item->unit_pos == 0);
58166 +       /* NOTE(review): only IS_ERR() is tested below; a NULL child would be dereferenced */
58167 +       child = znode_at(item, item->node);
58168 +       if (!IS_ERR(child)) {
58169 +               znode *left;
58170 +               int result = 0;
58171 +               reiser4_tree *tree;
58172 +
58173 +               left = arg;
58174 +               tree = znode_get_tree(item->node);
58175 +               WLOCK_DK(tree);
58176 +               WLOCK_TREE(tree);
58177 +               assert("nikita-1400", (child->in_parent.node == NULL) || (znode_above_root(child->in_parent.node)));
58178 +               ++ item->node->c_count;
58179 +               coord_to_parent_coord(item, &child->in_parent);
58180 +               sibling_list_insert_nolock(child, left);
58181 +               /* the child is expected to carry JNODE_ORPHAN until this point; the flag is cleared here */
58182 +               assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
58183 +               ZF_CLR(child, JNODE_ORPHAN);
58184 +
58185 +               ON_TRACE(TRACE_ZWEB, "create: %llx: %i [%llx]\n",
58186 +                        *znode_get_block(item->node), item->node->c_count,
58187 +                        *znode_get_block(child));
58188 +               /* the tree lock is dropped first; the rd key of the child is set while WLOCK_DK is still held */
58189 +               WUNLOCK_TREE(tree);
58190 +               if ((left != NULL) && !keyeq(znode_get_rd_key(left),
58191 +                                            znode_get_rd_key(child))) {
58192 +                       znode_set_rd_key(child, znode_get_rd_key(left));
58193 +               }
58194 +               WUNLOCK_DK(tree);
58195 +               zput(child);
58196 +               return result;
58197 +       } else
58198 +               return PTR_ERR(child);
58199 +}
58200 +
58201 +/* hook called by ->cut_and_kill() method of node plugin just before internal
58202 +   item is removed.
58203 +
58204 +   This is point where empty node is removed from the tree. Clear parent
58205 +   pointer in child; the child is asserted to carry JNODE_HEARD_BANSHEE already.
58206 +
58207 +   Node will be actually deleted later and in several stages:
58208 +
58209 +    . when last lock on this node will be released, node will be removed from
58210 +    the sibling list and its lock will be invalidated
58211 +
58212 +    . when last reference to this node will be dropped, bitmap will be updated
58213 +    and node will be actually removed from the memory.
58214 +
58215 +   Returns 0, PTR_ERR() of the znode_at() result, or RETERR(-EIO) if the child is not empty.
58216 +*/
58217 +reiser4_internal int
58218 +kill_hook_internal(const coord_t * item /* coord of item */ ,
58219 +                  pos_in_node_t from UNUSED_ARG /* start unit */ ,
58220 +                  pos_in_node_t count UNUSED_ARG /* stop unit */,
58221 +                  struct carry_kill_data *p UNUSED_ARG)
58222 +{
58223 +       znode *child;
58224 +       /* @from and @count are only looked at by the assertions below */
58225 +       assert("nikita-1222", item != NULL);
58226 +       assert("nikita-1224", from == 0);
58227 +       assert("nikita-1225", count == 1);
58228 +       /* NOTE(review): only IS_ERR() is tested below; a NULL child would reach node_is_empty() */
58229 +       child = znode_at(item, item->node);
58230 +       if (IS_ERR(child))
58231 +               return PTR_ERR(child);
58232 +       else if (node_is_empty(child)) {
58233 +               reiser4_tree *tree;
58234 +
58235 +               assert("nikita-1397", znode_is_write_locked(child));
58236 +               assert("nikita-1398", child->c_count == 0);
58237 +               assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
58238 +               /* under the tree write lock: reset the child's parent coord and drop the parent's child count */
58239 +               tree = znode_get_tree(item->node);
58240 +               WLOCK_TREE(tree);
58241 +               init_parent_coord(&child->in_parent, NULL);
58242 +               -- item->node->c_count;
58243 +               WUNLOCK_TREE(tree);
58244 +               ON_TRACE(TRACE_ZWEB, "kill: %llx: %i [%llx]\n",
58245 +                        *znode_get_block(item->node), item->node->c_count,
58246 +                        *znode_get_block(child));
58247 +
58248 +               zput(child);
58249 +               return 0;
58250 +       } else {
58251 +               warning("nikita-1223", "Cowardly refuse to remove link to non-empty node");
58252 +               print_znode("parent", item->node);
58253 +               print_znode("child", child);
58254 +               zput(child);
58255 +               return RETERR(-EIO);
58256 +       }
58257 +}
58258 +
58259 +/* hook called by ->shift() node plugin method when internal item was just
58260 +   moved from one node to another.
58261 +
58262 +   Update parent pointer in child and c_counts in old and new parent.
58263 +   Returns 0 (also when child_znode() gives NULL for the child), or PTR_ERR() of its result.
58264 +*/
58265 +reiser4_internal int
58266 +shift_hook_internal(const coord_t * item /* coord of item */ ,
58267 +                   unsigned from UNUSED_ARG /* start unit */ ,
58268 +                   unsigned count UNUSED_ARG /* stop unit */ ,
58269 +                   znode * old_node /* old parent */ )
58270 +{
58271 +       znode *child;
58272 +       znode *new_node;
58273 +       reiser4_tree *tree;
58274 +
58275 +       assert("nikita-1276", item != NULL);
58276 +       assert("nikita-1277", from == 0);
58277 +       assert("nikita-1278", count == 1);
58278 +       assert("nikita-1451", item->unit_pos == 0);
58279 +
58280 +       new_node = item->node;
58281 +       assert("nikita-2132", new_node != old_node);
58282 +       tree = znode_get_tree(item->node);
58283 +       child = child_znode(item, old_node, 1, 0);
58284 +       if (child == NULL)
58285 +               return 0;
58286 +       if (IS_ERR(child))
58287 +               return PTR_ERR(child);
58288 +       reiser4_stat_inc(tree.reparenting);
58289 +       WLOCK_TREE(tree);
58290 +       ++ new_node->c_count;
58291 +       assert("nikita-1395", znode_parent(child) == old_node);
58292 +       assert("nikita-1396", old_node->c_count > 0);
58293 +       coord_to_parent_coord(item, &child->in_parent);
58294 +       assert("nikita-1781", znode_parent(child) == new_node);
58295 +       assert("nikita-1782", check_tree_pointer(item, child) == NS_FOUND);
58296 +       -- old_node->c_count;
58297 +       WUNLOCK_TREE(tree);
58298 +       /* the trace below still reads @child, so the reference is dropped only after it (as in kill_hook_internal()) */
58299 +       ON_TRACE(TRACE_ZWEB, "shift: %llx: %i -> %lli: %i [%llx]\n",
58300 +                *znode_get_block(old_node),
58301 +                old_node->c_count, *znode_get_block(new_node),
58302 +                new_node->c_count, *znode_get_block(child));
58303 +       zput(child);
58304 +       return 0;
58305 +}
58306 +
58307 +/* plugin->u.item.b.max_key_inside - not defined */
58308 +
58309 +/* plugin->u.item.b.nr_units - item.c:nr_units_single_unit */
58310 +
58311 +/* Make Linus happy.
58312 +   Local variables:
58313 +   c-indentation-style: "K&R"
58314 +   mode-name: "LC"
58315 +   c-basic-offset: 8
58316 +   tab-width: 8
58317 +   fill-column: 120
58318 +   End:
58319 +*/
58320 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/internal.h linux-2.6.8-rc3-a/fs/reiser4/plugin/item/internal.h
58321 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/internal.h   1970-01-01 03:00:00.000000000 +0300
58322 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/internal.h 2004-08-05 21:20:52.985681094 +0400
58323 @@ -0,0 +1,51 @@
58324 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58325 +/* Internal item contains down-link to the child of the internal/twig
58326 +   node in a tree. It is internal items that are actually used during
58327 +   tree traversal. */
58328 +
58329 +#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
58330 +#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
58331 +
58332 +#include "../../forward.h"
58333 +#include "../../dformat.h"
58334 +
58335 +/* on-disk layout of internal item: its only field is the down-link to the child node */
58336 +typedef struct internal_item_layout {
58337 +       /*  0 */ reiser4_dblock_nr pointer;
58338 +       /*  4 */ /* NOTE(review): end offset; stale if reiser4_dblock_nr is wider than 4 bytes - confirm */
58339 +} internal_item_layout;
58340 +
58341 +struct cut_list;
58342 +/* methods of the internal item; item.c installs them in item_plugins[NODE_POINTER_ID] */
58343 +int mergeable_internal(const coord_t * p1, const coord_t * p2);
58344 +lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias, coord_t * coord);
58345 +/* store pointer from internal item into "block". Implementation of
58346 +    ->down_link() method */
58347 +extern void down_link_internal(const coord_t * coord, const reiser4_key * key, reiser4_block_nr * block);
58348 +extern int has_pointer_to_internal(const coord_t * coord, const reiser4_block_nr * block);
58349 +extern int create_hook_internal(const coord_t * item, void *arg);
58350 +extern int kill_hook_internal(const coord_t * item, pos_in_node_t from, pos_in_node_t count,
58351 +                             struct carry_kill_data *);
58352 +extern int shift_hook_internal(const coord_t * item, unsigned from, unsigned count, znode * old_node);
58353 +extern void print_internal(const char *prefix, coord_t * coord);
58354 +
58355 +extern int utmost_child_internal(const coord_t * coord, sideof side, jnode ** child);
58356 +int utmost_child_real_block_internal(const coord_t * coord, sideof side, reiser4_block_nr * block);
58357 +
58358 +extern void update_internal(const coord_t * coord,
58359 +                           const reiser4_block_nr * blocknr);
58360 +/* FIXME: spelled with a double underscore, presumably because reiserfs already has check_internal */
58361 +extern int check__internal(const coord_t * coord, const char **error);
58362 +
58363 +/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
58364 +#endif
58365 +
58366 +/* Make Linus happy.
58367 +   Local variables:
58368 +   c-indentation-style: "K&R"
58369 +   mode-name: "LC"
58370 +   c-basic-offset: 8
58371 +   tab-width: 8
58372 +   fill-column: 120
58373 +   End:
58374 +*/
58375 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/item.c linux-2.6.8-rc3-a/fs/reiser4/plugin/item/item.c
58376 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/item.c       1970-01-01 03:00:00.000000000 +0300
58377 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/item.c     2004-08-05 21:20:53.173641449 +0400
58378 @@ -0,0 +1,770 @@
58379 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58380 +
58381 +/* definition of item plugins. */
58382 +
58383 +#include "../../forward.h"
58384 +#include "../../debug.h"
58385 +#include "../../key.h"
58386 +#include "../../coord.h"
58387 +#include "../plugin_header.h"
58388 +#include "sde.h"
58389 +#include "../cryptcompress.h"
58390 +#include "internal.h"
58391 +#include "item.h"
58392 +#include "static_stat.h"
58393 +#include "../plugin.h"
58394 +#include "../../znode.h"
58395 +#include "../../tree.h"
58396 +#include "../../context.h"
58397 +#include "ctail.h"
58398 +
58399 +/* ask node plugin where the item body is and cache its offset from zdata() in @coord->offset; returns nothing */
58400 +reiser4_internal void
58401 +item_body_by_coord_hard(coord_t * coord /* coord to query */ )
58402 +{
58403 +       assert("nikita-324", coord != NULL);
58404 +       assert("nikita-325", coord->node != NULL);
58405 +       assert("nikita-326", znode_is_loaded(coord->node));
58406 +       assert("nikita-3200", coord->offset == INVALID_OFFSET); /* offset must not be cached yet */
58407 +       trace_stamp(TRACE_TREE);
58408 +
58409 +       coord->offset = node_plugin_by_node(coord->node)->item_by_coord(coord) - zdata(coord->node);
58410 +       ON_DEBUG(coord->body_v = coord->node->times_locked); /* debug only: record times_locked at caching time */
58411 +}
58412 +
58413 +reiser4_internal void * /* item body: start of node data plus the offset cached in @coord->offset */
58414 +item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
58415 +{
58416 +       return zdata(coord->node) + coord->offset; /* no checks: the cached offset is used as is */
58417 +}
58418 +
58419 +#if REISER4_DEBUG
58420 +
58421 +reiser4_internal int /* debug check: non-zero iff cached @coord->offset equals what node plugin reports now */
58422 +item_body_is_valid(const coord_t * coord)
58423 +{
58424 +       return
58425 +               coord->offset ==
58426 +               node_plugin_by_node(coord->node)->item_by_coord(coord) - zdata(coord->node);
58427 +}
58428 +
58429 +#endif
58430 +
58431 +/* return length of item at @coord, as reported by ->length_by_coord() of the node plugin */
58432 +reiser4_internal pos_in_node_t
58433 +item_length_by_coord(const coord_t * coord /* coord to query */ )
58434 +{
58435 +       int len; /* kept as int, converted to pos_in_node_t on return */
58436 +
58437 +       assert("nikita-327", coord != NULL);
58438 +       assert("nikita-328", coord->node != NULL);
58439 +       assert("nikita-329", znode_is_loaded(coord->node));
58440 +       trace_stamp(TRACE_TREE);
58441 +
58442 +       len = node_plugin_by_node(coord->node)->length_by_coord(coord);
58443 +       check_contexts();
58444 +       return len;
58445 +}
58446 +
58447 +reiser4_internal void /* look item plugin up through the node plugin and store it in @coord */
58448 +obtain_item_plugin(const coord_t * coord)
58449 +{
58450 +       assert("nikita-330", coord != NULL);
58451 +       assert("nikita-331", coord->node != NULL);
58452 +       assert("nikita-332", znode_is_loaded(coord->node));
58453 +       trace_stamp(TRACE_TREE);
58454 +
58455 +       coord_set_iplug((coord_t *) coord, /* const is cast away so that the plugin can be stored in @coord */
58456 +                       node_plugin_by_node(coord->node)->plugin_by_coord(coord));
58457 +       assert("nikita-2479",
58458 +              coord_iplug(coord) == node_plugin_by_node(coord->node)->plugin_by_coord(coord));
58459 +}
58460 +
58461 +/* return type of item at @coord: the ->b.item_type field of its item plugin */
58462 +reiser4_internal item_type_id
58463 +item_type_by_coord(const coord_t * coord /* coord to query */ )
58464 +{
58465 +       assert("nikita-333", coord != NULL);
58466 +       assert("nikita-334", coord->node != NULL);
58467 +       assert("nikita-335", znode_is_loaded(coord->node));
58468 +       assert("nikita-336", item_plugin_by_coord(coord) != NULL);
58469 +
58470 +       trace_stamp(TRACE_TREE);
58471 +
58472 +       return item_plugin_by_coord(coord)->b.item_type;
58473 +}
58474 +
58475 +/* return id of the item plugin of the item at @coord; the id is asserted to be below LAST_ITEM_ID */
58476 +/* Audited by: green(2002.06.15) */
58477 +reiser4_internal item_id
58478 +item_id_by_coord(const coord_t * coord /* coord to query */ )
58479 +{
58480 +       assert("vs-539", coord != NULL);
58481 +       assert("vs-538", coord->node != NULL);
58482 +       assert("vs-537", znode_is_loaded(coord->node));
58483 +       assert("vs-536", item_plugin_by_coord(coord) != NULL);
58484 +
58485 +       trace_stamp(TRACE_TREE);
58486 +
58487 +       assert("vs-540", item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
58488 +       return item_id_by_plugin(item_plugin_by_coord(coord));
58489 +}
58490 +
58491 +/* return key of item at @coord: delegates to ->key_at() of the node plugin and returns its result */
58492 +/* Audited by: green(2002.06.15) */
58493 +reiser4_internal reiser4_key *
58494 +item_key_by_coord(const coord_t * coord /* coord to query */ ,
58495 +                 reiser4_key * key /* result */ )
58496 +{
58497 +       assert("nikita-338", coord != NULL);
58498 +       assert("nikita-339", coord->node != NULL);
58499 +       assert("nikita-340", znode_is_loaded(coord->node));
58500 +       trace_stamp(TRACE_TREE);
58501 +
58502 +       return node_plugin_by_node(coord->node)->key_at(coord, key);
58503 +}
58504 +
58505 +/* store in @key the max key of the item at @coord, i.e. the max key of its last unit; returns @key */
58506 +reiser4_internal reiser4_key *
58507 +max_item_key_by_coord(const coord_t *coord /* coord to query */ ,
58508 +                     reiser4_key *key /* result */ )
58509 +{
58510 +       coord_t last;
58511 +
58512 +       assert("nikita-338", coord != NULL); /* NOTE(review): labels nikita-338..340 are reused from item_key_by_coord() */
58513 +       assert("nikita-339", coord->node != NULL);
58514 +       assert("nikita-340", znode_is_loaded(coord->node));
58515 +       trace_stamp(TRACE_TREE);
58516 +
58517 +       /* make a copy of @coord that points to the last unit of the item */
58518 +       coord_dup(&last, coord);
58519 +       last.unit_pos = coord_num_units(&last) - 1;
58520 +       assert("vs-1560", coord_is_existing_unit(&last));
58521 +
58522 +       max_unit_key_by_coord(&last, key);
58523 +       return key;
58524 +}
58525 +
58526 +/* key of the unit at @coord: ->unit_key() of the item plugin if defined, key of the whole item otherwise */
58527 +reiser4_internal reiser4_key *
58528 +unit_key_by_coord(const coord_t * coord /* coord to query */ ,
58529 +                 reiser4_key * key /* result */ )
58530 +{
58531 +       assert("nikita-772", coord != NULL);
58532 +       assert("nikita-774", coord->node != NULL);
58533 +       assert("nikita-775", znode_is_loaded(coord->node));
58534 +       trace_stamp(TRACE_TREE);
58535 +
58536 +       /* no ->unit_key() method: fall back to the key of the item */
58537 +       if (item_plugin_by_coord(coord)->b.unit_key == NULL)
58538 +               return item_key_by_coord(coord, key);
58539 +       return item_plugin_by_coord(coord)->b.unit_key(coord, key);
58540 +}
58541 +
58542 +/* biggest key contained in the unit at @coord: ->max_unit_key() if defined, plain unit key otherwise */
58543 +reiser4_internal reiser4_key *
58544 +max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
58545 +                     reiser4_key * key /* result */ )
58546 +{
58547 +       assert("nikita-772", coord != NULL);
58548 +       assert("nikita-774", coord->node != NULL);
58549 +       assert("nikita-775", znode_is_loaded(coord->node));
58550 +       trace_stamp(TRACE_TREE);
58551 +
58552 +       /* no ->max_unit_key() method: fall back to unit_key_by_coord() */
58553 +       if (item_plugin_by_coord(coord)->b.max_unit_key == NULL)
58554 +               return unit_key_by_coord(coord, key);
58555 +       return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
58556 +}
58557 +
58558 +
58559 +/* ->max_key_inside() method for items consisting of exactly one key (like
58560 +    stat-data): the answer is the key of the unit at @coord, stored in @result */
58561 +static reiser4_key *
58562 +max_key_inside_single_key(const coord_t * coord /* coord of item */ ,
58563 +                         reiser4_key * result /* resulting key */)
58564 +{
58565 +       assert("nikita-604", coord != NULL);
58566 +
58567 +       /* coord -> key is starting key of this item and it has to be already
58568 +          filled in */
58569 +       return unit_key_by_coord(coord, result);
58570 +}
58571 +
58572 +/* ->nr_units() method for items that always consist of exactly one unit (used by sd, de and internal items) */
58573 +static pos_in_node_t
58574 +nr_units_single_unit(const coord_t * coord UNUSED_ARG  /* coord of item */ )
58575 +{
58576 +       return 1;
58577 +}
58578 +
58579 +static int /* ->paste() method of the stat-data item: ignores all its arguments and returns 0 */
58580 +paste_no_paste(coord_t * coord UNUSED_ARG,
58581 +              reiser4_item_data * data UNUSED_ARG,
58582 +              carry_plugin_info * info UNUSED_ARG)
58583 +{
58584 +       return 0;
58585 +}
58586 +
58587 +/* default ->fast_paste() method: unconditionally returns 1, whatever @coord is */
58588 +reiser4_internal int
58589 +agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
58590 +{
58591 +       return 1;
58592 +}
58593 +
58594 +/* check whether @key may belong to the item at @item: ->can_contain_key() of
58595 +   the item plugin decides when defined, [item key, max key] range test otherwise */
58596 +reiser4_internal int
58597 +item_can_contain_key(const coord_t * item /* coord of item */ ,
58598 +                    const reiser4_key * key /* key to check */ ,
58599 +                    const reiser4_item_data * data     /* parameters of item
58600 +                                                        * being created */ )
58601 +{
58602 +       item_plugin *plug;
58603 +       reiser4_key first_key;
58604 +       reiser4_key last_key;
58605 +
58606 +       assert("nikita-1658", item != NULL);
58607 +       assert("nikita-1659", key != NULL);
58608 +
58609 +       plug = item_plugin_by_coord(item);
58610 +       if (plug->b.can_contain_key != NULL)
58611 +               return plug->b.can_contain_key(item, key, data);
58612 +
58613 +       /* generic check needs ->max_key_inside() to bound the item */
58614 +       assert("nikita-1681", plug->b.max_key_inside != NULL);
58615 +       item_key_by_coord(item, &first_key);
58616 +       plug->b.max_key_inside(item, &last_key);
58617 +
58618 +       /* can contain key if first_key <= key && key <= last_key */
58619 +       return keyle(&first_key, key) &&
58620 +              keyle(key, &last_key);
58621 +}
58622 +
58623 +/* return 0 if items at @i1 and @i2 are not mergeable, !0 - otherwise; key of @i1 must not exceed key of @i2 */
58624 +reiser4_internal int
58625 +are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
58626 +                   const coord_t * i2 /* coord of second item */ )
58627 +{
58628 +       item_plugin *iplug;
58629 +       reiser4_key k1;
58630 +       reiser4_key k2;
58631 +
58632 +       assert("nikita-1336", i1 != NULL);
58633 +       assert("nikita-1337", i2 != NULL);
58634 +
58635 +       iplug = item_plugin_by_coord(i1);
58636 +       assert("nikita-1338", iplug != NULL);
58637 +
58638 +       IF_TRACE(TRACE_NODES, print_key("k1", item_key_by_coord(i1, &k1)));
58639 +       IF_TRACE(TRACE_NODES, print_key("k2", item_key_by_coord(i2, &k2)));
58640 +
58641 +       /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
58642 +          shifting code when nodes are in "suspended" state. */
58643 +       assert("nikita-1663", keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
58644 +       /* the plugin of the first item decides; its own ->mergeable() wins when defined */
58645 +       if (iplug->b.mergeable != NULL) {
58646 +               return iplug->b.mergeable(i1, i2);
58647 +       } else if (iplug->b.max_key_inside != NULL) {
58648 +               /* mergeable if ->max_key_inside() of i1 >= key of i2; each
58649 +                  key is computed exactly once, as an argument of keyge(),
58650 +                  instead of being computed twice in a row */
58651 +               return keyge(iplug->b.max_key_inside(i1, &k1),
58652 +                            item_key_by_coord(i2, &k2));
58653 +       } else {
58654 +               item_key_by_coord(i1, &k1);
58655 +               item_key_by_coord(i2, &k2);
58656 +               /* no method to ask: require same locality, same objectid and same item plugin */
58657 +               return
58658 +                   (get_key_locality(&k1) == get_key_locality(&k2)) &&
58659 +                   (get_key_objectid(&k1) == get_key_objectid(&k2)) && (iplug == item_plugin_by_coord(i2));
58660 +       }
58661 +}
58662 +
58663 +reiser4_internal int /* non-zero iff the item at @item has id EXTENT_POINTER_ID */
58664 +item_is_extent(const coord_t * item)
58665 +{
58666 +       assert("vs-482", coord_is_existing_item(item));
58667 +       return item_id_by_coord(item) == EXTENT_POINTER_ID;
58668 +}
58669 +
58670 +reiser4_internal int /* non-zero iff the item at @item has id FORMATTING_ID */
58671 +item_is_tail(const coord_t * item)
58672 +{
58673 +       assert("vs-482", coord_is_existing_item(item)); /* NOTE(review): label vs-482 is shared with item_is_extent() */
58674 +       return item_id_by_coord(item) == FORMATTING_ID;
58675 +}
58676 +
58677 +reiser4_internal int /* non-zero iff the item at @item is of STAT_DATA_ITEM_TYPE; compares item type, not item id */
58678 +item_is_statdata(const coord_t * item)
58679 +{
58680 +       assert("vs-516", coord_is_existing_item(item));
58681 +       return item_type_by_coord(item) == STAT_DATA_ITEM_TYPE;
58682 +}
58683 +
58684 +static int /* ->change() method of item_plugin_ops: arguments are not looked at, always fails with -EINVAL */
58685 +change_item(struct inode * inode, reiser4_plugin * plugin)
58686 +{
58687 +       /* cannot change constituent item (sd, or dir_item) */
58688 +       return RETERR(-EINVAL);
58689 +}
58690 +
58691 +static reiser4_plugin_ops item_plugin_ops = { /* referenced through .pops by the sd, de and cde entries of item_plugins[] */
58692 +       .init     = NULL,
58693 +       .load     = NULL,
58694 +       .save_len = NULL,
58695 +       .save     = NULL,
58696 +       .change   = change_item /* the only method provided; it rejects every change */
58697 +};
58698 +
58699 +
58700 +item_plugin item_plugins[LAST_ITEM_ID] = {
58701 +       [STATIC_STAT_DATA_ID] = {
58702 +               .h = {
58703 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
58704 +                       .id      = STATIC_STAT_DATA_ID,
58705 +                       .pops    = &item_plugin_ops,
58706 +                       .label   = "sd",
58707 +                       .desc    = "stat-data",
58708 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
58709 +               },
58710 +               .b = {
58711 +                       .item_type         = STAT_DATA_ITEM_TYPE,
58712 +                       .max_key_inside    = max_key_inside_single_key,
58713 +                       .can_contain_key   = NULL,
58714 +                       .mergeable         = NULL,
58715 +                       .nr_units          = nr_units_single_unit,
58716 +                       .lookup            = NULL,
58717 +                       .init              = NULL,
58718 +                       .paste             = paste_no_paste,
58719 +                       .fast_paste        = NULL,
58720 +                       .can_shift         = NULL,
58721 +                       .copy_units        = NULL,
58722 +                       .create_hook       = NULL,
58723 +                       .kill_hook         = NULL,
58724 +                       .shift_hook        = NULL,
58725 +                       .cut_units         = NULL,
58726 +                       .kill_units        = NULL,
58727 +                       .unit_key          = NULL,
58728 +                       .max_unit_key      = NULL,
58729 +                       .estimate          = NULL,
58730 +                       .item_data_by_flow = NULL,
58731 +#if REISER4_DEBUG_OUTPUT
58732 +                       .print             = print_sd,
58733 +                       .item_stat         = item_stat_static_sd,
58734 +#endif
58735 +#if REISER4_DEBUG
58736 +                       .check             = NULL
58737 +#endif
58738 +               },
58739 +               .f = {
58740 +                       .utmost_child            = NULL,
58741 +                       .utmost_child_real_block = NULL,
58742 +                       .update                  = NULL,
58743 +                       .scan                    = NULL,
58744 +                       .squeeze                 = NULL
58745 +               },
58746 +               .s = {
58747 +                       .sd = {
58748 +                               .init_inode = init_inode_static_sd,
58749 +                               .save_len   = save_len_static_sd,
58750 +                               .save       = save_static_sd
58751 +                       }
58752 +               }
58753 +       },
58754 +       [SIMPLE_DIR_ENTRY_ID] = {
58755 +               .h = {
58756 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
58757 +                       .id      = SIMPLE_DIR_ENTRY_ID,
58758 +                       .pops    = &item_plugin_ops,
58759 +                       .label   = "de",
58760 +                       .desc    = "directory entry",
58761 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
58762 +               },
58763 +               .b = {
58764 +                       .item_type         = DIR_ENTRY_ITEM_TYPE,
58765 +                       .max_key_inside    = max_key_inside_single_key,
58766 +                       .can_contain_key   = NULL,
58767 +                       .mergeable         = NULL,
58768 +                       .nr_units          = nr_units_single_unit,
58769 +                       .lookup            = NULL,
58770 +                       .init              = NULL,
58771 +                       .paste             = NULL,
58772 +                       .fast_paste        = NULL,
58773 +                       .can_shift         = NULL,
58774 +                       .copy_units        = NULL,
58775 +                       .create_hook       = NULL,
58776 +                       .kill_hook         = NULL,
58777 +                       .shift_hook        = NULL,
58778 +                       .cut_units         = NULL,
58779 +                       .kill_units        = NULL,
58780 +                       .unit_key          = NULL,
58781 +                       .max_unit_key      = NULL,
58782 +                       .estimate          = NULL,
58783 +                       .item_data_by_flow = NULL,
58784 +#if REISER4_DEBUG_OUTPUT
58785 +                       .print             = print_de,
58786 +                       .item_stat         = NULL,
58787 +#endif
58788 +#if REISER4_DEBUG
58789 +                       .check             = NULL
58790 +#endif
58791 +               },
58792 +               .f = {
58793 +                       .utmost_child            = NULL,
58794 +                       .utmost_child_real_block = NULL,
58795 +                       .update                  = NULL,
58796 +                       .scan                    = NULL,
58797 +                       .squeeze                 = NULL
58798 +               },
58799 +               .s = {
58800 +                       .dir = {
58801 +                               .extract_key       = extract_key_de,
58802 +                               .update_key        = update_key_de,
58803 +                               .extract_name      = extract_name_de,
58804 +                               .extract_file_type = extract_file_type_de,
58805 +                               .add_entry         = add_entry_de,
58806 +                               .rem_entry         = rem_entry_de,
58807 +                               .max_name_len      = max_name_len_de
58808 +                       }
58809 +               }
58810 +       },
58811 +       [COMPOUND_DIR_ID] = {
58812 +               .h = {
58813 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
58814 +                       .id      = COMPOUND_DIR_ID,
58815 +                       .pops    = &item_plugin_ops,
58816 +                       .label   = "cde",
58817 +                       .desc    = "compressed directory entry",
58818 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
58819 +               },
58820 +               .b = {
58821 +                       .item_type         = DIR_ENTRY_ITEM_TYPE,
58822 +                       .max_key_inside    = max_key_inside_cde,
58823 +                       .can_contain_key   = can_contain_key_cde,
58824 +                       .mergeable         = mergeable_cde,
58825 +                       .nr_units          = nr_units_cde,
58826 +                       .lookup            = lookup_cde,
58827 +                       .init              = init_cde,
58828 +                       .paste             = paste_cde,
58829 +                       .fast_paste        = agree_to_fast_op,
58830 +                       .can_shift         = can_shift_cde,
58831 +                       .copy_units        = copy_units_cde,
58832 +                       .create_hook       = NULL,
58833 +                       .kill_hook         = NULL,
58834 +                       .shift_hook        = NULL,
58835 +                       .cut_units         = cut_units_cde,
58836 +                       .kill_units        = kill_units_cde,
58837 +                       .unit_key          = unit_key_cde,
58838 +                       .max_unit_key      = unit_key_cde,
58839 +                       .estimate          = estimate_cde,
58840 +                       .item_data_by_flow = NULL
58841 +#if REISER4_DEBUG_OUTPUT
58842 +                       , .print           = print_cde,
58843 +                       .item_stat         = NULL
58844 +#endif
58845 +#if REISER4_DEBUG
58846 +                       , .check           = check_cde
58847 +#endif
58848 +               },
58849 +               .f = {
58850 +                       .utmost_child            = NULL,
58851 +                       .utmost_child_real_block = NULL,
58852 +                       .update                  = NULL,
58853 +                       .scan                    = NULL,
58854 +                       .squeeze                 = NULL
58855 +               },
58856 +               .s = {
58857 +                       .dir = {
58858 +                               .extract_key       = extract_key_cde,
58859 +                               .update_key        = update_key_cde,
58860 +                               .extract_name      = extract_name_cde,
58861 +                               .extract_file_type = extract_file_type_de,
58862 +                               .add_entry         = add_entry_cde,
58863 +                               .rem_entry         = rem_entry_cde,
58864 +                               .max_name_len      = max_name_len_cde
58865 +                       }
58866 +               }
58867 +       },
58868 +       [NODE_POINTER_ID] = {
58869 +               .h = {
58870 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
58871 +                       .id      = NODE_POINTER_ID,
58872 +                       .pops    = NULL,
58873 +                       .label   = "internal",
58874 +                       .desc    = "internal item",
58875 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
58876 +               },
58877 +               .b = {
58878 +                       .item_type         = INTERNAL_ITEM_TYPE,
58879 +                       .max_key_inside    = NULL,
58880 +                       .can_contain_key   = NULL,
58881 +                       .mergeable         = mergeable_internal,
58882 +                       .nr_units          = nr_units_single_unit,
58883 +                       .lookup            = lookup_internal,
58884 +                       .init              = NULL,
58885 +                       .paste             = NULL,
58886 +                       .fast_paste        = NULL,
58887 +                       .can_shift         = NULL,
58888 +                       .copy_units        = NULL,
58889 +                       .create_hook       = create_hook_internal,
58890 +                       .kill_hook         = kill_hook_internal,
58891 +                       .shift_hook        = shift_hook_internal,
58892 +                       .cut_units         = NULL,
58893 +                       .kill_units        = NULL,
58894 +                       .unit_key          = NULL,
58895 +                       .max_unit_key      = NULL,
58896 +                       .estimate          = NULL,
58897 +                       .item_data_by_flow = NULL
58898 +#if REISER4_DEBUG_OUTPUT
58899 +                       , .print           = print_internal,
58900 +                       .item_stat         = NULL
58901 +#endif
58902 +#if REISER4_DEBUG
58903 +                       , .check           = check__internal
58904 +#endif
58905 +               },
58906 +               .f = {
58907 +                       .utmost_child            = utmost_child_internal,
58908 +                       .utmost_child_real_block = utmost_child_real_block_internal,
58909 +                       .update                  = update_internal,
58910 +                       .scan                    = NULL,
58911 +                       .squeeze                 = NULL
58912 +               },
58913 +               .s = {
58914 +                       .internal = {
58915 +                               .down_link      = down_link_internal,
58916 +                               .has_pointer_to = has_pointer_to_internal
58917 +                       }
58918 +               }
58919 +       },
58920 +       [EXTENT_POINTER_ID] = {
58921 +               .h = {
58922 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
58923 +                       .id      = EXTENT_POINTER_ID,
58924 +                       .pops    = NULL,
58925 +                       .label   = "extent",
58926 +                       .desc    = "extent item",
58927 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
58928 +               },
58929 +               .b = {
58930 +                       .item_type         = UNIX_FILE_METADATA_ITEM_TYPE,
58931 +                       .max_key_inside    = max_key_inside_extent,
58932 +                       .can_contain_key   = can_contain_key_extent,
58933 +                       .mergeable         = mergeable_extent,
58934 +                       .nr_units          = nr_units_extent,
58935 +                       .lookup            = lookup_extent,
58936 +                       .init              = NULL,
58937 +                       .paste             = paste_extent,
58938 +                       .fast_paste        = agree_to_fast_op,
58939 +                       .can_shift         = can_shift_extent,
58940 +                       .create_hook       = create_hook_extent,
58941 +                       .copy_units        = copy_units_extent,
58942 +                       .kill_hook         = kill_hook_extent,
58943 +                       .shift_hook        = NULL,
58944 +                       .cut_units         = cut_units_extent,
58945 +                       .kill_units        = kill_units_extent,
58946 +                       .unit_key          = unit_key_extent,
58947 +                       .max_unit_key      = max_unit_key_extent,
58948 +                       .estimate          = NULL,
58949 +                       .item_data_by_flow = NULL,
58950 +                       .show              = show_extent,
58951 +#if REISER4_DEBUG_OUTPUT
58952 +                       .print             = print_extent,
58953 +                       .item_stat         = item_stat_extent,
58954 +#endif
58955 +#if REISER4_DEBUG
58956 +                       .check = check_extent
58957 +#endif
58958 +               },
58959 +               .f = {
58960 +                       .utmost_child            = utmost_child_extent,
58961 +                       .utmost_child_real_block = utmost_child_real_block_extent,
58962 +                       .update                  = NULL,
58963 +                       .scan                    = scan_extent,
58964 +                       .squeeze                 = NULL,
58965 +                       .key_by_offset           = key_by_offset_extent
58966 +               },
58967 +               .s = {
58968 +                       .file = {
58969 +                               .write                = write_extent,
58970 +                               .read                 = read_extent,
58971 +                               .readpage             = readpage_extent,
58972 +                               .capture              = capture_extent,
58973 +                               .get_block            = get_block_address_extent,
58974 +                               .readpages            = readpages_extent,
58975 +                               .append_key           = append_key_extent,
58976 +                               .init_coord_extension = init_coord_extension_extent
58977 +                       }
58978 +               }
58979 +       },
58980 +       [FORMATTING_ID] = {
58981 +               .h = {
58982 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
58983 +                       .id      = FORMATTING_ID,
58984 +                       .pops    = NULL,
58985 +                       .label   = "body",
58986 +                       .desc    = "body (or tail?) item",
58987 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
58988 +               },
58989 +               .b = {
58990 +                       .item_type         = UNIX_FILE_METADATA_ITEM_TYPE,
58991 +                       .max_key_inside    = max_key_inside_tail,
58992 +                       .can_contain_key   = can_contain_key_tail,
58993 +                       .mergeable         = mergeable_tail,
58994 +                       .nr_units          = nr_units_tail,
58995 +                       .lookup            = lookup_tail,
58996 +                       .init              = NULL,
58997 +                       .paste             = paste_tail,
58998 +                       .fast_paste        = agree_to_fast_op,
58999 +                       .can_shift         = can_shift_tail,
59000 +                       .create_hook       = NULL,
59001 +                       .copy_units        = copy_units_tail,
59002 +                       .kill_hook         = kill_hook_tail,
59003 +                       .shift_hook        = NULL,
59004 +                       .cut_units         = cut_units_tail,
59005 +                       .kill_units        = kill_units_tail,
59006 +                       .unit_key          = unit_key_tail,
59007 +                       .max_unit_key      = unit_key_tail,
59008 +                       .estimate          = NULL,
59009 +                       .item_data_by_flow = NULL,
59010 +                       .show              = show_tail,
59011 +#if REISER4_DEBUG_OUTPUT
59012 +                       .print             = NULL,
59013 +                       .item_stat         = NULL,
59014 +#endif
59015 +#if REISER4_DEBUG
59016 +                       .check             = NULL
59017 +#endif
59018 +               },
59019 +               .f = {
59020 +                       .utmost_child            = NULL,
59021 +                       .utmost_child_real_block = NULL,
59022 +                       .update                  = NULL,
59023 +                       .scan                    = NULL,
59024 +                       .squeeze                 = NULL
59025 +               },
59026 +               .s = {
59027 +                       .file = {
59028 +                               .write                = write_tail,
59029 +                               .read                 = read_tail,
59030 +                               .readpage             = readpage_tail,
59031 +                               .capture              = NULL,
59032 +                               .get_block            = NULL,
59033 +                               .readpages            = NULL,
59034 +                               .append_key           = append_key_tail,
59035 +                               .init_coord_extension = init_coord_extension_tail
59036 +                       }
59037 +               }
59038 +       },
59039 +       [CTAIL_ID] = {
59040 +               .h = {
59041 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
59042 +                       .id      = CTAIL_ID,
59043 +                       .pops    = NULL,
59044 +                       .label   = "ctail",
59045 +                       .desc    = "cryptcompress tail item",
59046 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
59047 +               },
59048 +               .b = {
59049 +                       .item_type         = UNIX_FILE_METADATA_ITEM_TYPE,
59050 +                       .max_key_inside    = max_key_inside_tail,
59051 +                       .can_contain_key   = can_contain_key_ctail,
59052 +                       .mergeable         = mergeable_ctail,
59053 +                       .nr_units          = nr_units_ctail,
59054 +                       .lookup            = NULL,
59055 +                       .init              = init_ctail,
59056 +                       .paste             = paste_ctail,
59057 +                       .fast_paste        = agree_to_fast_op,
59058 +                       .can_shift         = can_shift_ctail,
59059 +                       .create_hook       = NULL,
59060 +                       .copy_units        = copy_units_ctail,
59061 +                       .kill_hook         = kill_hook_ctail,
59062 +                       .shift_hook        = shift_hook_ctail,
59063 +                       .cut_units         = cut_units_ctail,
59064 +                       .kill_units        = kill_units_ctail,
59065 +                       .unit_key          = unit_key_tail,
59066 +                       .max_unit_key      = unit_key_tail,
59067 +                       .estimate          = estimate_ctail,
59068 +                       .item_data_by_flow = NULL
59069 +#if REISER4_DEBUG_OUTPUT
59070 +                       , .print           = print_ctail,
59071 +                       .item_stat         = NULL
59072 +#endif
59073 +#if REISER4_DEBUG
59074 +                       , .check           = NULL
59075 +#endif
59076 +               },
59077 +               .f = {
59078 +                       .utmost_child            = utmost_child_ctail,
59079 +                       /* FIXME-EDWARD: write this */
59080 +                       .utmost_child_real_block = NULL,
59081 +                       .update                  = NULL,
59082 +                       .scan                    = scan_ctail,
59083 +                       .squeeze                 = squeeze_ctail
59084 +               },
59085 +               .s = {
59086 +                       .file = {
59087 +                               .write                = NULL,
59088 +                               .read                 = read_ctail,
59089 +                               .readpage             = readpage_ctail,
59090 +                               .capture              = NULL,
59091 +                               .get_block            = get_block_address_tail,
59092 +                               .readpages            = readpages_ctail,
59093 +                               .append_key           = append_key_ctail,
59094 +                               .init_coord_extension = init_coord_extension_tail
59095 +                       }
59096 +               }
59097 +       },
59098 +       [BLACK_BOX_ID] = {
59099 +               .h = {
59100 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
59101 +                       .id      = BLACK_BOX_ID,
59102 +                       .pops    = NULL,
59103 +                       .label   = "blackbox",
59104 +                       .desc    = "black box item",
59105 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
59106 +               },
59107 +               .b = {
59108 +                       .item_type         = OTHER_ITEM_TYPE,
59109 +                       .max_key_inside    = NULL,
59110 +                       .can_contain_key   = NULL,
59111 +                       .mergeable         = NULL,
59112 +                       .nr_units          = nr_units_single_unit,
59113 +                       /* no need for ->lookup method */
59114 +                       .lookup            = NULL,
59115 +                       .init              = NULL,
59116 +                       .paste             = NULL,
59117 +                       .fast_paste        = NULL,
59118 +                       .can_shift         = NULL,
59119 +                       .copy_units        = NULL,
59120 +                       .create_hook       = NULL,
59121 +                       .kill_hook         = NULL,
59122 +                       .shift_hook        = NULL,
59123 +                       .cut_units         = NULL,
59124 +                       .kill_units        = NULL,
59125 +                       .unit_key          = NULL,
59126 +                       .max_unit_key      = NULL,
59127 +                       .estimate          = NULL,
59128 +                       .item_data_by_flow = NULL,
59129 +#if REISER4_DEBUG_OUTPUT
59130 +                       .print             = NULL,
59131 +                       .item_stat         = NULL,
59132 +#endif
59133 +#if REISER4_DEBUG
59134 +                       .check             = NULL
59135 +#endif
59136 +               }
59137 +       }
59138 +};
59139 +
59140 +/* Make Linus happy.
59141 +   Local variables:
59142 +   c-indentation-style: "K&R"
59143 +   mode-name: "LC"
59144 +   c-basic-offset: 8
59145 +   tab-width: 8
59146 +   fill-column: 120
59147 +   End:
59148 +*/
59149 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/item.h linux-2.6.8-rc3-a/fs/reiser4/plugin/item/item.h
59150 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/item.h       1970-01-01 03:00:00.000000000 +0300
59151 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/item.h     2004-08-05 21:20:53.380597797 +0400
59152 @@ -0,0 +1,396 @@
59153 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59154 +
59155 +/* first read balance.c comments before reading this */
59156 +
59157 +/* An item_plugin implements all of the operations required for
59158 +   balancing that are item specific. */
59159 +
59160 +/* an item plugin also implements other operations that are specific to that
59161 +   item.  These go into the item specific operations portion of the item
59162 +   handler, and all of the item specific portions of the item handler are put
59163 +   into a union. */
59164 +
59165 +#if !defined( __REISER4_ITEM_H__ )
59166 +#define __REISER4_ITEM_H__
59167 +
59168 +#include "../../forward.h"
59169 +#include "../plugin_header.h"
59170 +#include "../../dformat.h"
59171 +#include "../../seal.h"
59172 +#include "../../plugin/file/file.h"
59173 +
59174 +#include <linux/fs.h>          /* for struct file, struct inode  */
59175 +#include <linux/mm.h>          /* for struct page */
59176 +#include <linux/dcache.h>      /* for struct dentry */
59177 +
59178 +typedef enum {                        /* item classes, see balance_ops.item_type */
59179 +       STAT_DATA_ITEM_TYPE,
59180 +       DIR_ENTRY_ITEM_TYPE,
59181 +       INTERNAL_ITEM_TYPE,            /* what item_is_internal() tests for */
59182 +       UNIX_FILE_METADATA_ITEM_TYPE,  /* used by the "body" and "ctail" item plugins */
59183 +       OTHER_ITEM_TYPE                /* used by the "blackbox" item plugin */
59184 +} item_type_id;
59185 +
59186 +
59187 +/* this is the part of each item plugin that all items are expected to
59188 +   support or at least explicitly fail to support by setting the
59189 +   pointer to null. */
59190 +typedef struct {
59191 +       item_type_id item_type;
59192 +
59193 +       /* operations called by balancing
59194 +
59195 +       It is interesting to consider that some of these item
59196 +       operations could be given sources or targets that are not
59197 +       really items in nodes.  This could be ok/useful.
59198 +
59199 +       */
59200 +       /* maximal key that can _possibly_ be occupied by this item
59201 +
59202 +           When inserting, and node ->lookup() method (called by
59203 +           coord_by_key()) reaches an item after binary search,
59204 +           the ->max_key_inside() item plugin method is used to determine
59205 +           whether new item should be pasted into existing item
59206 +           (new_key<=max_key_inside()) or new item has to be created
59207 +           (new_key>max_key_inside()).
59208 +
59209 +           For items that occupy exactly one key (like stat-data)
59210 +           this method should return this key. For items that can
59211 +           grow indefinitely (extent, directory item) this should
59212 +           return max_key().
59213 +
59214 +           For example, for an extent with the key
59215 +
59216 +           (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
59217 +
59218 +           ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff).
59219 +       */
59220 +       reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
59221 +
59222 +       /* true if item @coord can merge data at @key. */
59223 +       int (*can_contain_key) (const coord_t *, const reiser4_key *, const reiser4_item_data *);
59224 +       /* mergeable() - check items for mergeability
59225 +
59226 +          Optional method. Returns true if two items can be merged.
59227 +
59228 +       */
59229 +       int (*mergeable) (const coord_t *, const coord_t *);
59230 +
59231 +       /* number of atomic things in an item */
59232 +       pos_in_node_t (*nr_units) (const coord_t *);
59233 +
59234 +       /* search within item for a unit within the item, and return a
59235 +          pointer to it.  This can be used to calculate how many
59236 +          bytes to shrink an item if you use pointer arithmetic and
59237 +          compare to the start of the item body if the item's data
59238 +          are continuous in the node, if the item's data are not
59239 +          continuous in the node, all sorts of other things are maybe
59240 +          going to break as well. */
59241 +       lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
59242 +       /* method called by node_plugin->create_item() to initialise new
59243 +          item */
59244 +       int (*init) (coord_t * target, coord_t * from, reiser4_item_data * data);
59245 +       /* method called (e.g., by resize_item()) to place new data into
59246 +          item when it grows */
59247 +       int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
59248 +       /* return true if paste into @coord is allowed to skip
59249 +          carry. That is, if such paste would require any changes
59250 +          at the parent level
59251 +       */
59252 +       int (*fast_paste) (const coord_t *);
59253 +       /* how many but not more than @want units of @source can be
59254 +          shifted into @target node. If pend == append - we try to
59255 +          append last item of @target by first units of @source. If
59256 +          pend == prepend - we try to "prepend" first item in @target
59257 +          by last units of @source. @target node has @free_space
59258 +          bytes of free space. Total size of those units are returned
59259 +          via @size.
59260 +
59261 +          @target is not NULL if shifting to the mergeable item and
59262 +          NULL if new item will be created during shifting.
59263 +       */
59264 +       int (*can_shift) (unsigned free_space, coord_t *,
59265 +                         znode *, shift_direction, unsigned *size, unsigned want);
59266 +
59267 +       /* starting off @from-th unit of item @source append or
59268 +          prepend @count units to @target. @target has been already
59269 +          expanded by @free_space bytes. That must be exactly what is
59270 +          needed for those items in @target. If @where_is_free_space
59271 +          == SHIFT_LEFT - free space is at the end of @target item,
59272 +          otherwise - it is in the beginning of it. */
59273 +       void (*copy_units) (coord_t *, coord_t *,
59274 +                           unsigned from, unsigned count, shift_direction where_is_free_space, unsigned free_space);
59275 +
59276 +       int (*create_hook) (const coord_t *, void *);
59277 +       /* do whatever is necessary to do when @count units starting
59278 +          from @from-th one are removed from the tree */
59279 +       /* FIXME-VS: this is used to be here for, in particular,
59280 +          extents and items of internal type to free blocks they point
59281 +          to at the same time with removing items from a
59282 +          tree. Problems start, however, when dealloc_block fails due
59283 +          to some reason. Item gets removed, but blocks it pointed to
59284 +          are not freed. It is not clear how to fix this for items of
59285 +          internal type because a need to remove internal item may
59286 +          appear in the middle of balancing, and there is no way to
59287 +          undo changes made. OTOH, if space allocator involves
59288 +          balancing to perform dealloc_block - this will probably
59289 +          break balancing due to deadlock issues
59290 +       */
59291 +       int (*kill_hook) (const coord_t *, pos_in_node_t from, pos_in_node_t count, struct carry_kill_data *);
59292 +       int (*shift_hook) (const coord_t *, unsigned from, unsigned count, znode *_node);
59293 +
59294 +       /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key
59295 +          including boundaries. When units are cut from item beginning - move space which gets freed to head of
59296 +          item. When units are cut from item end - move freed space to item end. When units are cut from the middle of
59297 +          item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in
59298 +          @smallest_removed if it is not 0. Save new first item key in @new_first_key if it is not 0
59299 +       */
59300 +       int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to, struct carry_cut_data *,
59301 +                         reiser4_key *smallest_removed, reiser4_key *new_first_key);
59302 +
59303 +       /* like cut_units, except that these units are removed from the
59304 +          tree, not only from a node */
59305 +       int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to, struct carry_kill_data *,
59306 +                          reiser4_key *smallest_removed, reiser4_key *new_first);
59307 +
59308 +       /* if @key_of_coord == 1 - returned key of coord, otherwise -
59309 +          key of unit is returned. If @coord is not set to certain
59310 +          unit - ERR_PTR(-ENOENT) is returned */
59311 +       reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
59312 +       reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
59313 +       /* estimate how much space is needed for paste @data into item at
59314 +          @coord. if @coord==0 - estimate insertion, otherwise - estimate
59315 +          pasting
59316 +       */
59317 +       int (*estimate) (const coord_t *, const reiser4_item_data *);
59318 +
59319 +       /* converts flow @f to item data. @coord == 0 on insert */
59320 +       int (*item_data_by_flow) (const coord_t *, const flow_t *, reiser4_item_data *);
59321 +
59322 +       void (*show) (struct seq_file *, coord_t *);
59323 +
59324 +#if REISER4_DEBUG_OUTPUT
59325 +       /* used for debugging only, prints an ascii description of the
59326 +          item contents */
59327 +       void (*print) (const char *, coord_t *);
59328 +       /* gather statistics */
59329 +       void (*item_stat) (const coord_t *, void *);
59330 +#endif
59331 +
59332 +#if REISER4_DEBUG
59333 +       /* used for debugging, every item should have here the most
59334 +          complete possible check of the consistency of the item that
59335 +          the inventor can construct */
59336 +       int (*check) (const coord_t *, const char **error);
59337 +#endif
59338 +
59339 +} balance_ops;
59340 +
59341 +typedef struct {        /* methods used during flush (item_plugin.f) */
59342 +       /* return the right or left child of @coord, only if it is in memory */
59343 +       int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
59344 +
59345 +       /* return whether the right or left child of @coord has a non-fake
59346 +          block number. */
59347 +       int (*utmost_child_real_block) (const coord_t *, sideof side, reiser4_block_nr *);
59348 +       /* relocate child at @coord to the @block */
59349 +       void (*update) (const coord_t *, const reiser4_block_nr *);
59350 +       /* count unformatted nodes per item for leaf relocation policy, etc. */
59351 +       int (*scan) (flush_scan * scan);
59352 +       /* squeeze by unformatted child */
59353 +       int (*squeeze) (flush_pos_t * pos);
59354 +       /* backward mapping from jnode offset to a key. */
59355 +       int (*key_by_offset) (struct inode *, loff_t, reiser4_key *);
59356 +} flush_ops;
59357 +
59358 +/* operations specific to the directory item (item_plugin.s.dir) */
59359 +typedef struct {
59360 +       /* extract stat-data key from directory entry at @coord and place it
59361 +          into @key. */
59362 +       int (*extract_key) (const coord_t *, reiser4_key * key);
59363 +       /* update object key in item. */
59364 +       int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
59365 +       /* extract name from directory entry at @coord and return it */
59366 +       char *(*extract_name) (const coord_t *, char *buf);
59367 +       /* extract file type (DT_* stuff) from directory entry at @coord and
59368 +          return it */
59369 +       unsigned (*extract_file_type) (const coord_t *);
59370 +       int (*add_entry) (struct inode *dir,
59371 +                         coord_t *, lock_handle *,
59372 +                         const struct dentry *name, reiser4_dir_entry_desc *entry);
59373 +       int (*rem_entry) (struct inode *dir, const struct qstr *name,
59374 +                         coord_t *, lock_handle *,
59375 +                         reiser4_dir_entry_desc *entry);
59376 +       int (*max_name_len) (const struct inode *dir);  /* NOTE(review): presumably longest name allowed in @dir -- confirm */
59377 +} dir_entry_ops;
59378 +
59379 +/* operations specific to items regular (unix) file metadata are built of (item_plugin.s.file) */
59380 +typedef struct {
59381 +       int (*write)(struct inode *, flow_t *, hint_t *, int grabbed, write_mode_t);
59382 +       int (*read)(struct file *, flow_t *, hint_t *);
59383 +       int (*readpage) (void *, struct page *);
59384 +       int (*capture) (reiser4_key *, uf_coord_t *, struct page *, write_mode_t);
59385 +       int (*get_block) (const coord_t *, sector_t, struct buffer_head *);
59386 +       void (*readpages) (void *, struct address_space *, struct list_head *pages);
59387 +       /* ->append_key(): key of the first byte which is not addressed by the
59388 +          item at @coord. For example, for an extent with the key
59389 +
59390 +          (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
59391 +
59392 +          ->append_key is
59393 +
59394 +          (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size) */
59395 +       /* FIXME: could be uf_coord also */
59396 +       reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
59397 +
59398 +       void (*init_coord_extension)(uf_coord_t *, loff_t);
59399 +} file_ops;
59400 +
59401 +/* operations specific to items of stat data type (item_plugin.s.sd) */
59402 +typedef struct {
59403 +       int (*init_inode) (struct inode * inode, char *sd, int len);    /* NOTE(review): presumably @sd/@len describe the raw stat-data body -- confirm */
59404 +       int (*save_len) (struct inode * inode);
59405 +       int (*save) (struct inode * inode, char **area);
59406 +} sd_ops;
59407 +
59408 +/* operations specific to internal item (item_plugin.s.internal) */
59409 +typedef struct {
59410 +       /* all that tree traversal wants to know from an internal item is
59411 +          where to go next. */
59412 +       void (*down_link) (const coord_t * coord,
59413 +                          const reiser4_key * key, reiser4_block_nr * block);
59414 +       /* check that given internal item contains given pointer. */
59415 +       int (*has_pointer_to) (const coord_t * coord,
59416 +                              const reiser4_block_nr * block);
59417 +} internal_item_ops;
59418 +
59419 +struct item_plugin {
59420 +       /* generic fields; the item plugin table fills in type_id, id, pops, label, desc and linkage */
59421 +       plugin_header h;
59422 +
59423 +       /* methods common for all item types */
59424 +       balance_ops b;
59425 +       /* methods used during flush */
59426 +       flush_ops f;
59427 +
59428 +       /* methods specific to particular type of item; NOTE(review): presumably selected by b.item_type -- confirm */
59429 +       union {
59430 +               dir_entry_ops dir;
59431 +               file_ops file;
59432 +               sd_ops sd;
59433 +               internal_item_ops internal;
59434 +       } s;
59435 +
59436 +};
59437 +
59438 +static inline item_id
59439 +item_id_by_plugin(item_plugin * plugin)
59440 +{
59441 +       return plugin->h.id;    /* the id is kept in the generic plugin header */
59442 +}
59443 +
59444 +static inline char
59445 +get_iplugid(item_plugin *iplug)
59446 +{
59447 +       assert("nikita-2838", iplug != NULL);
59448 +       assert("nikita-2839", 0 <= iplug->h.id && iplug->h.id < 0xff);  /* range-checked because the id is narrowed to char below */
59449 +       return (char)item_id_by_plugin(iplug);
59450 +}
59451 +
59452 +extern unsigned long znode_times_locked(const znode *z);
59453 +
59454 +static inline void
59455 +coord_set_iplug(coord_t * coord, item_plugin *iplug)
59456 +{
59457 +       assert("nikita-2837", coord != NULL);
59458 +       assert("nikita-2838", iplug != NULL);
59459 +       coord->iplugid = get_iplugid(iplug);    /* remember the plugin in the coord by its id */
59460 +       ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));      /* value that coord_iplug() asserts against */
59461 +}
59462 +
59463 +static inline item_plugin *
59464 +coord_iplug(const coord_t * coord)
59465 +{
59466 +       assert("nikita-2833", coord != NULL);
59467 +       assert("nikita-2834", coord->iplugid != INVALID_PLUGID);        /* a plugin id must have been stored */
59468 +       assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));        /* must match the value coord_set_iplug() stored */
59469 +       return (item_plugin *)plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,    /* map the stored id back to the plugin */
59470 +                                          coord->iplugid);
59471 +}
59472 +
59473 +extern int item_can_contain_key(const coord_t * item, const reiser4_key * key, const reiser4_item_data *);
59474 +extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
59475 +extern int item_is_extent(const coord_t *);
59476 +extern int item_is_tail(const coord_t *);
59477 +extern int item_is_statdata(const coord_t * item);
59478 +
59479 +extern pos_in_node_t item_length_by_coord(const coord_t * coord);
59480 +extern item_type_id item_type_by_coord(const coord_t * coord);
59481 +extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
59482 +extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
59483 +extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
59484 +extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
59485 +extern reiser4_key *max_unit_key_by_coord(const coord_t * coord, reiser4_key * key);
59486 +
59487 +extern void obtain_item_plugin(const coord_t * coord);
59488 +
59489 +#if defined(REISER4_DEBUG) || defined(REISER4_DEBUG_MODIFY) || defined(REISER4_DEBUG_OUTPUT)
59490 +extern int znode_is_loaded(const znode * node);
59491 +#endif
59492 +
59493 +/* return plugin of item at @coord; the node of @coord must be loaded */
59494 +static inline item_plugin *
59495 +item_plugin_by_coord(const coord_t * coord /* coord to query */ )
59496 +{
59497 +       assert("nikita-330", coord != NULL);
59498 +       assert("nikita-331", coord->node != NULL);
59499 +       assert("nikita-332", znode_is_loaded(coord->node));
59500 +       trace_stamp(TRACE_TREE);
59501 +       /* unlikely path: no plugin id in @coord yet, so obtain_item_plugin() is called before the lookup */
59502 +       if (unlikely(!coord_is_iplug_set(coord)))
59503 +               obtain_item_plugin(coord);
59504 +       return coord_iplug(coord);
59505 +}
59506 +
59507 +/* returns non-zero if the item at @item is of INTERNAL_ITEM_TYPE; @item must be an existing item */
59508 +static inline int
59509 +item_is_internal(const coord_t * item)
59510 +{
59511 +       assert("vs-483", coord_is_existing_item(item));
59512 +       return item_type_by_coord(item) == INTERNAL_ITEM_TYPE;
59513 +}
59514 +
59515 +extern void item_body_by_coord_hard(coord_t * coord);
59516 +extern void *item_body_by_coord_easy(const coord_t * coord);
59517 +#if REISER4_DEBUG
59518 +extern int item_body_is_valid(const coord_t * coord);
59519 +#endif
59520 +
59521 +/* return pointer to item body; the node of @coord must be loaded */
59522 +static inline void *
59523 +item_body_by_coord(const coord_t * coord /* coord to query */ )
59524 +{
59525 +       assert("nikita-324", coord != NULL);
59526 +       assert("nikita-325", coord->node != NULL);
59527 +       assert("nikita-326", znode_is_loaded(coord->node));
59528 +       trace_stamp(TRACE_TREE);
59529 +       /* offset not known yet: the _hard variant gets a non-const coord, presumably to fill it in -- confirm */
59530 +       if (coord->offset == INVALID_OFFSET)
59531 +               item_body_by_coord_hard((coord_t *)coord);
59532 +       assert("nikita-3201", item_body_is_valid(coord));
59533 +       assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
59534 +       return item_body_by_coord_easy(coord);
59535 +}
59536 +
59537 +/* __REISER4_ITEM_H__ */
59538 +#endif
59539 +/* Make Linus happy.
59540 +   Local variables:
59541 +   c-indentation-style: "K&R"
59542 +   mode-name: "LC"
59543 +   c-basic-offset: 8
59544 +   tab-width: 8
59545 +   fill-column: 120
59546 +   scroll-step: 1
59547 +   End:
59548 +*/
59549 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/sde.c linux-2.6.8-rc3-a/fs/reiser4/plugin/item/sde.c
59550 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/sde.c        1970-01-01 03:00:00.000000000 +0300
59551 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/sde.c      2004-08-05 21:20:53.348604545 +0400
59552 @@ -0,0 +1,213 @@
59553 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59554 +
59555 +/* Directory entry implementation */
59556 +#include "../../forward.h"
59557 +#include "../../debug.h"
59558 +#include "../../dformat.h"
59559 +#include "../../kassign.h"
59560 +#include "../../coord.h"
59561 +#include "sde.h"
59562 +#include "item.h"
59563 +#include "../plugin.h"
59564 +#include "../../znode.h"
59565 +#include "../../carry.h"
59566 +#include "../../tree.h"
59567 +#include "../../inode.h"
59568 +
59569 +#include <linux/fs.h>          /* for struct inode */
59570 +#include <linux/dcache.h>      /* for struct dentry */
59571 +#include <linux/quotaops.h>
59572 +
59573 +#if REISER4_DEBUG_OUTPUT
59574 +reiser4_internal void
59575 +print_de(const char *prefix /* prefix to print */ ,
59576 +        coord_t * coord /* item to print */ )
59577 +{
59578 +       assert("nikita-1456", prefix != NULL);
59579 +       assert("nikita-1457", coord != NULL);
59580 +       /* debugging aid: print entry name and stat-data key; %i arguments are cast to int as sizeof yields size_t */
59581 +       if (item_length_by_coord(coord) < (int) sizeof (directory_entry_format)) {
59582 +               printk("%s: wrong size: %i < %i\n", prefix, (int) item_length_by_coord(coord), (int) sizeof (directory_entry_format));
59583 +       } else {
59584 +               reiser4_key sdkey;
59585 +               char *name;
59586 +               char buf[DE_NAME_BUF_LEN];
59587 +
59588 +               extract_key_de(coord, &sdkey);
59589 +               name = extract_name_de(coord, buf);
59590 +               printk("%s: name: %s\n", prefix, name);
59591 +               print_key("\tsdkey", &sdkey);
59592 +       }
59593 +}
59594 +#endif
59595 +
59596 +/* ->extract_key() method of simple directory item plugin: decodes dent->id into @key. */
59597 +reiser4_internal int
59598 +extract_key_de(const coord_t * coord /* coord of item */ ,
59599 +              reiser4_key * key /* resulting key */ )
59600 +{
59601 +       directory_entry_format *dent;
59602 +
59603 +       assert("nikita-1458", coord != NULL);
59604 +       assert("nikita-1459", key != NULL);
59605 +       /* the item body is read as a directory_entry_format; the result is whatever extract_key_from_id() returns */
59606 +       dent = (directory_entry_format *) item_body_by_coord(coord);
59607 +       assert("nikita-1158", item_length_by_coord(coord) >= (int) sizeof *dent);
59608 +       return extract_key_from_id(&dent->id, key);
59609 +}
59610 +
59611 +reiser4_internal int
59612 +update_key_de(const coord_t * coord, const reiser4_key * key, lock_handle * lh UNUSED_ARG)
59613 +{
59614 +       directory_entry_format *dent;
59615 +       obj_key_id obj_id;
59616 +       int result;
59617 +
59618 +       assert("nikita-2342", coord != NULL);
59619 +       assert("nikita-2343", key != NULL);
59620 +       /* replace the id stored in the entry at @coord with one built from @key; @lh is unused */
59621 +       dent = (directory_entry_format *) item_body_by_coord(coord);
59622 +       result = build_obj_key_id(key, &obj_id);
59623 +       if (result == 0) {
59624 +               dent->id = obj_id;
59625 +               znode_make_dirty(coord->node);  /* the node content was changed */
59626 +       }
59627 +       return result;  /* 0 on success; a build_obj_key_id() failure is no longer reported as success */
59628 +}
59629 +
59630 +/* Name of the entry at @coord: @dent->name for long-name keys, "." for the dot key, else extract_name_from_key(). */
59631 +reiser4_internal char *
59632 +extract_dent_name(const coord_t * coord, directory_entry_format *dent, char *buf)
59633 +{
59634 +       reiser4_key unit_key;
59635 +
59636 +       unit_key_by_coord(coord, &unit_key);
59637 +       /* an unexpected key type is only reported, processing continues */
59638 +       if (get_key_type(&unit_key) != KEY_FILE_NAME_MINOR)
59639 +               print_address("oops", znode_get_block(coord->node));
59640 +       if (is_longname_key(&unit_key))
59641 +               return (char *) dent->name;
59642 +       if (is_dot_key(&unit_key))
59643 +               return (char *) ".";
59644 +       return extract_name_from_key(&unit_key, buf);
59645 +}
59646 +
59647 +/* ->extract_name() method of simple directory item plugin: returns the entry's name via extract_dent_name(). */
59648 +reiser4_internal char *
59649 +extract_name_de(const coord_t * coord /* coord of item */, char *buf)
59650 +{
59651 +       directory_entry_format *dent;
59652 +
59653 +       assert("nikita-1460", coord != NULL);
59654 +
59655 +       dent = (directory_entry_format *) item_body_by_coord(coord);
59656 +       return extract_dent_name(coord, dent, buf);     /* @buf is only forwarded, never touched here */
59657 +}
59658 +
59659 +/* ->extract_file_type() method of simple directory item plugin. Always reports DT_UNKNOWN. */
59660 +reiser4_internal unsigned
59661 +extract_file_type_de(const coord_t * coord UNUSED_ARG  /* coord of
59662 +                                                          * item */ )
59663 +{
59664 +       assert("nikita-1764", coord != NULL);
59665 +       /* we don't store file type in the directory entry yet.
59666 +
59667 +          But see comments at kassign.h:obj_key_id
59668 +       */
59669 +       return DT_UNKNOWN;
59670 +}
59671 +
59672 +/* Insert a simple directory entry for @de into @dir at @coord; the quota charge is refunded if insertion fails. */
59673 +reiser4_internal int
59674 +add_entry_de(struct inode *dir /* directory of item */ ,
59675 +            coord_t * coord /* coord of item */ ,
59676 +            lock_handle * lh /* insertion lock handle */ ,
59677 +            const struct dentry *de /* name to add */ ,
59678 +            reiser4_dir_entry_desc * entry     /* parameters of new directory
59679 +                                                * entry */ )
59680 +{
59681 +       reiser4_item_data data;
59682 +       directory_entry_format *dent;
59683 +       int result;
59684 +       const char *name;
59685 +       int len;
59686 +       int longname;
59687 +
59688 +       name = de->d_name.name;
59689 +       len  = de->d_name.len;
59690 +       assert("nikita-1163", strlen(name) == len);
59691 +       longname = is_longname(name, len);
59692 +
59693 +       data.length = sizeof *dent;
59694 +       if (longname)
59695 +               data.length += len + 1; /* long names are stored in the item body, 0-terminated */
59696 +       data.data = NULL;
59697 +       data.user = 0;
59698 +       data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
59699 +
59700 +       /* NOTE-NIKITA quota plugin */
59701 +       if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length))
59702 +               return -EDQUOT;
59703 +       result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
59704 +       if (result != 0) {
59705 +               DQUOT_FREE_SPACE_NODIRTY(dir, data.length);     /* undo the charge taken above */
59706 +               return result;
59707 +       }
59708 +       dent = (directory_entry_format *) item_body_by_coord(coord);
59709 +       build_inode_key_id(entry->obj, &dent->id);
59710 +       if (longname) {
59711 +               xmemcpy(dent->name, name, len);
59712 +               cputod8(0, &dent->name[len]);
59713 +       }
59714 +       return 0;
59715 +}
59716 +/* Remove the directory entry item at @coord from @dir; on success its length is released from the quota of @dir. */
59717 +reiser4_internal int
59718 +rem_entry_de(struct inode *dir /* directory of item */ ,
59719 +            const struct qstr * name UNUSED_ARG,
59720 +            coord_t * coord /* coord of item */ ,
59721 +            lock_handle * lh UNUSED_ARG        /* lock handle for
59722 +                                                  * removal */ ,
59723 +            reiser4_dir_entry_desc * entry UNUSED_ARG  /* parameters of
59724 +                                                        * directory entry
59725 +                                                        * being removed */ )
59726 +{
59727 +       coord_t shadow;
59728 +       int result;
59729 +       int length;
59730 +
59731 +       length = item_length_by_coord(coord);   /* read before the item is killed; used for the quota release */
59732 +       if (inode_get_bytes(dir) < length) {
59733 +               warning("nikita-2627", "Dir is broke: %llu: %llu", get_inode_oid(dir), inode_get_bytes(dir));
59734 +               return RETERR(-EIO);
59735 +       }
59736 +
59737 +       /* cut_node() is supposed to take pointers to _different_
59738 +          coords, because it will modify them without respect to
59739 +          possible aliasing. To work around this, create temporary copy
59740 +          of @coord. NOTE(review): the call below is kill_node_content();
59741 +          presumably the same restriction applies to it. */
59742 +       coord_dup(&shadow, coord);
59743 +       result = kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL);
59744 +       if (result == 0) {
59745 +               /* NOTE-NIKITA quota plugin */
59746 +               DQUOT_FREE_SPACE_NODIRTY(dir, length);
59747 +       }
59748 +       return result;
59749 +}
59750 +/* Longest name for @dir: max item size of the tree's node plugin less sizeof(directory_entry_format) and 2 bytes. */
59751 +reiser4_internal int
59752 +max_name_len_de(const struct inode *dir)
59753 +{
59754 +       return tree_by_inode(dir)->nplug->max_item_size() - sizeof (directory_entry_format) - 2;
59755 +}
59756 +
59757 +/* Make Linus happy.
59758 +   Local variables:
59759 +   c-indentation-style: "K&R"
59760 +   mode-name: "LC"
59761 +   c-basic-offset: 8
59762 +   tab-width: 8
59763 +   fill-column: 120
59764 +   End:
59765 +*/
59766 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/sde.h linux-2.6.8-rc3-a/fs/reiser4/plugin/item/sde.h
59767 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/sde.h        1970-01-01 03:00:00.000000000 +0300
59768 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/sde.h      2004-08-05 21:20:53.096657687 +0400
59769 @@ -0,0 +1,64 @@
59770 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59771 +
59772 +/* Directory entry. */
59773 +
59774 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
59775 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
59776 +
59777 +#include "../../forward.h"
59778 +#include "../../dformat.h"
59779 +#include "../../kassign.h"
59780 +#include "../../key.h"
59781 +
59782 +#include <linux/fs.h>
59783 +#include <linux/dcache.h>      /* for struct dentry */
59784 +
59785 +typedef struct directory_entry_format {
59786 +       /* key of object stat-data. It's not necessary to store whole
59787 +          key here, because it's always key of stat-data, so minor
59788 +          packing locality and offset can be omitted here. But this
59789 +          relies on particular key allocation scheme for stat-data, so,
59790 +          for extensibility sake, whole key can be stored here.
59791 +
59792 +          We store key as array of bytes, because we don't want 8-byte
59793 +          alignment of dir entries.
59794 +       */
59795 +       obj_key_id id;
59796 +       /* file name. Null terminated string. Only stored for long names, see add_entry_de() in sde.c. */
59797 +       d8 name[0];
59798 +} directory_entry_format;
59799 +
59800 +void print_de(const char *prefix, coord_t * coord);    /* defined in sde.c only if REISER4_DEBUG_OUTPUT */
59801 +int extract_key_de(const coord_t * coord, reiser4_key * key);
59802 +int update_key_de(const coord_t * coord, const reiser4_key * key, lock_handle * lh);
59803 +char *extract_name_de(const coord_t * coord, char *buf);
59804 +unsigned extract_file_type_de(const coord_t * coord);
59805 +int add_entry_de(struct inode *dir, coord_t * coord,
59806 +                lock_handle * lh, const struct dentry *name, reiser4_dir_entry_desc * entry);
59807 +int rem_entry_de(struct inode *dir, const struct qstr * name, coord_t * coord, lock_handle * lh, reiser4_dir_entry_desc * entry);
59808 +int max_name_len_de(const struct inode *dir);
59809 +
59810 +
59811 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
59812 +
59813 +char *extract_dent_name(const coord_t * coord,
59814 +                       directory_entry_format *dent, char *buf);
59815 +/* size of the scratch buffer passed as @buf to extract_name_de(), see print_de() in sde.c */
59816 +#if REISER4_LARGE_KEY
59817 +#define DE_NAME_BUF_LEN (24)
59818 +#else
59819 +#define DE_NAME_BUF_LEN (16)
59820 +#endif
59821 +
59822 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
59823 +#endif
59824 +
59825 +/* Make Linus happy.
59826 +   Local variables:
59827 +   c-indentation-style: "K&R"
59828 +   mode-name: "LC"
59829 +   c-basic-offset: 8
59830 +   tab-width: 8
59831 +   fill-column: 120
59832 +   End:
59833 +*/
59834 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/static_stat.c linux-2.6.8-rc3-a/fs/reiser4/plugin/item/static_stat.c
59835 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/static_stat.c        1970-01-01 03:00:00.000000000 +0300
59836 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/static_stat.c      2004-08-05 21:20:52.923694169 +0400
59837 @@ -0,0 +1,1306 @@
59838 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59839 +
59840 +/* stat data manipulation. */
59841 +
59842 +#include "../../forward.h"
59843 +#include "../../super.h"
59844 +#include "../../vfs_ops.h"
59845 +#include "../../inode.h"
59846 +#include "../../debug.h"
59847 +#include "../../dformat.h"
59848 +#include "../object.h"
59849 +#include "../plugin.h"
59850 +#include "../plugin_header.h"
59851 +#include "static_stat.h"
59852 +#include "item.h"
59853 +
59854 +#include <linux/types.h>
59855 +#include <linux/fs.h>
59856 +
59857 +/* see static_stat.h for explanation */
59858 +
59859 +/* helper used while dumping/loading inode/plugin state to/from the stat-data: consumes @size_of bytes,
59860 +    i.e. advances *@area and shrinks *@length by that amount. Asserts that *@length stays non-negative. */
59861 +
59862 +static void
59863 +move_on(int *length /* space remaining in stat-data */ ,
59864 +       char **area /* current coord in stat data */ ,
59865 +       int size_of /* how many bytes to move forward */ )
59866 +{
59867 +       assert("nikita-615", length != NULL);
59868 +       assert("nikita-616", area != NULL);
59869 +
59870 +       *length -= size_of;
59871 +       *area += size_of;
59872 +
59873 +       assert("nikita-617", *length >= 0);
59874 +}
59875 +
59876 +#if REISER4_DEBUG_OUTPUT
59877 +/* ->print() method of static sd item. Prints the extension mask of the sd at @coord
59878 +   and lets every present extension that has a ->print() method dump itself */
59879 +reiser4_internal void
59880 +print_sd(const char *prefix /* prefix to print */ ,
59881 +        coord_t * coord /* coord of item */ )
59882 +{
59883 +       char *sd;
59884 +       int len;
59885 +       int bit;
59886 +       int chunk;
59887 +       __u16 mask;
59888 +       reiser4_stat_data_base *sd_base;
59889 +
59890 +       assert("nikita-1254", prefix != NULL);
59891 +       assert("nikita-1255", coord != NULL);
59892 +
59893 +       sd = item_body_by_coord(coord);
59894 +       len = item_length_by_coord(coord);
59895 +
59896 +       sd_base = (reiser4_stat_data_base *) sd;
59897 +       if (len < (int) sizeof *sd_base) {
59898 +               printk("%s: wrong size: %i < %i\n", prefix, item_length_by_coord(coord), (int) sizeof *sd_base);
59899 +               return;
59900 +       }
59901 +
59902 +       mask = d16tocpu(&sd_base->extmask);
59903 +       printk("%s: extmask: %x\n", prefix, mask);
59904 +
59905 +       move_on(&len, &sd, sizeof *sd_base);
59906 +
59907 +       for (bit = 0, chunk = 0; mask != 0; ++bit, mask >>= 1) {
59908 +               if (((bit + 1) % 16) != 0) {
59909 +                       /* handle extension */
59910 +                       sd_ext_plugin *sdplug;
59911 +
59912 +                       sdplug = sd_ext_plugin_by_id(bit);
59913 +                       if (sdplug == NULL) {
59914 +                               continue;
59915 +                       }
59916 +                       if ((mask & 1) && sdplug->print != NULL) {
59917 +                               /* alignment is not supported in node layout
59918 +                                  plugin yet.
59919 +                                result = align( inode, &len, &sd,
59920 +                                sdplug -> alignment );
59921 +                                if( result != 0 )
59922 +                                return result; */
59923 +                               sdplug->print(prefix, &sd, &len);
59924 +                       }
59925 +               } else if (mask & 1) {
59926 +                       /* next portion of bitmask */
59927 +                       if (len < (int) sizeof (d16)) {
59928 +                               warning("nikita-2708", "No space for bitmap");
59929 +                               break;
59930 +                       }
59931 +                       mask = d16tocpu((d16 *) sd);
59932 +                       move_on(&len, &sd, sizeof (d16));
59933 +                       ++chunk;
59934 +                       if (chunk == 3) {
59935 +                               if (!(mask & 0x8000)) {
59936 +                                       /* clear last bit */
59937 +                                       mask &= ~0x8000;
59938 +                                       continue;
59939 +                               }
59940 +                               /* too much */
59941 +                               warning("nikita-2709", "Too many extensions");
59942 +                               break;
59943 +                       }
59944 +               } else
59945 +                       /* bitmask exhausted */
59946 +                       break;
59947 +       }
59948 +}
59949 +#endif
59950 +/* Bumps a counter in the sd_stat at @vp. NOTE(review): mode is forced to 0 below, so only ->others can grow. */
59951 +reiser4_internal void
59952 +item_stat_static_sd(const coord_t * coord, void *vp)
59953 +{
59954 +       reiser4_stat_data_base *sd;
59955 +       mode_t mode;
59956 +       sd_stat *stat;
59957 +
59958 +       stat = (sd_stat *) vp;
59959 +       sd = (reiser4_stat_data_base *) item_body_by_coord(coord);
59960 +       mode = 0;               // d16tocpu( &sd -> mode );
59961 +
59962 +       if (S_ISREG(mode))
59963 +               stat->files++;
59964 +       else if (S_ISDIR(mode))
59965 +               stat->dirs++;
59966 +       else
59967 +               stat->others++;
59968 +}
59969 +
59970 +/* helper function used while loading inode/plugin state from stat-data.
59971 +    Complain if there is less space in stat-data than was expected; @where names what was being
59972 +    loaded. Can only happen on disk corruption. Always returns RETERR(-EINVAL). */
59973 +static int
59974 +not_enough_space(struct inode *inode /* object being processed */ ,
59975 +                const char *where /* error message */ )
59976 +{
59977 +       assert("nikita-618", inode != NULL);
59978 +
59979 +       warning("nikita-619", "Not enough space in %llu while loading %s", get_inode_oid(inode), where);
59980 +       return RETERR(-EINVAL);
59981 +}
59982 +
59983 +/* helper function used while loading inode/plugin state from stat-data. Call it if invalid
59984 +    plugin id was found: warns with @id and the oid of @inode, then returns RETERR(-EINVAL). */
59985 +static int
59986 +unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
59987 +              struct inode *inode /* object being processed */ )
59988 +{
59989 +       warning("nikita-620", "Unknown plugin %i in %llu", id, get_inode_oid(inode));
59990 +       return RETERR(-EINVAL);
59991 +}
59992 +
59993 +#if 0 /* Item alignment is not yet supported  */
59994 +
59995 +/* helper function used while storing/loading inode/plugin data to/from
59996 +    stat-data. Move current coord in stat-data ("area") to position aligned up to "alignment"
59997 +    bytes. Returns 0, or the not_enough_space() error if the padding does not fit. Compiled out. */
59998 +static int
59999 +align(struct inode *inode /* object being processed */ ,
60000 +      int *length /* space remaining in stat-data */ ,
60001 +      char **area /* current coord in stat data */ ,
60002 +      int alignment /* required alignment */ )
60003 +{
60004 +       int delta;
60005 +
60006 +       assert("nikita-621", inode != NULL);
60007 +       assert("nikita-622", length != NULL);
60008 +       assert("nikita-623", area != NULL);
60009 +       assert("nikita-624", alignment > 0);
60010 +
60011 +       delta = round_up(*area, alignment) - *area;
60012 +       if (delta > *length)
60013 +               return not_enough_space(inode, "padding");
60014 +       if (delta > 0)
60015 +               move_on(length, area, delta);
60016 +       return 0;
60017 +}
60018 +
60019 +#endif /* 0 */
60020 +
60021 +/* this is installed as ->init_inode() method of
60022 +    item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
60023 +    Copies data from on-disk stat-data format into inode.
60024 +    Handles stat-data extensions. Returns 0 or the first error hit. */
60025 +/* was sd_load */
60026 +reiser4_internal int
60027 +init_inode_static_sd(struct inode *inode /* object being processed */ ,
60028 +       char *sd /* stat-data body */ ,
60029 +       int len /* length of stat-data */ )
60030 +{
60031 +       int result;
60032 +       int bit;
60033 +       int chunk;
60034 +       __u16 mask;
60035 +       __u64 bigmask;
60036 +       reiser4_stat_data_base *sd_base;
60037 +       reiser4_inode *state;
60038 +
60039 +       assert("nikita-625", inode != NULL);
60040 +       assert("nikita-626", sd != NULL);
60041 +
60042 +       result = 0;
60043 +       sd_base = (reiser4_stat_data_base *) sd;
60044 +       state = reiser4_inode_data(inode);
60045 +       mask = d16tocpu(&sd_base->extmask);
60046 +       bigmask = mask;
60047 +       inode_set_flag(inode, REISER4_SDLEN_KNOWN);
60048 +
60049 +       move_on(&len, &sd, sizeof *sd_base);
60050 +       for (bit = 0, chunk = 0; mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION; ++bit, mask >>= 1) {
60051 +               if (((bit + 1) % 16) != 0) {
60052 +                       /* handle extension */
60053 +                       sd_ext_plugin *sdplug;
60054 +
60055 +                       sdplug = sd_ext_plugin_by_id(bit);
60056 +                       if (sdplug == NULL) {
60057 +                               warning("nikita-627", "No such extension %i in inode %llu", bit, get_inode_oid(inode));
60058 +                               result = RETERR(-EINVAL);
60059 +                               break;
60060 +                       }
60061 +                       if (mask & 1) {
60062 +                               assert("nikita-628", sdplug->present);
60063 +                               /* alignment is not supported in node layout
60064 +                                  plugin yet.
60065 +                                result = align( inode, &len, &sd,
60066 +                                sdplug -> alignment );
60067 +                                if( result != 0 )
60068 +                                return result; */
60069 +                               result = sdplug->present(inode, &sd, &len);
60070 +                       } else if (sdplug->absent != NULL)
60071 +                               result = sdplug->absent(inode);
60072 +                       if (result)
60073 +                               break;
60074 +                       /* else, we are looking at the last bit in 16-bit
60075 +                          portion of bitmask */
60076 +               } else if (mask & 1) {
60077 +                       /* next portion of bitmask */
60078 +                       if (len < (int) sizeof (d16)) {
60079 +                               warning("nikita-629", "No space for bitmap in inode %llu", get_inode_oid(inode));
60080 +                               result = RETERR(-EINVAL);
60081 +                               break;
60082 +                       }
60083 +                       mask = d16tocpu((d16 *) sd);
60084 +                       bigmask <<= 16;
60085 +                       bigmask |= mask;
60086 +                       move_on(&len, &sd, sizeof (d16));
60087 +                       ++chunk;
60088 +                       if (chunk == 3) {
60089 +                               if (!(mask & 0x8000)) {
60090 +                                       /* clear last bit */
60091 +                                       mask &= ~0x8000;
60092 +                                       continue;
60093 +                               }
60094 +                               /* too much */
60095 +                               warning("nikita-630", "Too many extensions in %llu", get_inode_oid(inode));
60096 +                               result = RETERR(-EINVAL);
60097 +                               break;
60098 +                       }
60099 +               } else
60100 +                       /* bitmask exhausted */
60101 +                       break;
60102 +       }
60103 +       state->extmask = bigmask;
60104 +       /* common initialisations */
60105 +       inode->i_blksize = get_super_private(inode->i_sb)->optimal_io_size;
60106 +       if (len > (int) (sizeof (d16) * bit / 16))
60107 +               /* alignment in save_len_static_sd() is taken into account -edward.
60108 +                  Compared as signed ints: an unsigned difference is also "> 0" when the item is too short */
60109 +               warning("nikita-631", "unused space in inode %llu", get_inode_oid(inode));
60110 +       return result;
60111 +}
60112 +
60113 +/* estimates size of stat-data required to store inode: base stat-data plus ->save_len() of every
60114 +    extension set in the inode's extmask plus sizeof (d16) * bit / 16. Installed as ->save_len() method of
60115 +    item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
60116 +/* was sd_len */
60117 +reiser4_internal int
60118 +save_len_static_sd(struct inode *inode /* object being processed */ )
60119 +{
60120 +       unsigned int result;
60121 +       __u64 mask;
60122 +       int bit;
60123 +
60124 +       assert("nikita-632", inode != NULL);
60125 +
60126 +       result = sizeof (reiser4_stat_data_base);
60127 +       mask = reiser4_inode_data(inode)->extmask;
60128 +       for (bit = 0; mask != 0; ++bit, mask >>= 1) {
60129 +               if (mask & 1) {
60130 +                       sd_ext_plugin *sdplug;
60131 +
60132 +                       sdplug = sd_ext_plugin_by_id(bit);
60133 +                       assert("nikita-633", sdplug != NULL);
60134 +                       /* no alignment support
60135 +                          result +=
60136 +                          round_up( result, sdplug -> alignment ) - result; */
60137 +                       result += sdplug->save_len(inode);
60138 +               }
60139 +       }
60140 +       result += sizeof (d16) * bit / 16;      /* init_inode_static_sd() allows for the same term when loading */
60141 +       return result;
60142 +}
60143 +
60144 +/* saves inode into stat-data: stores the low 16 bits of the extension mask, then for every set bit either
60145 +    calls ->save() of that extension or (every 16th bit) stores another d16 of mask, advancing *@area.
60146 +    Installed as ->save() method of item_plugins[ STATIC_STAT_DATA_IT ]. Returns 0 or first ->save() error. */
60147 +/* was sd_save */
60148 +reiser4_internal int
60149 +save_static_sd(struct inode *inode /* object being processed */ ,
60150 +              char **area /* where to save stat-data */ )
60151 +{
60152 +       int result;
60153 +       __u64 emask;
60154 +       int bit;
60155 +       unsigned int len;
60156 +       reiser4_stat_data_base *sd_base;
60157 +
60158 +       assert("nikita-634", inode != NULL);
60159 +       assert("nikita-635", area != NULL);
60160 +
60161 +       result = 0;
60162 +       emask = reiser4_inode_data(inode)->extmask;
60163 +       sd_base = (reiser4_stat_data_base *) * area;
60164 +       cputod16((unsigned) (emask & 0xffff), &sd_base->extmask);
60165 +
60166 +       *area += sizeof *sd_base;
60167 +       len = 0xffffffffu;      /* only referenced by the disabled align() call in the comment below */
60168 +       for (bit = 0; emask != 0; ++bit, emask >>= 1) {
60169 +               if (emask & 1) {
60170 +                       if ((bit + 1) % 16 != 0) {
60171 +                               sd_ext_plugin *sdplug;
60172 +                               sdplug = sd_ext_plugin_by_id(bit);
60173 +                               assert("nikita-636", sdplug != NULL);
60174 +                               /* no alignment support yet
60175 +                                  align( inode, &len, area,
60176 +                                  sdplug -> alignment ); */
60177 +                               result = sdplug->save(inode, area);
60178 +                               if (result)
60179 +                                       break;
60180 +                       } else {
60181 +                               cputod16((unsigned) (emask & 0xffff), (d16 *) * area);
60182 +                               *area += sizeof (d16);
60183 +                       }
60184 +               }
60185 +       }
60186 +       return result;
60187 +}
60188 +
60189 +/* stat-data extension handling functions. */
60190 +
60191 +static int
60192 +present_lw_sd(struct inode *inode /* object being processed */ ,
60193 +             char **area /* position in stat-data */ ,
60194 +             int *len /* remaining length */ )
60195 +{
60196 +       if (*len >= (int) sizeof (reiser4_light_weight_stat)) {
60197 +               reiser4_light_weight_stat *sd_lw;
60198 +
60199 +               sd_lw = (reiser4_light_weight_stat *) * area;
60200 +
60201 +               inode->i_mode = d16tocpu(&sd_lw->mode);
60202 +               inode->i_nlink = d32tocpu(&sd_lw->nlink);
60203 +               inode->i_size = d64tocpu(&sd_lw->size);
60204 +               if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
60205 +                       inode->i_mode &= ~S_IFIFO;
60206 +                       inode_set_flag(inode, REISER4_PART_CONV);
60207 +               }
60208 +               move_on(len, area, sizeof *sd_lw);
60209 +               return 0;
60210 +       } else
60211 +               return not_enough_space(inode, "lw sd");
60212 +}
60213 +/* Space needed to save the light-weight stat: always sizeof (reiser4_light_weight_stat). */
60214 +static int
60215 +save_len_lw_sd(struct inode *inode UNUSED_ARG  /* object being
60216 +                                                * processed */ )
60217 +{
60218 +       return sizeof (reiser4_light_weight_stat);
60219 +}
60220 +/* Writes mode, nlink and size of @inode at *@area and advances *@area past them. Always returns 0. */
60221 +static int
60222 +save_lw_sd(struct inode *inode /* object being processed */ ,
60223 +          char **area /* position in stat-data */ )
60224 +{
60225 +       reiser4_light_weight_stat *sd;
60226 +       mode_t delta;
60227 +
60228 +       assert("nikita-2705", inode != NULL);
60229 +       assert("nikita-2706", area != NULL);
60230 +       assert("nikita-2707", *area != NULL);
60231 +
60232 +       sd = (reiser4_light_weight_stat *) * area;
60233 +       /* REISER4_PART_CONV is stored as S_IFIFO or-ed into the mode; present_lw_sd() undoes this on load */
60234 +       delta = inode_get_flag(inode, REISER4_PART_CONV) ? S_IFIFO : 0;
60235 +       cputod16(inode->i_mode | delta, &sd->mode);
60236 +       cputod32(inode->i_nlink, &sd->nlink);
60237 +       cputod64((__u64) inode->i_size, &sd->size);
60238 +       *area += sizeof *sd;
60239 +       return 0;
60240 +}
60241 +/* Debug output: printk mode, nlink and size of the light-weight stat at *@area, then step over it. */
60242 +#if REISER4_DEBUG_OUTPUT
60243 +static void
60244 +print_lw_sd(const char *prefix, char **area /* position in stat-data */ ,
60245 +           int *len /* remaining length */ )
60246 +{
60247 +       reiser4_light_weight_stat *sd;
60248 +
60249 +       sd = (reiser4_light_weight_stat *) * area;
60250 +       printk("%s: mode: %o, nlink: %i, size: %llu\n", prefix,
60251 +              d16tocpu(&sd->mode), d32tocpu(&sd->nlink), d64tocpu(&sd->size));
60252 +       move_on(len, area, sizeof *sd);
60253 +}
60254 +#endif
60255 +
60256 +static int
60257 +present_unix_sd(struct inode *inode /* object being processed */ ,
60258 +               char **area /* position in stat-data */ ,
60259 +               int *len /* remaining length */ )
60260 +{
60261 +       assert("nikita-637", inode != NULL);
60262 +       assert("nikita-638", area != NULL);
60263 +       assert("nikita-639", *area != NULL);
60264 +       assert("nikita-640", len != NULL);
60265 +       assert("nikita-641", *len > 0);
60266 +
60267 +       if (*len >= (int) sizeof (reiser4_unix_stat)) {
60268 +               reiser4_unix_stat *sd;
60269 +
60270 +               sd = (reiser4_unix_stat *) * area;
60271 +
60272 +               inode->i_uid = d32tocpu(&sd->uid);
60273 +               inode->i_gid = d32tocpu(&sd->gid);
60274 +               inode->i_atime.tv_sec = d32tocpu(&sd->atime);
60275 +               inode->i_mtime.tv_sec = d32tocpu(&sd->mtime);
60276 +               inode->i_ctime.tv_sec = d32tocpu(&sd->ctime);
60277 +               if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
60278 +                       inode->i_rdev = d64tocpu(&sd->u.rdev);
60279 +               else
60280 +                       inode_set_bytes(inode, (loff_t) d64tocpu(&sd->u.bytes));
60281 +               move_on(len, area, sizeof *sd);
60282 +               return 0;
60283 +       } else
60284 +               return not_enough_space(inode, "unix sd");
60285 +}
60286 +/* Fill @inode with defaults: super block default uid/gid, current time, bytes = i_size. Always returns 0. */
60287 +static int
60288 +absent_unix_sd(struct inode *inode /* object being processed */ )
60289 +{
60290 +       inode->i_uid = get_super_private(inode->i_sb)->default_uid;
60291 +       inode->i_gid = get_super_private(inode->i_sb)->default_gid;
60292 +       inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
60293 +       inode_set_bytes(inode, inode->i_size);
60294 +       /* mark inode as lightweight, so that caller (reiser4_lookup) will
60295 +          complete initialisation by copying [ug]id from a parent. */
60296 +       inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
60297 +       return 0;
60298 +}
60299 +
60300 +/* Space needed to save the unix stat: always sizeof (reiser4_unix_stat). Audited by: green(2002.06.14) */
60301 +static int
60302 +save_len_unix_sd(struct inode *inode UNUSED_ARG        /* object being
60303 +                                                * processed */ )
60304 +{
60305 +       return sizeof (reiser4_unix_stat);
60306 +}
60307 +/* Writes uid, gid, [acm]time seconds and rdev-or-bytes of @inode at *@area, advances *@area. Always returns 0. */
60308 +static int
60309 +save_unix_sd(struct inode *inode /* object being processed */ ,
60310 +            char **area /* position in stat-data */ )
60311 +{
60312 +       reiser4_unix_stat *sd;
60313 +
60314 +       assert("nikita-642", inode != NULL);
60315 +       assert("nikita-643", area != NULL);
60316 +       assert("nikita-644", *area != NULL);
60317 +
60318 +       sd = (reiser4_unix_stat *) * area;
60319 +       cputod32(inode->i_uid, &sd->uid);
60320 +       cputod32(inode->i_gid, &sd->gid);
60321 +       cputod32((__u32) inode->i_atime.tv_sec, &sd->atime);
60322 +       cputod32((__u32) inode->i_ctime.tv_sec, &sd->ctime);
60323 +       cputod32((__u32) inode->i_mtime.tv_sec, &sd->mtime);
60324 +       if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))   /* same choice as in present_unix_sd() */
60325 +               cputod64(inode->i_rdev, &sd->u.rdev);
60326 +       else
60327 +               cputod64((__u64) inode_get_bytes(inode), &sd->u.bytes);
60328 +       *area += sizeof *sd;
60329 +       return 0;
60330 +}
60331 +/* Debug output: printk the fields of the unix stat at *@area (both u.rdev and u.bytes), then step over it. */
60332 +#if REISER4_DEBUG_OUTPUT
60333 +static void
60334 +print_unix_sd(const char *prefix, char **area /* position in stat-data */ ,
60335 +             int *len /* remaining length */ )
60336 +{
60337 +       reiser4_unix_stat *sd;
60338 +
60339 +       sd = (reiser4_unix_stat *) * area;
60340 +       printk("%s: uid: %i, gid: %i, atime: %i, mtime: %i, ctime: %i, "
60341 +              "rdev: %llo, bytes: %llu\n", prefix,
60342 +              d32tocpu(&sd->uid),
60343 +              d32tocpu(&sd->gid),
60344 +              d32tocpu(&sd->atime),
60345 +              d32tocpu(&sd->mtime), d32tocpu(&sd->ctime), d64tocpu(&sd->u.rdev), d64tocpu(&sd->u.bytes));
60346 +       move_on(len, area, sizeof *sd);
60347 +}
60348 +#endif
60349 +
60350 +static int
60351 +present_large_times_sd(struct inode *inode /* object being processed */,
60352 +                      char **area /* position in stat-data */,
60353 +                      int *len /* remaining length */)
60354 +{
60355 +       if (*len >= (int) sizeof (reiser4_large_times_stat)) {
60356 +               reiser4_large_times_stat *sd_lt;
60357 +
60358 +               sd_lt = (reiser4_large_times_stat *) * area;
60359 +
60360 +               inode->i_atime.tv_nsec = d32tocpu(&sd_lt->atime);
60361 +               inode->i_mtime.tv_nsec = d32tocpu(&sd_lt->mtime);
60362 +               inode->i_ctime.tv_nsec = d32tocpu(&sd_lt->ctime);
60363 +
60364 +               move_on(len, area, sizeof *sd_lt);
60365 +               return 0;
60366 +       } else
60367 +               return not_enough_space(inode, "large times sd");
60368 +}
60369 +/* Space needed to save the large-times stat: always sizeof (reiser4_large_times_stat). */
60370 +static int
60371 +save_len_large_times_sd(struct inode *inode UNUSED_ARG /* object being processed */ )
60372 +{
60373 +       return sizeof (reiser4_large_times_stat);
60374 +}
60375 +/* store tv_nsec of @inode's atime/ctime/mtime into the large-times extension at *area, advance *area */
60376 +static int
60377 +save_large_times_sd(struct inode *inode /* object being processed */ ,
60378 +                   char **area /* position in stat-data */ )
60379 +{
60380 +       reiser4_large_times_stat *sd;
60381 +
60382 +       assert("nikita-2817", inode != NULL);
60383 +       assert("nikita-2818", area != NULL);
60384 +       assert("nikita-2819", *area != NULL);
60385 +
60386 +       sd = (reiser4_large_times_stat *) * area;
60387 +
60388 +       cputod32((__u32) inode->i_atime.tv_nsec, &sd->atime);
60389 +       cputod32((__u32) inode->i_ctime.tv_nsec, &sd->ctime);
60390 +       cputod32((__u32) inode->i_mtime.tv_nsec, &sd->mtime);
60391 +
60392 +       *area += sizeof *sd;
60393 +       return 0;
60394 +}
60395 +/* debugging: printk() the three nanosecond fields of the large-times extension, then move_on() past it */
60396 +#if REISER4_DEBUG_OUTPUT
60397 +static void
60398 +print_large_times_sd(const char *prefix, char **area /* position in stat-data */,
60399 +                    int *len /* remaining length */ )
60400 +{
60401 +       reiser4_large_times_stat *sd;
60402 +
60403 +       sd = (reiser4_large_times_stat *) * area;
60404 +       printk("%s: nanotimes: a: %i, m: %i, c: %i\n", prefix,
60405 +              d32tocpu(&sd->atime), d32tocpu(&sd->mtime), d32tocpu(&sd->ctime));
60406 +       move_on(len, area, sizeof *sd);
60407 +}
60408 +#endif
60409 +
60410 +/* symlink stat data extension */
60411 +
60412 +/* copy @len bytes of @target plus a terminating 0 into a new buffer attached to inode->u.generic_ip */
60413 +static int
60414 +symlink_target_to_inode(struct inode *inode, const char *target, int len)
60415 +{
60416 +       assert("vs-845", inode->u.generic_ip == 0);
60417 +       assert("vs-846", !inode_get_flag(inode, REISER4_GENERIC_PTR_USED));
60418 +
60419 +       /* FIXME-VS: this is prone to deadlock. Not more than other similar
60420 +          places, though */
60421 +       inode->u.generic_ip = reiser4_kmalloc((size_t) len + 1, GFP_KERNEL);
60422 +       if (!inode->u.generic_ip)
60423 +               return RETERR(-ENOMEM);
60424 +
60425 +       xmemcpy((char *) (inode->u.generic_ip), target, (size_t) len);
60426 +       ((char *) (inode->u.generic_ip))[len] = 0;
60427 +       inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
60428 +       return 0;
60429 +}
60430 +
60431 +/* this is called on read_inode: sanity-check the symlink body stored in stat
60432 +   data (length + 1 bytes must fit into *len) and attach a copy to the inode */
60433 +static int
60434 +present_symlink_sd(struct inode *inode, char **area, int *len)
60435 +{
60436 +       int result;
60437 +       int length;
60438 +       reiser4_symlink_stat *sd;
60439 +
60440 +       length = (int) inode->i_size;
60441 +       /*
60442 +        * *len is number of bytes in stat data item from *area to the end of
60443 +        * item. It must be not less than size of symlink + 1 for ending 0
60444 +        */
60445 +       if (length < 0 || length >= *len)
60446 +               return not_enough_space(inode, "symlink");
60447 +
60448 +       if (*(*area + length) != 0) {
60449 +               warning("vs-840", "Symlink is not zero terminated");
60450 +               return RETERR(-EIO);
60451 +       }
60452 +
60453 +       sd = (reiser4_symlink_stat *) * area;
60454 +       result = symlink_target_to_inode(inode, sd->body, length);
60455 +
60456 +       move_on(len, area, length + 1);
60457 +       return result;
60458 +}
60459 +/* space needed for the symlink extension: inode->i_size bytes of target plus one for the ending 0 */
60460 +static int
60461 +save_len_symlink_sd(struct inode *inode)
60462 +{
60463 +       return inode->i_size + 1;
60464 +}
60465 +
60466 +/* this is called on create and update stat data. On create copy the target into
60467 +   the inode and into stat data; on update do nothing but advance @area */
60468 +static int
60469 +save_symlink_sd(struct inode *inode, char **area)
60470 +{
60471 +       int result;
60472 +       int length;
60473 +       reiser4_symlink_stat *sd;
60474 +
60475 +       length = (int) inode->i_size;
60476 +       /* inode->i_size must be set already */
60477 +       assert("vs-841", length);
60478 +
60479 +       result = 0;
60480 +       sd = (reiser4_symlink_stat *) * area;
60481 +       if (!inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
60482 +               const char *target;
60483 +
60484 +               target = (const char *) (inode->u.generic_ip);
60485 +               inode->u.generic_ip = 0;
60486 +
60487 +               result = symlink_target_to_inode(inode, target, length);
60488 +
60489 +               /* copy symlink to stat data */
60490 +               xmemcpy(sd->body, target, (size_t) length);
60491 +               (*area)[length] = 0;
60492 +       } else {
60493 +               /* there is nothing to do in update but move area */
60494 +               assert("vs-844", !memcmp(inode->u.generic_ip, sd->body, (size_t) length + 1));
60495 +       }
60496 +
60497 +       *area += (length + 1);
60498 +       return result;
60499 +}
60500 +/* debugging: printk() the symlink body as a string, then move_on() past it and its ending 0 */
60501 +#if REISER4_DEBUG_OUTPUT
60502 +static void
60503 +print_symlink_sd(const char *prefix, char **area /* position in stat-data */ ,
60504 +                int *len /* remaining length */ )
60505 +{
60506 +       reiser4_symlink_stat *sd;
60507 +       int length;
60508 +
60509 +       sd = (reiser4_symlink_stat *) * area;
60510 +       length = strlen(sd->body);
60511 +       printk("%s: \"%s\"\n", prefix, sd->body);
60512 +       move_on(len, area, length + 1);
60513 +}
60514 +#endif
60515 +/* load inode->i_flags from the flags extension at *area; not_enough_space() if *len is too small */
60516 +static int
60517 +present_flags_sd(struct inode *inode /* object being processed */ ,
60518 +              char **area /* position in stat-data */ ,
60519 +              int *len /* remaining length */ )
60520 +{
60521 +       assert("nikita-645", inode != NULL);
60522 +       assert("nikita-646", area != NULL);
60523 +       assert("nikita-647", *area != NULL);
60524 +       assert("nikita-648", len != NULL);
60525 +       assert("nikita-649", *len > 0);
60526 +
60527 +       if (*len >= (int) sizeof (reiser4_flags_stat)) {
60528 +               reiser4_flags_stat *sd;
60529 +
60530 +               sd = (reiser4_flags_stat *) * area;
60531 +               inode->i_flags = d32tocpu(&sd->flags);
60532 +               move_on(len, area, sizeof *sd);
60533 +               return 0;
60534 +       } else
60535 +               return not_enough_space(inode, "generation and attrs");
60536 +}
60537 +/* space needed to save the flags extension: always sizeof (reiser4_flags_stat) */
60538 +/* Audited by: green(2002.06.14) */
60539 +static int
60540 +save_len_flags_sd(struct inode *inode UNUSED_ARG       /* object being
60541 +                                                * processed */ )
60542 +{
60543 +       return sizeof (reiser4_flags_stat);
60544 +}
60545 +/* store inode->i_flags into the flags extension at *area and advance *area past it */
60546 +static int
60547 +save_flags_sd(struct inode *inode /* object being processed */ ,
60548 +           char **area /* position in stat-data */ )
60549 +{
60550 +       reiser4_flags_stat *sd;
60551 +
60552 +       assert("nikita-650", inode != NULL);
60553 +       assert("nikita-651", area != NULL);
60554 +       assert("nikita-652", *area != NULL);
60555 +
60556 +       sd = (reiser4_flags_stat *) * area;
60557 +       cputod32(inode->i_flags, &sd->flags);
60558 +       *area += sizeof *sd;
60559 +       return 0;
60560 +}
60561 +/* load plugins listed in the plugin extension at *area into @inode; loaded pset members go to plugin_mask */
60562 +static int absent_plugin_sd(struct inode *inode);
60563 +static int
60564 +present_plugin_sd(struct inode *inode /* object being processed */ ,
60565 +                 char **area /* position in stat-data */ ,
60566 +                 int *len /* remaining length */ )
60567 +{
60568 +       reiser4_plugin_stat *sd;
60569 +       reiser4_plugin *plugin;
60570 +       int i;
60571 +       __u16 mask;
60572 +       int result;
60573 +       int num_of_plugins;
60574 +
60575 +       assert("nikita-653", inode != NULL);
60576 +       assert("nikita-654", area != NULL);
60577 +       assert("nikita-655", *area != NULL);
60578 +       assert("nikita-656", len != NULL);
60579 +       assert("nikita-657", *len > 0);
60580 +
60581 +       if (*len < (int) sizeof (reiser4_plugin_stat))
60582 +               return not_enough_space(inode, "plugin");
60583 +
60584 +       sd = (reiser4_plugin_stat *) * area;
60585 +
60586 +       mask = 0;
60587 +       num_of_plugins = d16tocpu(&sd->plugins_no);
60588 +       move_on(len, area, sizeof *sd);
60589 +       result = 0;
60590 +       for (i = 0; i < num_of_plugins; ++i) {
60591 +               reiser4_plugin_slot *slot;
60592 +               reiser4_plugin_type  type;
60593 +               pset_member          memb;
60594 +
60595 +               slot = (reiser4_plugin_slot *) * area;
60596 +               if (*len < (int) sizeof *slot)
60597 +                       return not_enough_space(inode, "additional plugin");
60598 +
60599 +               memb = d16tocpu(&slot->pset_memb);
60600 +               type = pset_member_to_type_unsafe(memb);
60601 +               if (type == REISER4_PLUGIN_TYPES) {
60602 +                       warning("nikita-3502", "wrong pset member (%i) for %llu",
60603 +                               memb, get_inode_oid(inode));
60604 +                       return RETERR(-EINVAL);
60605 +               }
60606 +               plugin = plugin_by_disk_id(tree_by_inode(inode),
60607 +                                          type, &slot->id);
60608 +               if (plugin == NULL)
60609 +                       return unknown_plugin(d16tocpu(&slot->id), inode);
60610 +
60611 +               /* plugin is loaded into inode, mark this into inode's
60612 +                  bitmask of loaded non-standard plugins */
60613 +               if (!(mask & (1 << memb))) {
60614 +                       mask |= (1 << memb);
60615 +               } else {
60616 +                       warning("nikita-658", "duplicate plugin for %llu", get_inode_oid(inode));
60617 +                       print_plugin("plugin", plugin);
60618 +                       return RETERR(-EINVAL);
60619 +               }
60620 +               move_on(len, area, sizeof *slot);
60621 +               /* load plugin data, if any; fail on the first error of either path */
60622 +               if (plugin->h.pops != NULL && plugin->h.pops->load)
60623 +                       result = plugin->h.pops->load(inode, plugin, area, len);
60624 +               else
60625 +                       result = grab_plugin_from(inode, memb, plugin);
60626 +               if (result != 0)
60627 +                       return result;
60628 +       }
60629 +       /* if object plugin wasn't loaded from stat-data, guess it by
60630 +          mode bits */
60631 +       plugin = file_plugin_to_plugin(inode_file_plugin(inode));
60632 +       if (plugin == NULL)
60633 +               result = absent_plugin_sd(inode);
60634 +
60635 +       reiser4_inode_data(inode)->plugin_mask = mask;
60636 +       return result;
60637 +}
60638 +/* no plugin extension is available: return the result of guess_plugin_by_mode() for @inode */
60639 +/* Audited by: green(2002.06.14) */
60640 +static int
60641 +absent_plugin_sd(struct inode *inode /* object being processed */ )
60642 +{
60643 +       int result;
60644 +
60645 +       assert("nikita-659", inode != NULL);
60646 +
60647 +       result = guess_plugin_by_mode(inode);
60648 +       /* if mode was wrong, guess_plugin_by_mode() returns "regular file",
60649 +          but setup_inode_ops() will call make_bad_inode().
60650 +          Another, more logical but bit more complex solution is to add
60651 +          "bad-file plugin". */
60652 +       /* FIXME-VS: activate was called here */
60653 +       return result;
60654 +}
60655 +
60656 +/* helper function for save_len_plugin_sd(): add to @len the space required
60657 +    to save state of given plugin, if it is marked in inode's plugin_mask */
60658 +/* Audited by: green(2002.06.14) */
60659 +static int
60660 +len_for(reiser4_plugin * plugin /* plugin to save */ ,
60661 +       struct inode *inode /* object being processed */ ,
60662 +       pset_member memb, int len)
60663 +{
60664 +       reiser4_inode *info;
60665 +       assert("nikita-661", inode != NULL);
60666 +
60667 +       info = reiser4_inode_data(inode);
60668 +       if (plugin != NULL && (info->plugin_mask & (1 << memb))) {
60669 +               len += sizeof (reiser4_plugin_slot);
60670 +               if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
60671 +                       /* non-standard plugin, call method */
60672 +                       /* commented as it is incompatible with alignment
60673 +                        * policy in save_plug() -edward */
60674 +                       /* len = round_up(len, plugin->h.pops->alignment); */
60675 +                       len += plugin->h.pops->save_len(inode, plugin);
60676 +               }
60677 +       }
60678 +       return len;
60679 +}
60680 +
60681 +/* calculate how much space is required to save state of all plugins,
60682 +    associated with inode; 0 if inode's plugin_mask is empty */
60683 +static int
60684 +save_len_plugin_sd(struct inode *inode /* object being processed */ )
60685 +{
60686 +       int len;
60687 +       reiser4_inode *state;
60688 +       pset_member memb;
60689 +
60690 +       assert("nikita-663", inode != NULL);
60691 +
60692 +       state = reiser4_inode_data(inode);
60693 +       /* common case: no non-standard plugins */
60694 +       if (state->plugin_mask == 0)
60695 +               return 0;
60696 +       len = sizeof (reiser4_plugin_stat);
60697 +       for (memb = 0; memb < PSET_LAST; ++ memb)
60698 +               len = len_for(pset_get(state->pset, memb), inode, memb, len);
60699 +       assert("nikita-664", len > (int) sizeof (reiser4_plugin_stat));
60700 +       return len;
60701 +}
60702 +
60703 +/* helper function for save_plugin_sd(): write slot (and plugin data, if
60704 +    any) of @plugin at *area, if @memb is marked in inode's plugin_mask */
60705 +static int
60706 +save_plug(reiser4_plugin * plugin /* plugin to save */ ,
60707 +         struct inode *inode /* object being processed */ ,
60708 +         pset_member memb /* what element of pset is saved*/,
60709 +         char **area /* position in stat-data */ ,
60710 +         int *count            /* incremented if plugin were actually
60711 +                                * saved. */ )
60712 +{
60713 +       reiser4_plugin_slot *slot;
60714 +       int fake_len;
60715 +       int result;
60716 +
60717 +       assert("nikita-665", inode != NULL);
60718 +       assert("nikita-666", area != NULL);
60719 +       assert("nikita-667", *area != NULL);
60720 +
60721 +       if (plugin == NULL)
60722 +               return 0;
60723 +       if (!(reiser4_inode_data(inode)->plugin_mask & (1 << memb)))
60724 +               return 0;
60725 +       slot = (reiser4_plugin_slot *) * area;
60726 +       cputod16(memb, &slot->pset_memb);
60727 +       cputod16((unsigned) plugin->h.id, &slot->id);
60728 +       fake_len = (int) 0xffff;
60729 +       move_on(&fake_len, area, sizeof *slot);
60730 +       ++*count;
60731 +       result = 0;
60732 +       if (plugin->h.pops != NULL) {
60733 +               if (plugin->h.pops->save != NULL)
60734 +                       result = plugin->h.pops->save(inode, plugin, area);
60735 +       }
60736 +       return result;
60737 +}
60738 +
60739 +/* save state of all non-standard plugins associated with inode; their number goes to sd->plugins_no */
60740 +static int
60741 +save_plugin_sd(struct inode *inode /* object being processed */ ,
60742 +              char **area /* position in stat-data */ )
60743 +{
60744 +       int result = 0;
60745 +       int num_of_plugins;
60746 +       reiser4_plugin_stat *sd;
60747 +       reiser4_inode *state;
60748 +       int fake_len;
60749 +       pset_member memb;
60750 +
60751 +       assert("nikita-669", inode != NULL);
60752 +       assert("nikita-670", area != NULL);
60753 +       assert("nikita-671", *area != NULL);
60754 +
60755 +       state = reiser4_inode_data(inode);
60756 +       if (state->plugin_mask == 0)
60757 +               return 0;
60758 +       sd = (reiser4_plugin_stat *) * area;
60759 +       fake_len = (int) 0xffff;
60760 +       move_on(&fake_len, area, sizeof *sd);
60761 +
60762 +       num_of_plugins = 0;
60763 +       for (memb = 0; memb < PSET_LAST; ++ memb) {
60764 +               result = save_plug(pset_get(state->pset, memb),
60765 +                                  inode, memb, area, &num_of_plugins);
60766 +               if (result != 0)
60767 +                       break;
60768 +       }
60769 +
60770 +       cputod16((unsigned) num_of_plugins, &sd->plugins_no);
60771 +       return result;
60772 +}
60773 +
60774 +
60775 +/* helper function for present_crypto_sd(). Allocates memory for crypto stat
60776 +   and keyid, copies @tmp there and attaches it to the inode */
60777 +
60778 +static int crypto_stat_to_inode (struct inode *inode,
60779 +                                crypto_stat_t * tmp,
60780 +                                unsigned int size /* fingerprint size */)
60781 +{
60782 +       crypto_stat_t * stat;
60783 +
60784 +       assert ("edward-11", (reiser4_inode_data(inode))->crypt == NULL);
60785 +       assert ("edward-33", !inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
60786 +
60787 +       stat = reiser4_kmalloc(sizeof(*stat), GFP_KERNEL);
60788 +       if (!stat)
60789 +               return RETERR(-ENOMEM);
60790 +       stat->keyid = reiser4_kmalloc((size_t)size, GFP_KERNEL);
60791 +       if (!stat->keyid) {
60792 +               reiser4_kfree(stat);
60793 +               return RETERR(-ENOMEM);
60794 +       }
60795 +       /* load inode crypto-stat */
60796 +       stat->keysize = tmp->keysize;
60797 +       xmemcpy(stat->keyid, tmp->keyid, (size_t)size);
60798 +       reiser4_inode_data(inode)->crypt = stat;
60799 +
60800 +       inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
60801 +       return 0;
60802 +}
60803 +
60804 +/* crypto stat-data extension */
60805 +/* load key size and key id from the crypto extension at *area and attach them to @inode */
60806 +static int present_crypto_sd(struct inode *inode, char **area, int *len)
60807 +{
60808 +       int result;
60809 +       reiser4_crypto_stat *sd;
60810 +       crypto_stat_t stat;
60811 +       digest_plugin * dplug = inode_digest_plugin(inode);
60812 +
60813 +       unsigned int keyid_size;
60814 +
60815 +       assert("edward-06", dplug != NULL);
60816 +       assert("edward-684", dplug->dsize);
60817 +       assert("edward-07", area != NULL);
60818 +       assert("edward-08", *area != NULL);
60819 +       assert("edward-09", len != NULL);
60820 +       assert("edward-10", *len > 0);
60821 +
60822 +       keyid_size = dplug->dsize;
60823 +       /* *len is number of bytes in stat data item from *area to the end of
60824 +          item. It must be not less than size of this extension, key id
60825 +          included. This is checked at run time rather than asserted, so
60826 +          that a short item is rejected even if assert() is compiled out */
60827 +       if (*len < (int) (sizeof (reiser4_crypto_stat) + keyid_size))
60828 +               return not_enough_space(inode, "crypto-sd");
60829 +
60830 +       sd = (reiser4_crypto_stat *) * area;
60831 +       stat.keysize = d16tocpu(&sd->keysize);
60832 +       stat.keyid = (__u8 *)sd->keyid;
60833 +
60834 +       result = crypto_stat_to_inode(inode, &stat, keyid_size);
60835 +       move_on(len, area, sizeof(*sd) + keyid_size);
60836 +       return result;
60837 +}
60838 +/* installed as ->absent method of CRYPTO_STAT: always fails with -EIO */
60839 +static int absent_crypto_sd(struct inode * inode)
60840 +{
60841 +       return -EIO;
60842 +}
60843 +/* space needed for the crypto extension: fixed part plus dsize bytes (inode's digest plugin) of key id */
60844 +static int save_len_crypto_sd(struct inode *inode)
60845 +{
60846 +       return (sizeof(reiser4_crypto_stat) + inode_digest_plugin(inode)->dsize);
60847 +}
60848 +/* if REISER4_CRYPTO_STAT_LOADED is not set, copy key size and key id to disk; always advance *area */
60849 +static int save_crypto_sd(struct inode *inode, char **area)
60850 +{
60851 +       int result = 0;
60852 +       reiser4_crypto_stat *sd;
60853 +       digest_plugin * dplug = inode_digest_plugin(inode);
60854 +
60855 +       assert("edward-12", dplug != NULL);
60856 +       assert("edward-13", area != NULL);
60857 +       assert("edward-14", *area != NULL);
60858 +       assert("edward-76", reiser4_inode_data(inode) != NULL);
60859 +
60860 +       sd = (reiser4_crypto_stat *) *area;
60861 +       if (!inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
60862 +               /* file is just created */
60863 +               crypto_stat_t * stat = reiser4_inode_data(inode)->crypt;
60864 +
60865 +               assert("edward-15", stat != NULL);
60866 +
60867 +               /* copy inode crypto-stat to the disk stat-data */
60868 +               cputod16(stat->keysize, &sd->keysize);
60869 +               xmemcpy(sd->keyid, stat->keyid, (size_t)dplug->dsize);
60870 +               inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
60871 +       } else {
60872 +               /* do nothing */
60873 +       }
60874 +       *area += (sizeof(*sd) + dplug->dsize);
60875 +       return result;
60876 +}
60877 +/* debugging: printk() key size and the first 8 bytes of key id; skips dsize of the none digest plugin */
60878 +#if REISER4_DEBUG_OUTPUT
60879 +static void
60880 +print_crypto_sd(const char *prefix, char **area /* position in stat-data */ ,
60881 +                int *len /* remaining length */ )
60882 +{
60883 +       /* FIXME-EDWARD Make sure we debug only with none digest plugin */
60884 +       digest_plugin * dplug = digest_plugin_by_id(NONE_DIGEST_ID);
60885 +       reiser4_crypto_stat *sd = (reiser4_crypto_stat *) * area;
60886 +
60887 +       printk("%s: keysize: %u keyid: \"%llx\"\n", prefix, d16tocpu(&sd->keysize), *(__u64 *)(sd->keyid));
60888 +       move_on(len, area, sizeof(*sd) + dplug->dsize);
60889 +}
60890 +#endif
60891 +
60892 +/* cluster stat-data extension */
60893 +/* load cluster shift from the cluster extension at *area into @inode and set REISER4_CLUSTER_KNOWN */
60894 +static int present_cluster_sd(struct inode *inode, char **area, int *len)
60895 +{
60896 +       reiser4_inode * info;
60897 +
60898 +       assert("edward-77", inode != NULL);
60899 +       assert("edward-78", area != NULL);
60900 +       assert("edward-79", *area != NULL);
60901 +       assert("edward-80", len != NULL);
60902 +       assert("edward-81", !inode_get_flag(inode, REISER4_CLUSTER_KNOWN));
60903 +
60904 +       info = reiser4_inode_data(inode);
60905 +
60906 +       assert("edward-82", info != NULL);
60907 +
60908 +       if (*len >= (int) sizeof (reiser4_cluster_stat)) {
60909 +               reiser4_cluster_stat *sd;
60910 +               sd = (reiser4_cluster_stat *) * area;
60911 +               info->cluster_shift = d8tocpu(&sd->cluster_shift);
60912 +               inode_set_flag(inode, REISER4_CLUSTER_KNOWN);
60913 +               move_on(len, area, sizeof *sd);
60914 +               return 0;
60915 +       }
60916 +       else
60917 +               return not_enough_space(inode, "cluster sd");
60918 +}
60919 +/* installed as ->absent method of CLUSTER_STAT: always fails with -EIO */
60920 +static int absent_cluster_sd(struct inode * inode)
60921 +{
60922 +       return -EIO;
60923 +}
60924 +/* space needed to save the cluster extension: always sizeof (reiser4_cluster_stat) */
60925 +static int save_len_cluster_sd(struct inode *inode UNUSED_ARG)
60926 +{
60927 +       return sizeof (reiser4_cluster_stat);
60928 +}
60929 +/* if REISER4_CLUSTER_KNOWN is not set, store inode's cluster shift on disk; always advance *area */
60930 +static int save_cluster_sd(struct inode *inode, char **area)
60931 +{
60932 +       reiser4_cluster_stat *sd;
60933 +
60934 +       assert("edward-106", inode != NULL);
60935 +       assert("edward-107", area != NULL);
60936 +       assert("edward-108", *area != NULL);
60937 +
60938 +       sd = (reiser4_cluster_stat *) * area;
60939 +       if (!inode_get_flag(inode, REISER4_CLUSTER_KNOWN)) {
60940 +               cputod8(reiser4_inode_data(inode)->cluster_shift, &sd->cluster_shift);
60941 +               inode_set_flag(inode, REISER4_CLUSTER_KNOWN);
60942 +       }
60943 +       else {
60944 +               /* do nothing */
60945 +       }
60946 +       *area += sizeof *sd;
60947 +       return 0;
60948 +}
60949 +/* debugging: printk() the cluster shift, then move_on() past the cluster extension */
60950 +#if REISER4_DEBUG_OUTPUT
60951 +static void
60952 +print_cluster_sd(const char *prefix, char **area /* position in stat-data */,
60953 +                int *len /* remaining length */ )
60954 +{
60955 +       reiser4_cluster_stat *sd = (reiser4_cluster_stat *) * area;
60956 +
60957 +       printk("%s: %u\n", prefix, d8tocpu(&sd->cluster_shift));
60958 +       move_on(len, area, sizeof *sd);
60959 +}
60960 +#endif
60961 +/* installed as ->present method of CAPABILITIES_STAT: always fails with -EIO */
60962 +static int eio(struct inode *inode, char **area, int *len)
60963 +{
60964 +       return RETERR(-EIO);
60965 +}
60966 +/* stat-data extension plugins, indexed by extension id; methods are the functions defined above */
60967 +sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = {
60968 +       [LIGHT_WEIGHT_STAT] = {
60969 +                              .h = {
60970 +                                    .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
60971 +                                    .id = LIGHT_WEIGHT_STAT,
60972 +                                    .pops = NULL,
60973 +                                    .label = "light-weight sd",
60974 +                                    .desc = "sd for light-weight files",
60975 +                                    .linkage = TYPE_SAFE_LIST_LINK_ZERO
60976 +                              },
60977 +                              .present = present_lw_sd,
60978 +                              .absent = NULL,
60979 +                              .save_len = save_len_lw_sd,
60980 +                              .save = save_lw_sd,
60981 +#if REISER4_DEBUG_OUTPUT
60982 +                              .print = print_lw_sd,
60983 +#endif
60984 +                              .alignment = 8
60985 +       },
60986 +       [UNIX_STAT] = {
60987 +                      .h = {
60988 +                            .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
60989 +                            .id = UNIX_STAT,
60990 +                            .pops = NULL,
60991 +                            .label = "unix-sd",
60992 +                            .desc = "unix stat-data fields",
60993 +                            .linkage = TYPE_SAFE_LIST_LINK_ZERO
60994 +                      },
60995 +                      .present = present_unix_sd,
60996 +                      .absent = absent_unix_sd,
60997 +                      .save_len = save_len_unix_sd,
60998 +                      .save = save_unix_sd,
60999 +#if REISER4_DEBUG_OUTPUT
61000 +                      .print = print_unix_sd,
61001 +#endif
61002 +                      .alignment = 8
61003 +       },
61004 +       [LARGE_TIMES_STAT] = {
61005 +                      .h = {
61006 +                            .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
61007 +                            .id = LARGE_TIMES_STAT,
61008 +                            .pops = NULL,
61009 +                            .label = "64time-sd",
61010 +                            .desc = "nanosecond resolution for times",
61011 +                            .linkage = TYPE_SAFE_LIST_LINK_ZERO
61012 +                      },
61013 +                      .present = present_large_times_sd,
61014 +                      .absent = NULL,
61015 +                      .save_len = save_len_large_times_sd,
61016 +                      .save = save_large_times_sd,
61017 +#if REISER4_DEBUG_OUTPUT
61018 +                      .print = print_large_times_sd,
61019 +#endif
61020 +                      .alignment = 8
61021 +       },
61022 +       [SYMLINK_STAT] = {
61023 +                         /* stat data of symlink has this extension */
61024 +                         .h = {
61025 +                               .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
61026 +                               .id = SYMLINK_STAT,
61027 +                               .pops = NULL,
61028 +                               .label = "symlink-sd",
61029 +                               .desc = "stat data is appended with symlink name",
61030 +                               .linkage = TYPE_SAFE_LIST_LINK_ZERO
61031 +                         },
61032 +                         .present = present_symlink_sd,
61033 +                         .absent = NULL,
61034 +                         .save_len = save_len_symlink_sd,
61035 +                         .save = save_symlink_sd,
61036 +#if REISER4_DEBUG_OUTPUT
61037 +                         .print = print_symlink_sd,
61038 +#endif
61039 +                         .alignment = 8
61040 +       },
61041 +       [PLUGIN_STAT] = {
61042 +                        .h = {
61043 +                              .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
61044 +                              .id = PLUGIN_STAT,
61045 +                              .pops = NULL,
61046 +                              .label = "plugin-sd",
61047 +                              .desc = "plugin stat-data fields",
61048 +                              .linkage = TYPE_SAFE_LIST_LINK_ZERO
61049 +                        },
61050 +                        .present = present_plugin_sd,
61051 +                        .absent = absent_plugin_sd,
61052 +                        .save_len = save_len_plugin_sd,
61053 +                        .save = save_plugin_sd,
61054 +#if REISER4_DEBUG_OUTPUT
61055 +                        .print = NULL,
61056 +#endif
61057 +                        .alignment = 8
61058 +       },
61059 +       [FLAGS_STAT] = {
61060 +                               .h = {
61061 +                                     .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
61062 +                                     .id = FLAGS_STAT,
61063 +                                     .pops = NULL,
61064 +                                     .label = "flags-sd",
61065 +                                     .desc = "inode bit flags",
61066 +                                     .linkage = TYPE_SAFE_LIST_LINK_ZERO}
61067 +                               ,
61068 +                               .present = present_flags_sd,
61069 +                               .absent = NULL,
61070 +                               .save_len = save_len_flags_sd,
61071 +                               .save = save_flags_sd,
61072 +#if REISER4_DEBUG_OUTPUT
61073 +                               .print = NULL,
61074 +#endif
61075 +                               .alignment = 8
61076 +       },
61077 +       [CAPABILITIES_STAT] = {
61078 +                               .h = {
61079 +                                     .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
61080 +                                     .id = CAPABILITIES_STAT,
61081 +                                     .pops = NULL,
61082 +                                     .label = "capabilities-sd",
61083 +                                     .desc = "capabilities",
61084 +                                     .linkage = TYPE_SAFE_LIST_LINK_ZERO
61085 +                               },
61086 +                               .present = eio,
61087 +                               .absent = NULL,
61088 +                               .save_len = save_len_flags_sd, /* NOTE(review): flags-sd save methods reused here; confirm intended */
61089 +                               .save = save_flags_sd,
61090 +#if REISER4_DEBUG_OUTPUT
61091 +                               .print = NULL,
61092 +#endif
61093 +                               .alignment = 8
61094 +       },
61095 +       [CLUSTER_STAT] = {
61096 +                               .h = {
61097 +                                     .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
61098 +                                     .id = CLUSTER_STAT,
61099 +                                     .pops = NULL,
61100 +                                     .label = "cluster-sd",
61101 +                                     .desc = "cluster shift",
61102 +                                     .linkage = TYPE_SAFE_LIST_LINK_ZERO}
61103 +                               ,
61104 +                               .present = present_cluster_sd,
61105 +                               .absent = absent_cluster_sd,
61106 +                               /* absent_cluster_sd() returns -EIO */
61107 +                               .save_len = save_len_cluster_sd,
61108 +                               .save = save_cluster_sd,
61109 +#if REISER4_DEBUG_OUTPUT
61110 +                               .print = print_cluster_sd,
61111 +#endif
61112 +                               .alignment = 8
61113 +       },
61114 +       [CRYPTO_STAT] = {
61115 +                               .h = {
61116 +                                     .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
61117 +                                     .id = CRYPTO_STAT,
61118 +                                     .pops = NULL,
61119 +                                     .label = "crypto-sd",
61120 +                                     .desc = "secret key size and id",
61121 +                                     .linkage = TYPE_SAFE_LIST_LINK_ZERO}
61122 +                               ,
61123 +                               .present = present_crypto_sd,
61124 +                               .absent = absent_crypto_sd,
61125 +                               /* absent_crypto_sd() returns -EIO */
61126 +                               .save_len = save_len_crypto_sd,
61127 +                               .save = save_crypto_sd,
61128 +#if REISER4_DEBUG_OUTPUT
61129 +                               .print = print_crypto_sd,
61130 +#endif
61131 +                               .alignment = 8
61132 +       }
61133 +};
61134 +
61135 +/* Make Linus happy.
61136 +   Local variables:
61137 +   c-indentation-style: "K&R"
61138 +   mode-name: "LC"
61139 +   c-basic-offset: 8
61140 +   tab-width: 8
61141 +   fill-column: 120
61142 +   End:
61143 +*/
61144 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/static_stat.h linux-2.6.8-rc3-a/fs/reiser4/plugin/item/static_stat.h
61145 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/static_stat.h        1970-01-01 03:00:00.000000000 +0300
61146 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/static_stat.h      2004-08-05 21:20:53.093658319 +0400
61147 @@ -0,0 +1,220 @@
61148 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61149 +
61150 +/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
61151 +
61152 +In the case where each file has not less than the fields needed by the
61153 +stat() syscall, it is more compact to store those fields in this
61154 +struct.
61155 +
61156 +If this item does not exist, then all stats are dynamically resolved.
61157 +At the moment, we either resolve all stats dynamically or all of them
61158 +statically.  If you think this is not fully optimal, and the rest of
61159 +reiser4 is working, then fix it...:-)
61160 +
61161 +*/
61162 +
61163 +#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
61164 +#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
61165 +
61166 +#include "../../forward.h"
61167 +#include "../../dformat.h"
61168 +
61169 +#include <linux/fs.h>          /* for struct inode */
61170 +
61171 +/* Stat data layout: goals and implementation.
61172 +
61173 +We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
61174 +them, including not having semantic metadata attached to them.
61175 +
61176 +There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically sized structure because the statically sized structure knows without recording it what the names and lengths of the attributes are.
61177 +
61178 +This leads to a natural compromise, which is to special case those files which have simply the standard unix file
61179 +attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix file
61180 +in their use of file attributes.
61181 +
61182 +Yet this compromise deserves to be compromised a little.
61183 +
61184 +We accommodate the case where you have no more than the standard unix file attributes by using an "extension bitmask":
61185 +each bit in it indicates presence or absence of a particular stat data extension (see sd_ext_bits enum).
61186 +
61187 +  If the first
61188 +bit of the extension bitmask is 0, we have a light-weight file whose attributes are either inherited from the parent
61189 +directory (as uid, gid) or initialised to some sane values.
61190 +
61191 +   To capitalize on existing code infrastructure, extensions are
61192 +   implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
61193 +   Each stat-data extension plugin implements four methods:
61194 +
61195 +    ->present() called by sd_load() when this extension is found in stat-data
61196 +    ->absent() called by sd_load() when this extension is not found in stat-data
61197 +    ->save_len() called by sd_len() to calculate total length of stat-data
61198 +    ->save() called by sd_save() to store extension data into stat-data
61199 +
61200 +    Implementation is in fs/reiser4/plugin/item/static_stat.c
61201 +*/
61202 +
61203 +/* stat-data extension. Please order this by presumed frequency of use */
61204 +typedef enum {
61205 +       /* support for light-weight files */
61206 +       LIGHT_WEIGHT_STAT,
61207 +       /* data required to implement unix stat(2) call. Layout is in
61208 +           reiser4_unix_stat. If this is not present, file is light-weight */
61209 +       UNIX_STAT,
61210 +       /* this contains additional set of 32bit [amc]time fields to implement
61211 +          nanosecond resolution. Layout is in reiser4_large_times_stat. Usage
61212 +          of this extension is governed by 32bittimes mount option. */
61213 +       LARGE_TIMES_STAT,
61214 +       /* stat data has link name included */
61215 +       SYMLINK_STAT,
61216 +       /* if this is present, file is controlled by non-standard
61217 +           plugin (that is, plugin that cannot be deduced from file
61218 +           mode bits), for example, aggregation, interpolation etc. */
61219 +       PLUGIN_STAT,
61220 +       /* this extension contains persistent inode flags. These flags are
61221 +          single bits: immutable, append-only, etc. Layout is in
61222 +          reiser4_flags_stat. */
61223 +       FLAGS_STAT,
61224 +       /* this extension contains capabilities sets, associated with this
61225 +           file. Layout is in reiser4_capabilities_stat */
61226 +       CAPABILITIES_STAT,
61227 +        /* this extension contains the information about minimal unit size for
61228 +          file data processing. Layout is in reiser4_cluster_stat */
61229 +       CLUSTER_STAT,
61230 +       /* this extension contains size and public id of the secret key.
61231 +          Layout is in reiser4_crypto_stat */
61232 +       CRYPTO_STAT,
61233 +       LAST_SD_EXTENSION,
61234 +       /*
61235 +        * init_inode_static_sd() iterates over extension mask until all
61236 +        * non-zero bits are processed. This means that neither ->present()
61237 +        * nor ->absent() methods will be called for stat-data extensions that
61238 +        * go after last present extension. But for some basic extensions we
61239 +        * want either ->absent() or ->present() method to be called, because
61240 +        * these extensions set up something in inode even when they are not
61241 +        * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all
61242 +        * extensions before and including LAST_IMPORTANT_SD_EXTENSION either
61243 +        * ->present(), or ->absent() method will be called, independently of
61244 +        * what other extensions are present.
61245 +        */
61246 +       LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT,
61247 +} sd_ext_bits;
61248 +
61249 +/* minimal stat-data. This allows to support light-weight files. */
61250 +typedef struct reiser4_stat_data_base {
61251 +       /*  0 */ d16 extmask;
61252 +       /*  2 */
61253 +} PACKED reiser4_stat_data_base;
61254 +
61255 +typedef struct reiser4_light_weight_stat {
61256 +       /*  0 */ d16 mode;
61257 +       /*  2 */ d32 nlink;
61258 +       /*  6 */ d64 size;
61259 +       /* size in bytes */
61260 +       /* 14 */
61261 +} PACKED reiser4_light_weight_stat;
61262 +
61263 +typedef struct reiser4_unix_stat {
61264 +       /* owner id */
61265 +       /*  0 */ d32 uid;
61266 +       /* group id */
61267 +       /*  4 */ d32 gid;
61268 +       /* access time */
61269 +       /*  8 */ d32 atime;
61270 +       /* modification time */
61271 +       /* 12 */ d32 mtime;
61272 +       /* change time */
61273 +       /* 16 */ d32 ctime;
61274 +       union {
61275 +       /* minor:major for device files */
61276 +       /* 20 */         d64 rdev;
61277 +       /* bytes used by file */
61278 +       /* 20 */         d64 bytes;
61279 +       } u;
61280 +       /* 28 */
61281 +} PACKED reiser4_unix_stat;
61282 +
61283 +/* symlink stored as part of inode */
61284 +typedef struct reiser4_symlink_stat {
61285 +       char body[0];
61286 +} PACKED reiser4_symlink_stat;
61287 +
61288 +typedef struct reiser4_plugin_slot {
61289 +       /*  0 */ d16 pset_memb;
61290 +       /*  2 */ d16 id;
61291 +/*  4 *//* here plugin stores its persistent state */
61292 +} PACKED reiser4_plugin_slot;
61293 +
61294 +/* stat-data extension for files with non-standard plugin. */
61295 +typedef struct reiser4_plugin_stat {
61296 +       /* number of additional plugins, associated with this object */
61297 +       /*  0 */ d16 plugins_no;
61298 +       /*  2 */ reiser4_plugin_slot slot[0];
61299 +       /*  2 */
61300 +} PACKED reiser4_plugin_stat;
61301 +
61302 +/* stat-data extension for inode flags. Currently it is just fixed-width 32
61303 + * bit mask. If need arise, this can be replaced with variable width
61304 + * bitmask. */
61305 +typedef struct reiser4_flags_stat {
61306 +       /*  0 */ d32 flags;
61307 +       /*  4 */
61308 +} PACKED reiser4_flags_stat;
61309 +
61310 +typedef struct reiser4_capabilities_stat {
61311 +       /*  0 */ d32 effective;
61312 +       /*  4 */ d32 permitted;
61313 +       /*  8 */
61314 +} PACKED reiser4_capabilities_stat;
61315 +
61316 +typedef struct reiser4_cluster_stat {
61317 +/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster shift */
61318 +       /* 0 */ d8 cluster_shift;
61319 +       /* 1 */
61320 +} PACKED reiser4_cluster_stat;
61321 +
61322 +typedef struct reiser4_crypto_stat {
61323 +       /* secret key size, bits */
61324 +       /*  0 */ d16 keysize;
61325 +       /* secret key id */
61326 +       /*  2 */ d8 keyid[0];
61327 +       /* 2 */
61328 +} PACKED reiser4_crypto_stat;
61329 +
61330 +typedef struct reiser4_large_times_stat {
61331 +       /* access time */
61332 +       /*  0 */ d32 atime;
61333 +       /* modification time */
61334 +       /*  4 */ d32 mtime;
61335 +       /* change time */
61336 +       /*  8 */ d32 ctime;
61337 +       /* 12 */
61338 +} PACKED reiser4_large_times_stat;
61339 +
61340 +/* this structure is filled by sd_item_stat */
61341 +typedef struct sd_stat {
61342 +       int dirs;
61343 +       int files;
61344 +       int others;
61345 +} sd_stat;
61346 +
61347 +/* plugin->item.common.* */
61348 +extern void print_sd(const char *prefix, coord_t * coord);
61349 +extern void item_stat_static_sd(const coord_t * coord, void *vp);
61350 +
61351 +/* plugin->item.s.sd.* */
61352 +extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
61353 +extern int save_len_static_sd(struct inode *inode);
61354 +extern int save_static_sd(struct inode *inode, char **area);
61355 +
61356 +/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
61357 +#endif
61358 +
61359 +/* Make Linus happy.
61360 +   Local variables:
61361 +   c-indentation-style: "K&R"
61362 +   mode-name: "LC"
61363 +   c-basic-offset: 8
61364 +   tab-width: 8
61365 +   fill-column: 120
61366 +   End:
61367 +*/
61368 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/tail.c linux-2.6.8-rc3-a/fs/reiser4/plugin/item/tail.c
61369 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/tail.c       1970-01-01 03:00:00.000000000 +0300
61370 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/tail.c     2004-08-05 21:20:53.036670340 +0400
61371 @@ -0,0 +1,690 @@
61372 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61373 +
61374 +#include "item.h"
61375 +#include "../../inode.h"
61376 +#include "../../page_cache.h"
61377 +#include "../../carry.h"
61378 +
61379 +#include <linux/quotaops.h>
61380 +#include <asm/uaccess.h>
61381 +#include <linux/swap.h>
61382 +#include <linux/writeback.h>
61383 +
61384 +/* plugin->u.item.b.max_key_inside: store in @key the item's key with its offset set to the offset of max_key() */
61385 +reiser4_internal reiser4_key *
61386 +max_key_inside_tail(const coord_t *coord, reiser4_key *key)
61387 +{
61388 +       item_key_by_coord(coord, key);
61389 +       set_key_offset(key, get_key_offset(max_key()));
61390 +       return key;
61391 +}
61392 +
61393 +/* plugin->u.item.b.can_contain_key: 1 if @data->iplug is this item's plugin and @key has its locality and objectid */
61394 +reiser4_internal int
61395 +can_contain_key_tail(const coord_t *coord, const reiser4_key *key, const reiser4_item_data *data)
61396 +{
61397 +       reiser4_key item_key;
61398 +
61399 +       if (item_plugin_by_coord(coord) != data->iplug)
61400 +               return 0;
61401 +
61402 +       item_key_by_coord(coord, &item_key);
61403 +       if (get_key_locality(key) != get_key_locality(&item_key) ||
61404 +           get_key_objectid(key) != get_key_objectid(&item_key)) return 0;
61405 +
61406 +       return 1;
61407 +}
61408 +
61409 +/* plugin->u.item.b.mergeable
61410 +   first item (@p1) is of tail type; returns 1 iff @p2 is a tail of the same object that starts where @p1 ends */
61411 +/* Audited by: green(2002.06.14) */
61412 +reiser4_internal int
61413 +mergeable_tail(const coord_t *p1, const coord_t *p2)
61414 +{
61415 +       reiser4_key key1, key2;
61416 +
61417 +       assert("vs-535", item_type_by_coord(p1) == UNIX_FILE_METADATA_ITEM_TYPE);
61418 +       assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
61419 +
61420 +       if (item_id_by_coord(p2) != FORMATTING_ID) {
61421 +               /* second item is of another type */
61422 +               return 0;
61423 +       }
61424 +
61425 +       item_key_by_coord(p1, &key1);
61426 +       item_key_by_coord(p2, &key2);
61427 +       if (get_key_locality(&key1) != get_key_locality(&key2) ||
61428 +           get_key_objectid(&key1) != get_key_objectid(&key2) || get_key_type(&key1) != get_key_type(&key2)) {
61429 +               /* items of different objects (or of different key types) */
61430 +               return 0;
61431 +       }
61432 +       if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) {
61433 +               /* not adjacent items: @p2 does not start at @p1's offset plus @p1's length */
61434 +               return 0;
61435 +       }
61436 +       return 1;
61437 +}
61438 +
61439 +reiser4_internal void show_tail(struct seq_file *m, coord_t *coord)
61440 +{
61441 +       seq_printf(m, "length: %i", item_length_by_coord(coord));
61442 +}
61443 +
61444 +/* plugin->u.item.b.print
61445 +   plugin->u.item.b.check */
61446 +
61447 +/* plugin->u.item.b.nr_units: the unit count of a tail item is its length as given by item_length_by_coord() */
61448 +reiser4_internal pos_in_node_t
61449 +nr_units_tail(const coord_t *coord)
61450 +{
61451 +       return item_length_by_coord(coord);
61452 +}
61453 +
61454 +/* plugin->u.item.b.lookup: set @coord to the unit holding @key's offset, or after the last unit if it is not here */
61455 +reiser4_internal lookup_result
61456 +lookup_tail(const reiser4_key *key, lookup_bias bias, coord_t *coord)
61457 +{
61458 +       reiser4_key item_key;
61459 +       __u64 lookuped, offset;
61460 +       unsigned nr_units;
61461 +
61462 +       item_key_by_coord(coord, &item_key);
61463 +       offset = get_key_offset(item_key_by_coord(coord, &item_key));
61464 +       nr_units = nr_units_tail(coord);
61465 +
61466 +       /* key we are looking for must be greater than key of item @coord */
61467 +       assert("vs-416", keygt(key, &item_key));
61468 +
61469 +       /* offset we are looking for */
61470 +       lookuped = get_key_offset(key);
61471 +
61472 +       if (lookuped >= offset && lookuped < offset + nr_units) {
61473 +               /* byte we are looking for is in this item */
61474 +               coord->unit_pos = lookuped - offset;
61475 +               coord->between = AT_UNIT;
61476 +               return CBK_COORD_FOUND;
61477 +       }
61478 +
61479 +       /* set coord after last unit; only the FIND_MAX_NOT_MORE_THAN bias reports this position as found */
61480 +       coord->unit_pos = nr_units - 1;
61481 +       coord->between = AFTER_UNIT;
61482 +       return bias == FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
61483 +}
61484 +
61485 +/* plugin->u.item.b.paste: fill the @data->length new bytes of an already resized item; 0 or -EFAULT is returned */
61486 +reiser4_internal int
61487 +paste_tail(coord_t *coord, reiser4_item_data *data, carry_plugin_info *info UNUSED_ARG)
61488 +{
61489 +       unsigned old_item_length;
61490 +       char *item;
61491 +
61492 +       /* length the item had before resizing has been performed */
61493 +       old_item_length = item_length_by_coord(coord) - data->length;
61494 +
61495 +       /* tail items never get pasted in the middle */
61496 +       assert("vs-363",
61497 +              (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
61498 +              (coord->unit_pos == old_item_length - 1 &&
61499 +               coord->between == AFTER_UNIT) ||
61500 +              (coord->unit_pos == 0 && old_item_length == 0 && coord->between == AT_UNIT));
61501 +
61502 +       item = item_body_by_coord(coord);
61503 +       if (coord->unit_pos == 0)
61504 +               /* make space for pasted data when pasting at the beginning of
61505 +                  the item */
61506 +               xmemmove(item + data->length, item, old_item_length);
61507 +
61508 +       if (coord->between == AFTER_UNIT)
61509 +               coord->unit_pos++;
61510 +
61511 +       if (data->data) {
61512 +               assert("vs-554", data->user == 0 || data->user == 1);
61513 +               if (data->user) {
61514 +                       assert("nikita-3035", schedulable());
61515 +                       /* copy from user space; a fault is reported to the caller instead of being ignored */
61516 +                       if (__copy_from_user(item + coord->unit_pos, data->data, (unsigned) data->length))
61517 +                               return RETERR(-EFAULT);
61518 +               } else
61519 +                       /* copy from kernel space */
61520 +                       xmemcpy(item + coord->unit_pos, data->data, (unsigned) data->length);
61521 +       } else {
61522 +               xmemset(item + coord->unit_pos, 0, (unsigned) data->length);
61523 +       }
61524 +       return 0;
61525 +}
61526 +
61527 +/* plugin->u.item.b.fast_paste */
61528 +
61529 +/* plugin->u.item.b.can_shift
61530 +   number of units is returned via return value, number of bytes via @size. For
61531 +   tail items they coincide: both are min(@want, @free_space) */
61532 +reiser4_internal int
61533 +can_shift_tail(unsigned free_space, coord_t *source UNUSED_ARG,
61534 +              znode *target UNUSED_ARG, shift_direction direction UNUSED_ARG, unsigned *size, unsigned want)
61535 +{
61536 +       /* make sure that we do not want to shift more than we have */
61537 +       assert("vs-364", want > 0 && want <= (unsigned) item_length_by_coord(source));
61538 +
61539 +       *size = min(want, free_space);
61540 +       return *size;
61541 +}
61542 +
61543 +/* plugin->u.item.b.copy_units: copy @count bytes of @source, starting at @from, into the already expanded @target */
61544 +reiser4_internal void
61545 +copy_units_tail(coord_t *target, coord_t *source,
61546 +               unsigned from, unsigned count, shift_direction where_is_free_space, unsigned free_space UNUSED_ARG)
61547 +{
61548 +       /* make sure that item @target is expanded already */
61549 +       assert("vs-366", (unsigned) item_length_by_coord(target) >= count);
61550 +       assert("vs-370", free_space >= count);
61551 +
61552 +       if (where_is_free_space == SHIFT_LEFT) {
61553 +               /* append item @target with @count first bytes of @source */
61554 +               assert("vs-365", from == 0);
61555 +
61556 +               xmemcpy((char *) item_body_by_coord(target) +
61557 +                       item_length_by_coord(target) - count, (char *) item_body_by_coord(source), count);
61558 +       } else {
61559 +               /* target item is moved to right already */
61560 +               reiser4_key key;
61561 +
61562 +               assert("vs-367", (unsigned) item_length_by_coord(source) == from + count);
61563 +
61564 +               xmemcpy((char *) item_body_by_coord(target), (char *) item_body_by_coord(source) + from, count);
61565 +
61566 +               /* new units are inserted before first unit in an item,
61567 +                  therefore, we have to update item key */
61568 +               item_key_by_coord(source, &key);
61569 +               set_key_offset(&key, get_key_offset(&key) + from);
61570 +
61571 +               node_plugin_by_node(target->node)->update_item_key(target, &key, 0 /*info */);
61572 +       }
61573 +}
61574 +
61575 +/* plugin->u.item.b.create_hook */
61576 +
61577 +
61578 +/* item_plugin->b.kill_hook
61579 +   this is called when @count units starting from @from-th one are going to be removed;
61580 +   it passes (item key offset + @from) and that value + @count to fake_kill_hook_tail() as start and end; returns 0 */
61581 +reiser4_internal int
61582 +kill_hook_tail(const coord_t *coord, pos_in_node_t from,
61583 +              pos_in_node_t count, struct carry_kill_data *kdata)
61584 +{
61585 +       reiser4_key key;
61586 +       loff_t start, end;
61587 +
61588 +       assert("vs-1577", kdata);
61589 +       assert("vs-1579", kdata->inode);
61590 +
61591 +       item_key_by_coord(coord, &key);
61592 +       start = get_key_offset(&key) + from;
61593 +       end = start + count;
61594 +       fake_kill_hook_tail(kdata->inode, start, end);
61595 +       return 0;
61596 +}
61597 +
61598 +/* plugin->u.item.b.shift_hook */
61599 +
61600 +/* helper for kill_units_tail and cut_units_tail: reports the affected keys and returns the number of units removed */
61601 +static int
61602 +do_cut_or_kill(coord_t *coord, pos_in_node_t from, pos_in_node_t to,
61603 +              reiser4_key *smallest_removed, reiser4_key *new_first)
61604 +{
61605 +       pos_in_node_t count;
61606 +
61607 +       /* this method is only called to remove part of item */
61608 +       assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
61609 +       /* tail items are never cut from the middle of an item */
61610 +       assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
61611 +       assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
61612 +
61613 +       count = to - from + 1;
61614 +
61615 +       if (smallest_removed) {
61616 +               /* store smallest key removed */
61617 +               item_key_by_coord(coord, smallest_removed);
61618 +               set_key_offset(smallest_removed, get_key_offset(smallest_removed) + from);
61619 +       }
61620 +       if (new_first) {
61621 +               /* head of item is cut */
61622 +               assert("vs-1529", from == 0);
61623 +
61624 +               item_key_by_coord(coord, new_first);
61625 +               set_key_offset(new_first, get_key_offset(new_first) + from + count);
61626 +       }
61627 +
61628 +       if (REISER4_DEBUG)
61629 +               xmemset((char *) item_body_by_coord(coord) + from, 0, count);
61630 +       return count;
61631 +}
61632 +
61633 +/* plugin->u.item.b.cut_units: forwards to do_cut_or_kill(); @cdata is not used */
61634 +reiser4_internal int
61635 +cut_units_tail(coord_t *coord, pos_in_node_t from, pos_in_node_t to,
61636 +              struct carry_cut_data *cdata UNUSED_ARG, reiser4_key *smallest_removed, reiser4_key *new_first)
61637 +{
61638 +       return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
61639 +}
61640 +
61641 +/* plugin->u.item.b.kill_units: runs kill_hook_tail() on units @from..@to, then forwards to do_cut_or_kill() */
61642 +reiser4_internal int
61643 +kill_units_tail(coord_t *coord, pos_in_node_t from, pos_in_node_t to,
61644 +               struct carry_kill_data *kdata, reiser4_key *smallest_removed, reiser4_key *new_first)
61645 +{
61646 +       kill_hook_tail(coord, from, to - from + 1, kdata);
61647 +       return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
61648 +}
61649 +
61650 +/* plugin->u.item.b.unit_key: store in @key the item's key with coord->unit_pos added to its offset; returns @key */
61651 +reiser4_internal reiser4_key *
61652 +unit_key_tail(const coord_t *coord, reiser4_key *key)
61653 +{
61654 +       assert("vs-375", coord_is_existing_unit(coord));
61655 +
61656 +       item_key_by_coord(coord, key);
61657 +       set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
61658 +
61659 +       return key;
61660 +}
61661 +
61662 +/* plugin->u.item.b.estimate
61663 +   plugin->u.item.b.item_data_by_flow */
61664 +
61665 +/* overwrite tail item or its part with user data; returns 0, or -EFAULT if the copy from user space fails */
61666 +static int
61667 +overwrite_tail(coord_t *coord, flow_t *f)
61668 +{
61669 +       unsigned count;
61670 +
61671 +       assert("vs-570", f->user == 1);
61672 +       assert("vs-946", f->data);
61673 +       assert("vs-947", coord_is_existing_unit(coord));
61674 +       assert("vs-948", znode_is_write_locked(coord->node));
61675 +       assert("nikita-3036", schedulable());
61676 +
61677 +       count = item_length_by_coord(coord) - coord->unit_pos;
61678 +       if (count > f->length)
61679 +               count = f->length;
61680 +
61681 +       if (__copy_from_user((char *) item_body_by_coord(coord) + coord->unit_pos, f->data, count))
61682 +               return RETERR(-EFAULT);
61683 +
61684 +       znode_make_dirty(coord->node);
61685 +
61686 +       move_flow_forward(f, count);
61687 +       return 0;
61688 +}
61689 +
61690 +/* tail readpage function: copies tail item bodies into @page. It is called from readpage_tail(). */
61691 +reiser4_internal int do_readpage_tail(uf_coord_t *uf_coord, struct page *page) {
61692 +       tap_t tap;
61693 +       int result;
61694 +       coord_t coord;
61695 +       lock_handle lh;
61696 +
61697 +       int count, mapped;
61698 +       struct inode *inode;
61699 +
61700 +       /* working on copies of the passed coord and lock handle so that the tap does not move the caller's ones. */
61701 +       init_lh(&lh);
61702 +       copy_lh(&lh, uf_coord->lh);
61703 +       inode = page->mapping->host;
61704 +       coord_dup(&coord, &uf_coord->base_coord);
61705 +
61706 +       tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
61707 +
61708 +       if ((result = tap_load(&tap)))
61709 +               goto out_tap_done;
61710 +
61711 +       /* lookup until page is filled up. */
61712 +       for (mapped = 0; mapped < PAGE_CACHE_SIZE; mapped += count) {
61713 +               void *pagedata;
61714 +
61715 +               /* number of bytes to be copied to page. */
61716 +               count = item_length_by_coord(&coord) - coord.unit_pos;
61717 +
61718 +               if (count > PAGE_CACHE_SIZE - mapped)
61719 +                       count = PAGE_CACHE_SIZE - mapped;
61720 +
61721 +               /* attaching @page to address space and getting data address. */
61722 +               pagedata = kmap_atomic(page, KM_USER0);
61723 +
61724 +               /* copying tail body to page. */
61725 +               xmemcpy((char *)(pagedata + mapped),
61726 +                       ((char *)item_body_by_coord(&coord) + coord.unit_pos), count);
61727 +
61728 +               flush_dcache_page(page);
61729 +
61730 +               /* unmapping; kunmap_atomic() takes the address returned by kmap_atomic(), not the page. */
61731 +               kunmap_atomic(pagedata, KM_USER0);
61732 +
61733 +               /* Getting next tail item. */
61734 +               if (mapped + count < PAGE_CACHE_SIZE) {
61735 +
61736 +                       /* unlocking page in order to avoid keeping it locked during tree lookup,
61737 +                          which takes long term locks. */
61738 +                       unlock_page(page);
61739 +
61740 +                       /* getting right neighbour. */
61741 +                       result = go_dir_el(&tap, RIGHT_SIDE, 0);
61742 +
61743 +                       /* lock page back */
61744 +                       lock_page(page);
61745 +
61746 +                       /* page is uptodate because another thread made it up to date. Getting
61747 +                          out of here. */
61748 +                       if (PageUptodate(page)) {
61749 +                               result = 0;
61750 +                               goto out_unlock_page;
61751 +                       }
61752 +
61753 +                       if (result) {
61754 +                               /* check if there is no neighbour node. */
61755 +                               if (result == -E_NO_NEIGHBOR) {
61756 +                                       result = 0;
61757 +                                       goto out_update_page;
61758 +                               } else {
61759 +                                       goto out_tap_relse;
61760 +                               }
61761 +                       } else {
61762 +                               /* check if found coord is not owned by file. */
61763 +                               if (!inode_file_plugin(inode)->owns_item(inode, &coord)) {
61764 +                                       result = 0;
61765 +                                       goto out_update_page;
61766 +                               }
61767 +                       }
61768 +               }
61769 +       }
61770 +
61771 +       /* making page up to date and releasing it. */
61772 +       SetPageUptodate(page);
61773 +       unlock_page(page);
61774 +
61775 +       /* releasing tap */
61776 +       tap_relse(&tap);
61777 +       tap_done(&tap);
61778 +
61779 +       return 0;
61780 +
61781 + out_update_page:
61782 +       SetPageUptodate(page);
61783 + out_unlock_page:
61784 +       unlock_page(page);
61785 + out_tap_relse:
61786 +       tap_relse(&tap);
61787 + out_tap_done:
61788 +       tap_done(&tap);
61789 +       return result;
61790 +}
61791 +
61792 +/*
61793 +   plugin->s.file.readpage
61794 +   reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
61795 +   or
61796 +   filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_tail
61797 +
61798 +   At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to existing unit inside of tail
61799 +   item. These preconditions are only asserted here; the work is done by do_readpage_tail(). */
61800 +reiser4_internal int
61801 +readpage_tail(void *vp, struct page *page)
61802 +{
61803 +       uf_coord_t *uf_coord = vp;
61804 +       ON_DEBUG(coord_t *coord = &uf_coord->base_coord);
61805 +       ON_DEBUG(reiser4_key key);
61806 +
61807 +       assert("umka-2515", PageLocked(page));
61808 +       assert("umka-2516", !PageUptodate(page));
61809 +       assert("umka-2517", !jprivate(page) && !PagePrivate(page));
61810 +       assert("umka-2518", page->mapping && page->mapping->host);
61811 +
61812 +       assert("umka-2519", znode_is_loaded(coord->node));
61813 +       assert("umka-2520", item_is_tail(coord));
61814 +       assert("umka-2521", coord_is_existing_unit(coord));
61815 +       assert("umka-2522", znode_is_rlocked(coord->node));
61816 +       assert("umka-2523", page->mapping->host->i_ino == get_key_objectid(item_key_by_coord(coord, &key)));
61817 +
61818 +       return do_readpage_tail(uf_coord, page);
61819 +}
61820 +
61821 +reiser4_internal int
61822 +item_balance_dirty_pages(struct address_space *mapping, const flow_t *f,
61823 +                        hint_t *hint, int back_to_dirty, int do_set_hint)
61824 +{
61825 +       int result;
61826 +       struct inode *inode;
61827 +
61828 +       if (do_set_hint) {
61829 +               if (hint->coord.valid)
61830 +                       set_hint(hint, &f->key, ZNODE_WRITE_LOCK);
61831 +               else
61832 +                       unset_hint(hint);
61833 +               longterm_unlock_znode(hint->coord.lh);
61834 +       }
61835 +
61836 +       inode = mapping->host;
61837 +       if (get_key_offset(&f->key) > inode->i_size) {
61838 +               assert("vs-1649", f->user == 1);
61839 +               INODE_SET_FIELD(inode, i_size, get_key_offset(&f->key));
61840 +       }
61841 +       if (f->user != 0) {
61842 +               /* this was writing data from user space. Update timestamps, therefore. Otherwise, this is tail
61843 +                  conversion where we should not update timestamps */
61844 +               inode->i_ctime = inode->i_mtime = CURRENT_TIME;
61845 +               result = reiser4_update_sd(inode);
61846 +               if (result)
61847 +                       return result;
61848 +       }
61849 +
61850 +       /* FIXME-VS: this is temporary: the problem is that bdp takes inodes
61851 +          from sb's dirty list and it looks like nobody puts there inodes of
61852 +          files which are built of tails */
61853 +       if (back_to_dirty)
61854 +               move_inode_out_from_sync_inodes_loop(mapping);
61855 +
61856 +       balance_dirty_pages_ratelimited(inode->i_mapping);
61857 +       return hint_validate(hint, &f->key, 0/* do not check key */, ZNODE_WRITE_LOCK);
61858 +}
61859 +
61860 +/* drop longterm znode lock before calling balance_dirty_pages. balance_dirty_pages may cause transaction to close,
61861 +   therefore we have to update stat data if necessary */
61862 +static int formatting_balance_dirty_pages(struct address_space *mapping, const flow_t *f,
61863 +                                   hint_t *hint)
61864 +{
61865 +       return item_balance_dirty_pages(mapping, f, hint, 1, 1/* set hint */);
61866 +}
61867 +
61868 +/* grab the number of blocks which can be dirtied/added when flow is inserted and stat data gets updated.
61869 +   FIXME-VS: we may want to call grab_space with BA_CAN_COMMIT flag but that would require all that complexity with
61870 +   sealing coord, releasing long term lock and validating seal later. Returns what reiser4_grab_space returns */
61871 +static int insert_flow_reserve(reiser4_tree *tree)
61872 +{
61873 +       /* both estimates are grabbed with one call; second argument is 0, i.e. presumably no BA_* flags are set */
61874 +       grab_space_enable();
61875 +       return reiser4_grab_space(estimate_one_insert_into_item(tree) + estimate_insert_flow(tree->height), 0);
61876 +}
61877 +
61878 +/* grab space for overwrite: one block gets overwritten and stat data may get updated */
61879 +static int overwrite_reserve(reiser4_tree *tree)
61880 +{
61881 +       grab_space_enable();
61882 +       /* the overwritten block itself plus the estimate for one insertion into an item */
61883 +       return reiser4_grab_space(estimate_one_insert_into_item(tree) + 1, 0);
61884 +}
61885 +
61886 +/* plugin->u.item.s.file.write: access to data stored in tails goes directly through formatted nodes. Data of @f are
61887 +   written while @f->length is not 0 and coord of @hint is valid. Returns 0 or error code of the step which failed */
61888 +reiser4_internal int
61889 +write_tail(struct inode *inode, flow_t *f, hint_t *hint,
61890 +          int grabbed, /* tail's write may be called from plain unix file write and from tail conversion. In first
61891 +                          case (grabbed == 0) space is not reserved beforehand, so, it must be done here. When it is
61892 +                          being called from tail conversion - space is reserved already for whole operation which may
61893 +                          involve several calls to item write. In this case space reservation will not be done here */
61894 +          write_mode_t mode)
61895 +{
61896 +       int result;
61897 +       coord_t *coord;
61898 +
61899 +       assert("vs-1338", hint->coord.valid == 1);
61900 +
61901 +       coord = &hint->coord.base_coord;
61902 +       result = 0;
61903 +       while (f->length && hint->coord.valid == 1) {
61904 +               switch (mode) {
61905 +               case FIRST_ITEM:
61906 +               case APPEND_ITEM:
61907 +                       /* check quota before appending data: whole length left in the flow is charged here */
61908 +                       if (DQUOT_ALLOC_SPACE_NODIRTY(inode, f->length)) {
61909 +                               result = RETERR(-EDQUOT);
61910 +                               break; /* leaves the switch only, loop is left by check of result below */
61911 +                       }
61912 +
61913 +                       if (!grabbed)
61914 +                               result = insert_flow_reserve(znode_get_tree(coord->node));
61915 +                       if (!result)
61916 +                               result = insert_flow(coord, hint->coord.lh, f);
61917 +                       if (f->length)
61918 +                               DQUOT_FREE_SPACE_NODIRTY(inode, f->length); /* un-charge what is still left in flow */
61919 +                       break;
61920 +
61921 +               case OVERWRITE_ITEM:
61922 +                       if (!grabbed)
61923 +                               result = overwrite_reserve(znode_get_tree(coord->node));
61924 +                       if (!result)
61925 +                               result = overwrite_tail(coord, f);
61926 +                       break;
61927 +
61928 +               default:
61929 +                       impossible("vs-1031", "does this ever happen?");
61930 +                       result = RETERR(-EIO);
61931 +                       break;
61932 +
61933 +               }
61934 +
61935 +               if (result) {
61936 +                       if (!grabbed)
61937 +                               all_grabbed2free();
61938 +                       break;
61939 +               }
61940 +
61941 +               /* FIXME: do not rely on a coord yet */
61942 +               hint->coord.valid = 0;
61943 +
61944 +               /* throttle the writer */
61945 +               result = formatting_balance_dirty_pages(inode->i_mapping, f, hint);
61946 +               if (!grabbed)
61947 +                       all_grabbed2free();
61948 +               if (result) {
61949 +                       // reiser4_stat_tail_add(bdp_caused_repeats);
61950 +                       break;
61951 +               }
61952 +       }
61953 +
61954 +       return result;
61955 +}
61956 +
61957 +#if REISER4_DEBUG
61958 +/* debugging check used by read_tail: non-zero iff offset of @key is offset of the unit @coord is set to */
61959 +static int
61960 +coord_matches_key_tail(const coord_t *coord, const reiser4_key *key)
61961 +{
61962 +       reiser4_key item_key;
61963 +
61964 +       assert("vs-1356", coord_is_existing_unit(coord));
61965 +       assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
61966 +       /* get key of the item outside of assert: result below must not depend on assertion being evaluated */
61967 +       item_key_by_coord(coord, &item_key);
61968 +       assert("vs-1355", keyge(key, &item_key));
61969 +       return get_key_offset(key) == get_key_offset(&item_key) + coord->unit_pos;
61970 +}
61971 +#endif
61972 +
61973 +/* plugin->u.item.s.file.read
61974 +   Copy to the user buffer of @f bytes of the tail item, starting at the unit the coord of @hint is set to: all that
61975 +   the item has from that unit on, but not more than @f->length. Flow and coord are then moved forward by the number
61976 +   of bytes copied. Returns 0 on success, -EFAULT if copying to user space failed */
61977 +reiser4_internal int
61978 +read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
61979 +{
61980 +       unsigned count;
61981 +       int item_length;
61982 +       coord_t *coord;
61983 +
61984 +       coord = &hint->coord.base_coord;
61985 +
61986 +       assert("vs-571", f->user == 1 && f->data);
61987 +       assert("vs-967", coord && coord->node);
61988 +       assert("vs-1117", znode_is_rlocked(coord->node));
61989 +       assert("vs-1118", znode_is_loaded(coord->node));
61990 +
61991 +       assert("nikita-3037", schedulable());
61992 +       assert("vs-1357", coord_matches_key_tail(coord, &f->key));
61993 +
61994 +       /* calculate number of bytes to read off the item */
61995 +       item_length = item_length_by_coord(coord);
61996 +       count = item_length - coord->unit_pos;
61997 +       if (count > f->length)
61998 +               count = f->length;
61999 +
62000 +       /* FIXME: unlock long term lock ! */
62001 +
62002 +       if (__copy_to_user(f->data, ((char *) item_body_by_coord(coord) + coord->unit_pos), count))
62003 +               return RETERR(-EFAULT);
62004 +
62005 +       /* probably mark_page_accessed() should only be called if
62006 +        * coord->unit_pos is zero. */
62007 +       mark_page_accessed(znode_page(coord->node));
62008 +       move_flow_forward(f, count);
62009 +
62010 +       coord->unit_pos += count;
62011 +       if (item_length == coord->unit_pos) {
62012 +               /* the item is read up to its end: leave coord after the last unit of the item */
62013 +               coord->unit_pos --;
62014 +               coord->between = AFTER_UNIT;
62015 +       }
62016 +
62017 +       return 0;
62018 +}
62019 +
62020 +/* plugin->u.item.s.file.append_key
62021 +   store in @key the key of the first byte past the last byte addressed by the item at @coord, that is, key of the
62022 +   item with offset increased by length of the item. Returns @key */
62023 +reiser4_internal reiser4_key *
62024 +append_key_tail(const coord_t *coord, reiser4_key *key)
62025 +{
62026 +       item_key_by_coord(coord, key);
62027 +       /* @key is key of the item now, move its offset past the end of the item */
62028 +       set_key_offset(key, item_length_by_coord(coord) + get_key_offset(key));
62029 +       return key;
62030 +}
62031 +
62032 +/* plugin->u.item.s.file.init_coord_extension
62033 +   nothing but the validity flag of @uf_coord is set up here for tails; @lookuped is not used */
62034 +reiser4_internal void init_coord_extension_tail(uf_coord_t *uf_coord, loff_t lookuped)
62035 +{
62036 +       uf_coord->valid = 1;
62037 +}
62038 +
62039 +/* plugin->u.item.s.file.get_block
62040 +   block number stored in @bh is that of the node @coord is in, which must be a leaf. @block is not used.
62041 +   Always returns 0 */
62042 +reiser4_internal int
62043 +get_block_address_tail(const coord_t *coord, sector_t block, struct buffer_head *bh)
62044 +{
62045 +       znode *leaf = coord->node;
62046 +
62047 +       assert("nikita-3252", znode_get_level(leaf) == LEAF_LEVEL);
62048 +       bh->b_blocknr = *znode_get_block(leaf);
62049 +       return 0;
62050 +}
62051 +
62052 +/*
62053 +   Local variables:
62054 +   c-indentation-style: "K&R"
62055 +   mode-name: "LC"
62056 +   c-basic-offset: 8
62057 +   tab-width: 8
62058 +   fill-column: 120
62059 +   scroll-step: 1
62060 +   End:
62061 +*/
62062 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/item/tail.h linux-2.6.8-rc3-a/fs/reiser4/plugin/item/tail.h
62063 --- linux-2.6.8-rc3/fs/reiser4/plugin/item/tail.h       1970-01-01 03:00:00.000000000 +0300
62064 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/item/tail.h     2004-08-05 21:20:53.134649673 +0400
62065 @@ -0,0 +1,56 @@
62066 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62067 +
62068 +#if !defined( __REISER4_TAIL_H__ )
62069 +#define __REISER4_TAIL_H__
62070 +
62071 +typedef struct {
62072 +       int not_used;   /* placeholder, as its name says */
62073 +} tail_coord_extension_t;
62074 +
62075 +struct cut_list;
62076 +
62077 +
62078 +/* plugin->u.item.b.* */
62079 +reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
62080 +int can_contain_key_tail(const coord_t * coord, const reiser4_key * key, const reiser4_item_data *);
62081 +int mergeable_tail(const coord_t * p1, const coord_t * p2);
62082 +pos_in_node_t nr_units_tail(const coord_t *);
62083 +lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
62084 +int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
62085 +int can_shift_tail(unsigned free_space, coord_t * source,
62086 +                  znode * target, shift_direction, unsigned *size, unsigned want);
62087 +void copy_units_tail(coord_t * target, coord_t * source,
62088 +                    unsigned from, unsigned count, shift_direction, unsigned free_space);
62089 +int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count, struct carry_kill_data *);
62090 +int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
62091 +                  struct carry_cut_data *, reiser4_key *smallest_removed, reiser4_key *new_first);
62092 +int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
62093 +                   struct carry_kill_data *, reiser4_key *smallest_removed, reiser4_key *new_first);
62094 +reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
62095 +
62096 +/* plugin->u.item.s.* */
62097 +int write_tail(struct inode *, flow_t *, hint_t *, int grabbed, write_mode_t);
62098 +int read_tail(struct file *, flow_t *, hint_t *);
62099 +int readpage_tail(void *vp, struct page *page);
62100 +reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
62101 +void init_coord_extension_tail(uf_coord_t *, loff_t offset);
62102 +int get_block_address_tail(const coord_t *coord,
62103 +                          sector_t block, struct buffer_head *bh);
62104 +
62105 +void show_tail(struct seq_file *m, coord_t *coord);
62106 +int item_balance_dirty_pages(struct address_space *mapping, const flow_t *f,
62107 +                            hint_t *hint, int back_to_dirty, int set_hint);
62108 +
62109 +/* __REISER4_TAIL_H__ */
62110 +#endif
62111 +
62112 +/* Make Linus happy.
62113 +   Local variables:
62114 +   c-indentation-style: "K&R"
62115 +   mode-name: "LC"
62116 +   c-basic-offset: 8
62117 +   tab-width: 8
62118 +   fill-column: 120
62119 +   scroll-step: 1
62120 +   End:
62121 +*/
62122 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/name/invterp.c linux-2.6.8-rc3-a/fs/reiser4/plugin/name/invterp.c
62123 --- linux-2.6.8-rc3/fs/reiser4/plugin/name/invterp.c    1970-01-01 03:00:00.000000000 +0300
62124 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/name/invterp.c  2004-08-05 21:20:53.138648830 +0400
62125 @@ -0,0 +1,11 @@
62126 +/* Invterp is short for invertible interpolate, and interpolate means to
62127 +substitute in.
62128 +
62129 +Example:
62130 +
62131 +/filenameA/<>
62132 +will resolve to
62133 +/filenameA<-`The contents of filenameA'
62134 +wherever used.
62135 +
62136 +*/
62137 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/node/node.c linux-2.6.8-rc3-a/fs/reiser4/plugin/node/node.c
62138 --- linux-2.6.8-rc3/fs/reiser4/plugin/node/node.c       1970-01-01 03:00:00.000000000 +0300
62139 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/node/node.c     2004-08-05 21:20:53.129650728 +0400
62140 @@ -0,0 +1,375 @@
62141 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62142 +
62143 +/* Node plugin interface.
62144 +
62145 +   Description: The tree provides the abstraction of flows, which it
62146 +   internally fragments into items which it stores in nodes.
62147 +
62148 +   A key_atom is a piece of data bound to a single key.
62149 +
62150 +   For reasonable space efficiency to be achieved it is often
62151 +   necessary to store key_atoms in the nodes in the form of items, where
62152 +   an item is a sequence of key_atoms of the same or similar type. It is
62153 +   more space-efficient, because the item can implement (very)
62154 +   efficient compression of key_atom's bodies using internal knowledge
62155 +   about their semantics, and it can often avoid having a key for each
62156 +   key_atom. Each type of item has specific operations implemented by its
62157 +   item handler (see balance.c).
62158 +
62159 +   Rationale: the rest of the code (specifically balancing routines)
62160 +   accesses leaf level nodes through this interface. This way we can
62161 +   implement various block layouts and even combine various layouts
62162 +   within the same tree. Balancing/allocating algorithms should not
62163 +   care about peculiarities of splitting/merging specific item types,
62164 +   but rather should leave that to the item's item handler.
62165 +
62166 +   Items, including those that provide the abstraction of flows, have
62167 +   the property that if you move them in part or in whole to another
62168 +   node, the balancing code invokes their is_left_mergeable()
62169 +   item_operation to determine if they are mergeable with their new
62170 +   neighbor in the node you have moved them to.  For some items the
62171 +   is_left_mergeable() function always returns null.
62172 +
62173 +   When moving the bodies of items from one node to another:
62174 +
62175 +     if a partial item is shifted to another node the balancing code invokes
62176 +     an item handler method to handle the item splitting.
62177 +
62178 +     if the balancing code needs to merge with an item in the node it
62179 +     is shifting to, it will invoke an item handler method to handle
62180 +     the item merging.
62181 +
62182 +     if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy()
62183 +     adjusting the item headers after the move is done using the node handler.
62184 +*/
62185 +
62186 +#include "../../forward.h"
62187 +#include "../../debug.h"
62188 +#include "../../key.h"
62189 +#include "../../coord.h"
62190 +#include "../plugin_header.h"
62191 +#include "../item/item.h"
62192 +#include "node.h"
62193 +#include "../plugin.h"
62194 +#include "../../znode.h"
62195 +#include "../../tree.h"
62196 +#include "../../super.h"
62197 +#include "../../reiser4.h"
62198 +
62199 +/* store in @key starting key of the leftmost item in @node, or max_key() if @node is empty. Returns @key */
62200 +reiser4_internal reiser4_key *
62201 +leftmost_key_in_node(const znode * node /* node to query */ ,
62202 +                    reiser4_key * key /* resulting key */ )
62203 +{
62204 +       coord_t leftmost;
62205 +
62206 +       assert("nikita-1634", node != NULL);
62207 +       assert("nikita-1635", key != NULL);
62208 +       if (node_is_empty(node)) {
62209 +               *key = *max_key();
62210 +               return key;
62211 +       }
62212 +       coord_init_first_unit(&leftmost, (znode *) node);
62213 +       item_key_by_coord(&leftmost, key);
62214 +       return key;
62215 +}
62216 +
62217 +#if REISER4_DEBUG_OUTPUT
62218 +/* helper function: convert 4 bit integer to the lower case hex digit which represents it */
62219 +/* Audited by: green(2002.06.12) */
62220 +static char
62221 +hex_to_ascii(const int hex /* hex digit */ )
62222 +{
62223 +       const int below_ten = (hex < 10);
62224 +
62225 +       assert("nikita-1081", (0 <= hex) && (hex < 0x10));
62226 +
62227 +       /* '0'..'9' for 0..9, 'a'..'f' for 10..15 */
62228 +       return below_ten ? '0' + hex : 'a' + hex - 10;
62229 +}
62230 +
62231 +/* indentation helper for recursive tree printing: prints @indentation, @indentation - 1, ..., 1, each with 8 dots */
62232 +/* Audited by: green(2002.06.12) */
62233 +reiser4_internal void
62234 +indent(unsigned indentation)
62235 +{
62236 +       unsigned left;
62237 +
62238 +       for (left = indentation; left != 0; --left)
62239 +               printk("%.1i........", left);
62240 +}
62241 +
62242 +/* indent output for @node by tree height minus level of @node, or by zero if level is above the height */
62243 +reiser4_internal void
62244 +indent_znode(const znode * node /* current node */ )
62245 +{
62246 +       unsigned depth = 0;
62247 +
62248 +       if (znode_get_tree(node)->height >= znode_get_level(node))
62249 +               depth = znode_get_tree(node)->height - znode_get_level(node);
62250 +       indent(depth);
62251 +}
62252 +
62253 +/* debugging aid: output human readable information about @node: output of ->print() of node plugin and delimiting
62254 +   keys (if node plugin has ->print()), and for each item: its plugin, key, output of ->b.print() of item plugin (if
62255 +   it has one) and hex dump of item body. If @node is not loaded, only a notice about that is printed */
62256 +reiser4_internal void
62257 +print_node_content(const char *prefix /* output prefix */ ,
62258 +                  const znode * node /* node to print */ ,
62259 +                  __u32 flags /* print flags */ )
62260 +{
62261 +       unsigned short i;
62262 +       coord_t coord;
62263 +       item_plugin *iplug;
62264 +       reiser4_key key;
62265 +
62266 +       if (!znode_is_loaded(node)) {
62267 +               print_znode("znode is not loaded\n", node);
62268 +               return;
62269 +       }
62270 +       if (node_plugin_by_node(node)->print != NULL) {
62271 +               indent_znode(node);
62272 +               node_plugin_by_node(node)->print(prefix, node, flags);
62273 +
62274 +               indent_znode(node);
62275 +               print_key("LDKEY", &node->ld_key);
62276 +
62277 +               indent_znode(node);
62278 +               print_key("RDKEY", &node->rd_key);
62279 +       }
62280 +
62281 +       coord.node = (znode *) node;
62282 +       coord.unit_pos = 0;
62283 +       coord.between = AT_UNIT;
62284 +       for (i = 0; i < node_num_items(node); i++) {
62285 +               int j;
62286 +               int length;
62287 +               char *data;
62288 +
62289 +               indent_znode(node);
62290 +               printk("%d: ", i);
62291 +               coord_set_item_pos(&coord, i);
62292 +
62293 +               iplug = item_plugin_by_coord(&coord);
62294 +               print_plugin("\titem plugin", item_plugin_to_plugin(iplug));
62295 +               indent_znode(node);
62296 +               item_key_by_coord(&coord, &key);
62297 +               print_key("\titem key", &key);
62298 +               /* ->b.print() is optional, print_node_items() checks for it too */
62299 +               if (iplug->b.print != NULL) {
62300 +                       indent_znode(node);
62301 +                       printk("\tlength %d\n", item_length_by_coord(&coord));
62302 +                       indent_znode(node);
62303 +                       iplug->b.print("\titem", &coord);
62304 +               }
62305 +               data = item_body_by_coord(&coord);
62306 +               length = item_length_by_coord(&coord);
62307 +               indent_znode(node);
62308 +               /* difference of pointers is not an int: cast it to match %i (offset is within a node) */
62309 +               printk("\titem length: %i, offset: %i\n", length, (int) (data - zdata(node)));
62310 +               for (j = 0; j < length; ++j) {
62311 +                       char datum;
62312 +
62313 +                       if ((j % 16) == 0) {
62314 +                               /* next 16 bytes */
62315 +                               if (j == 0) {
62316 +                                       indent_znode(node);
62317 +                                       printk("\tdata % .2i: ", j);
62318 +                               } else {
62319 +                                       printk("\n");
62320 +                                       indent_znode(node);
62321 +                                       printk("\t     % .2i: ", j);
62322 +                               }
62323 +                       }
62324 +                       datum = data[j];
62325 +                       printk("%c", hex_to_ascii((datum & 0xf0) >> 4));
62326 +                       printk("%c ", hex_to_ascii(datum & 0xf));
62327 +               }
62328 +               printk("\n");
62329 +               indent_znode(node);
62330 +               printk("======================\n");
62331 +       }
62332 +       printk("\n");
62333 +}
62334 +
62335 +/* debugging aid: output human readable information about @node the same way print_node_content() does, but only
62336 +   for @count items starting with item number @from. If these items are not all within the node, a notice about
62337 +   that is printed instead of items */
62338 +reiser4_internal void
62339 +print_node_items(const char *prefix /* output prefix */ ,
62340 +                const znode * node /* node to print */ ,
62341 +                __u32 flags /* print flags */ ,
62342 +                unsigned from, unsigned count)
62343 +{
62344 +       unsigned i;
62345 +       coord_t coord;
62346 +       item_plugin *iplug;
62347 +       reiser4_key key;
62348 +
62349 +       if (!znode_is_loaded(node)) {
62350 +               print_znode("znode is not loaded\n", node);
62351 +               return;
62352 +       }
62353 +       if (node_plugin_by_node(node)->print != NULL) {
62354 +               indent_znode(node);
62355 +               node_plugin_by_node(node)->print(prefix, node, flags);
62356 +
62357 +               indent_znode(node);
62358 +               print_key("LDKEY", &node->ld_key);
62359 +
62360 +               indent_znode(node);
62361 +               print_key("RDKEY", &node->rd_key);
62362 +       }
62363 +
62364 +       coord.node = (znode *) node;
62365 +       coord.unit_pos = 0;
62366 +       coord.between = AT_UNIT;
62367 +       /* range is checked without adding @from to @count, so that the check cannot be fooled by sum wrapping around */
62368 +       if (from >= node_num_items(node) || count > node_num_items(node) - from) {
62369 +               printk("there are no those items (%u-%u) in the node (%u)\n",
62370 +                      from, from + count - 1, node_num_items(node));
62371 +               return;
62372 +       }
62373 +
62374 +       for (i = from; i < from + count; i++) {
62375 +               int j;
62376 +               int length;
62377 +               char *data;
62378 +
62379 +               indent_znode(node);
62380 +               printk("%u: ", i);
62381 +
62382 +               coord_set_item_pos(&coord, i);
62383 +
62384 +               iplug = item_plugin_by_coord(&coord);
62385 +               print_plugin("\titem plugin", item_plugin_to_plugin(iplug));
62386 +               indent_znode(node);
62387 +               item_key_by_coord(&coord, &key);
62388 +               print_key("\titem key", &key);
62389 +
62390 +               if (iplug->b.print) {
62391 +                       indent_znode(node);
62392 +                       printk("\tlength %d\n", item_length_by_coord(&coord));
62393 +                       indent_znode(node);
62394 +                       iplug->b.print("\titem", &coord);
62395 +               }
62396 +               data = item_body_by_coord(&coord);
62397 +               length = item_length_by_coord(&coord);
62398 +               indent_znode(node);
62399 +               /* difference of pointers is not an int: cast it to match %i (offset is within a node) */
62400 +               printk("\titem length: %i, offset: %i\n", length, (int) (data - zdata(node)));
62401 +               for (j = 0; j < length; ++j) {
62402 +                       char datum;
62403 +
62404 +                       if ((j % 16) == 0) {
62405 +                               /* next 16 bytes */
62406 +                               if (j == 0) {
62407 +                                       indent_znode(node);
62408 +                                       printk("\tdata % .2i: ", j);
62409 +                               } else {
62410 +                                       printk("\n");
62411 +                                       indent_znode(node);
62412 +                                       printk("\t     % .2i: ", j);
62413 +                               }
62414 +                       }
62415 +                       datum = data[j];
62416 +                       printk("%c", hex_to_ascii((datum & 0xf0) >> 4));
62417 +                       printk("%c ", hex_to_ascii(datum & 0xf));
62418 +               }
62419 +               printk("\n");
62420 +               indent_znode(node);
62421 +               printk("======================\n");
62422 +       }
62423 +       printk("\n");
62424 +}
62425 +#endif
62426 +
62427 +#if REISER4_DEBUG_NODE
62428 +/* debugging aid: check consistency of @node content with ->check() of its node plugin and panic if check fails.
62429 +   Nothing is done if REISER4_CHECK_NODE debugging is off for current super block, if node checks are disabled in
62430 +   current context, or if @node is above root, is just created or cannot be loaded */
62431 +void
62432 +node_check(znode * node /* node to check */ ,
62433 +          __u32 flags /* check flags */ )
62434 +{
62435 +       const char *mes;
62436 +       int result;
62437 +
62438 +       assert("nikita-3534", schedulable());
62439 +
62440 +       if (!reiser4_is_debugged(reiser4_get_current_sb(), REISER4_CHECK_NODE))
62441 +               return;
62442 +       if (get_current_context()->disable_node_check)
62443 +               return;
62444 +       if (znode_above_root(node))
62445 +               return;
62446 +       if (znode_just_created(node))
62447 +               return;
62448 +
62449 +       /* content of node which failed to load cannot be checked, and there is nothing to zrelse() then */
62450 +       if (zload(node) != 0)
62451 +               return;
62452 +       result = node_plugin_by_node(node)->check(node, flags, &mes);
62453 +       if (result != 0) {
62454 +               printk("%s\n", mes);
62455 +               print_node_content("check", node, ~0u);
62456 +               reiser4_panic("vs-273", "node corrupted");
62457 +       }
62458 +       zrelse(node);
62459 +}
62460 +#endif
62461 +/* node plugins, indexed by node plugin id. NODE40_ID is the only entry initialized here */
62462 +node_plugin node_plugins[LAST_NODE_ID] = {
62463 +       [NODE40_ID] = {
62464 +               .h = {
62465 +                       .type_id = REISER4_NODE_PLUGIN_TYPE,
62466 +                       .id = NODE40_ID,
62467 +                       .pops = NULL,
62468 +                       .label = "unified",
62469 +                       .desc = "unified node layout",
62470 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO,
62471 +               },
62472 +               .item_overhead = item_overhead_node40,
62473 +               .free_space = free_space_node40,
62474 +               .lookup = lookup_node40,
62475 +               .num_of_items = num_of_items_node40,
62476 +               .item_by_coord = item_by_coord_node40,
62477 +               .length_by_coord = length_by_coord_node40,
62478 +               .plugin_by_coord = plugin_by_coord_node40,
62479 +               .key_at = key_at_node40,
62480 +               .estimate = estimate_node40,
62481 +               .check = check_node40,
62482 +               .parse = parse_node40,
62483 +               .init = init_node40,
62484 +#ifdef GUESS_EXISTS
62485 +               .guess = guess_node40,
62486 +#endif
62487 +#if REISER4_DEBUG_OUTPUT
62488 +               .print = print_node40,
62489 +#endif
62490 +               .change_item_size = change_item_size_node40,
62491 +               .create_item = create_item_node40,
62492 +               .update_item_key = update_item_key_node40,
62493 +               .cut_and_kill = kill_node40,
62494 +               .cut = cut_node40,
62495 +               .shift = shift_node40,
62496 +               .shrink_item = shrink_item_node40,
62497 +               .fast_insert = fast_insert_node40,
62498 +               .fast_paste = fast_paste_node40,
62499 +               .fast_cut = fast_cut_node40,
62500 +               .max_item_size = max_item_size_node40,
62501 +               .prepare_removal = prepare_removal_node40,
62502 +               .set_item_plugin = set_item_plugin_node40
62503 +       }
62504 +};
62505 +
62506 +/*
62507 +   Local variables:
62508 +   c-indentation-style: "K&R"
62509 +   mode-name: "LC"
62510 +   c-basic-offset: 8
62511 +   tab-width: 8
62512 +   fill-column: 120
62513 +   scroll-step: 1
62514 +   End:
62515 +*/
62516 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/node/node.h linux-2.6.8-rc3-a/fs/reiser4/plugin/node/node.h
62517 --- linux-2.6.8-rc3/fs/reiser4/plugin/node/node.h       1970-01-01 03:00:00.000000000 +0300
62518 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/node/node.h     2004-08-05 21:20:53.387596321 +0400
62519 @@ -0,0 +1,292 @@
62520 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62521 +
62522 +/* We need a definition of the default node layout here. */
62523 +
62524 +/* Generally speaking, it is best to have free space in the middle of the
62525 +   node so that two sets of things can grow towards it, and to have the
62526 +   item bodies on the left so that the last one of them grows into free
62527 +   space.  We optimize for the case where we append new items to the end
62528 +   of the node, or grow the last item, because it hurts nothing to so
62529 +   optimize and it is a common special case to do massive insertions in
62530 +   increasing key order (and one of cases more likely to have a real user
62531 +   notice the delay time for).
62532 +
62533 +   formatted leaf default layout: (leaf1)
62534 +
62535 +   |node header:item bodies:free space:key + pluginid + item offset|
62536 +
62537 +   We grow towards the middle, optimizing layout for the case where we
62538 +   append new items to the end of the node.  The node header is fixed
62539 +   length.  Keys, and item offsets plus pluginids for the items
62540 +   corresponding to them are in increasing key order, and are fixed
62541 +   length.  Item offsets are relative to start of node (16 bits creating
62542 +   a node size limit of 64k, 12 bits might be a better choice....).  Item
62543 +   bodies are in decreasing key order.  Item bodies have a variable size.
62544 +   There is a one to one to one mapping of keys to item offsets to item
62545 +   bodies.  Item offsets consist of pointers to the zeroth byte of the
62546 +   item body.  Item length equals the start of the next item minus the
62547 +   start of this item, except the zeroth item whose length equals the end
62548 +   of the node minus the start of that item (plus a byte).  In other
62549 +   words, the item length is not recorded anywhere, and it does not need
62550 +   to be since it is computable.
62551 +
62552 +   Leaf variable length items and keys layout : (lvar)
62553 +
62554 +   |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
62555 +
62556 +   We grow towards the middle, optimizing layout for the case where we
62557 +   append new items to the end of the node.  The node header is fixed
62558 +   length.  Keys and item offsets for the items corresponding to them are
62559 +   in increasing key order, and keys are variable length.  Item offsets
62560 +   are relative to start of node (16 bits).  Item bodies are in
62561 +   decreasing key order.  Item bodies have a variable size.  There is a
62562 +   one to one to one mapping of keys to item offsets to item bodies.
62563 +   Item offsets consist of pointers to the zeroth byte of the item body.
62564 +   Item length equals the start of the next item's key minus the start of
62565 +   this item, except the zeroth item whose length equals the end of the
62566 +   node minus the start of that item (plus a byte).
62567 +
62568 +   leaf compressed keys layout: (lcomp)
62569 +
62570 +   |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
62571 +
62572 +   We grow towards the middle, optimizing layout for the case where we
62573 +   append new items to the end of the node.  The node header is fixed
62574 +   length.  Keys and item offsets for the items corresponding to them are
62575 +   in increasing key order, and keys are variable length.  The "key
62576 +   inherit" field indicates how much of the key prefix is identical to
62577 +   the previous key (stem compression as described in "Managing
62578 +   Gigabytes" is used).  key_inherit is a one byte integer.  The
62579 +   intra-node searches performed through this layout are linear searches,
62580 +   and this is theorized to not hurt performance much due to the high
62581 +   cost of processor stalls on modern CPUs, and the small number of keys
62582 +   in a single node.  Item offsets are relative to start of node (16
62583 +   bits).  Item bodies are in decreasing key order.  Item bodies have a
62584 +   variable size.  There is a one to one to one mapping of keys to item
62585 +   offsets to item bodies.  Item offsets consist of pointers to the
62586 +   zeroth byte of the item body.  Item length equals the start of the
62587 +   next item minus the start of this item, except the zeroth item whose
62588 +   length equals the end of the node minus the start of that item (plus a
62589 +   byte).  In other words, item length and key length are not recorded
62590 +   anywhere, and they do not need to be since they are computable.
62591 +
62592 +   internal node default layout: (idef1)
62593 +
62594 +   just like ldef1 except that item bodies are either blocknrs of
62595 +   children or extents, and moving them may require updating parent
62596 +   pointers in the nodes that they point to.
62597 +*/
62598 +
62599 +/* There is an inherent 3-way tradeoff between optimizing and
62600 +   exchanging disks between different architectures and code
62601 +   complexity.  This is optimal and simple and inexchangeable.
62602 +   Someone else can do the code for exchanging disks and make it
62603 +   complex. It would not be that hard.  Using other than the PAGE_SIZE
62604 +   might be suboptimal.
62605 +*/
62606 +
62607 +#if !defined( __REISER4_NODE_H__ )
62608 +#define __REISER4_NODE_H__
62609 +
62610 +#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
62611 +
62612 +#include "../../dformat.h"
62613 +#include "../plugin_header.h"
62614 +
62615 +#include <linux/types.h>
62616 +
62617 +typedef enum {               /* result type of node_plugin->lookup() */
62618 +       NS_FOUND = 0,           /* success is zero */
62619 +       NS_NOT_FOUND = -ENOENT  /* failure reuses the negative errno value */
62620 +} node_search_result;
62621 +
62622 +/* Maximal possible space overhead for creation of new item in a node: size of one key plus 32 more bytes */
62623 +#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
62624 +
62625 +typedef enum {               /* bit flags passed as @flags to node_check() and node_plugin->check() */
62626 +       REISER4_NODE_DKEYS       = (1 << 0),    /* check_node40() consults znode_get_ld_key() when this is set */
62627 +       REISER4_NODE_TREE_STABLE = (1 << 1)     /* NOTE(review): meaning is not visible in this header - confirm */
62628 +} reiser4_node_check_flag;
62629 +
62630 +/* cut and cut_and_kill take too long a list of parameters. This structure bundles them just to save some space on stack */
62631 +struct cut_list {
62632 +       coord_t * from;
62633 +       coord_t * to;
62634 +       const reiser4_key * from_key;
62635 +       const reiser4_key * to_key;
62636 +       reiser4_key * smallest_removed;
62637 +       carry_plugin_info * info;
62638 +       __u32 flags;
62639 +       struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */
62640 +       lock_handle *left;
62641 +       lock_handle *right;
62642 +};
62643 +
62644 +struct carry_cut_data;
62645 +struct carry_kill_data;
62646 +
62647 +/* The responsibility of the node plugin is to store and give access
62648 +   to the sequence of items within the node.  */
62649 +typedef struct node_plugin {
62650 +       /* generic plugin fields */
62651 +       plugin_header h;
62652 +
62653 +       /* calculates the amount of space that will be required to store an
62654 +          item which is in addition to the space consumed by the item body.
62655 +          (the space consumed by the item body can be gotten by calling
62656 +          item->estimate) */
62657 +        size_t(*item_overhead) (const znode * node, flow_t * f);
62658 +
62659 +       /* returns free space by looking into node (i.e., without using
62660 +          znode->free_space). */
62661 +        size_t(*free_space) (znode * node);
62662 +       /* search within the node for the one item which might
62663 +           contain the key, then invoke the item plugin's own lookup method
62664 +           (when it has one) to search within that item */
62665 +        node_search_result(*lookup) (znode * node, const reiser4_key * key, lookup_bias bias, coord_t * coord);
62666 +       /* number of items in node */
62667 +       int (*num_of_items) (const znode * node);
62668 +
62669 +       /* store information about item in @coord in @data */
62670 +       /* break into several node ops, don't add any more uses of this before doing so */
62671 +       /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
62672 +       char *(*item_by_coord) (const coord_t * coord);
62673 +       int (*length_by_coord) (const coord_t * coord);
62674 +       item_plugin *(*plugin_by_coord) (const coord_t * coord);
62675 +
62676 +       /* store item key in @key */
62677 +       reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
62678 +       /* conservatively estimate the largest unit size that can fit
62679 +           into node. This estimation should be performed without
62680 +           actually looking into the node's content (free space is saved in
62681 +           znode). */
62682 +        size_t(*estimate) (znode * node);
62683 +
62684 +       /* performs every consistency check the node plugin author could
62685 +          imagine. Optional. */
62686 +       int (*check) (const znode * node, __u32 flags, const char **error);
62687 +
62688 +       /* Called when node is read into memory and node plugin is
62689 +          already detected. This should read some data into znode (like free
62690 +          space counter) and, optionally, check data consistency.
62691 +       */
62692 +       int (*parse) (znode * node);
62693 +       /* This method is called on a new node to initialise plugin specific
62694 +          data (header, etc.) */
62695 +       int (*init) (znode * node);
62696 +       /* Check whether @node content conforms to this plugin format.
62697 +          Probably only useful after support for old V3.x formats is added.
62698 +          Uncomment after 4.0 only.
62699 +       */
62700 +       /*      int ( *guess )( const znode *node ); */
62701 +#if REISER4_DEBUG_OUTPUT
62702 +       void (*print) (const char *prefix, const znode * node, __u32 flags);
62703 +#endif
62704 +       /* change size of @item by @by bytes. @item->node has enough free
62705 +          space. When @by > 0 - free space is appended to end of item. When
62706 +          @by < 0 - item is truncated - it is assumed that last @by bytes of
62707 +          the item are freed already */
62708 +       void (*change_item_size) (coord_t * item, int by);
62709 +
62710 +       /* create new item in coord @target (NOTE(review): its length is presumably carried in @data - confirm) */
62711 +       int (*create_item) (coord_t * target, const reiser4_key * key,
62712 +                           reiser4_item_data * data, carry_plugin_info * info);
62713 +
62714 +       /* update key of item. */
62715 +       void (*update_item_key) (coord_t * target, const reiser4_key * key, carry_plugin_info * info);
62716 +
62717 +       int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
62718 +       int (*cut) (struct carry_cut_data *, carry_plugin_info *);
62719 +
62720 +       /*
62721 +        * shrink item pointed to by @coord by @delta bytes.
62722 +        */
62723 +       int (*shrink_item) (coord_t *coord, int delta);
62724 +
62725 +       /* copy as much as possible but not more than up to @stop from
62726 +          @stop->node to @target. If (pend == append) then data from beginning of
62727 +          @stop->node are copied to the end of @target. If (pend == prepend) then
62728 +          data from the end of @stop->node are copied to the beginning of
62729 +          @target. Copied data are removed from @stop->node. Information
62730 +          about what to do on upper level is stored in @info */
62731 +       int (*shift) (coord_t * stop, znode * target, shift_direction pend,
62732 +                     int delete_node, int including_insert_coord, carry_plugin_info * info);
62733 +       /* return true if this node allows skip carry() in some situations
62734 +          (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format
62735 +          emulation doesn't.
62736 +
62737 +          This will speed up insertions that do not require updates to the
62738 +          parent, by bypassing initialisation of carry() structures. It's
62739 +          believed that majority of insertions will fit there.
62740 +
62741 +       */
62742 +       int (*fast_insert) (const coord_t * coord);
62743 +       int (*fast_paste) (const coord_t * coord);
62744 +       int (*fast_cut) (const coord_t * coord);
62745 +       /* this limits max size of item which can be inserted into a node and
62746 +          number of bytes item in a node may be appended with */
62747 +       int (*max_item_size) (void);
62748 +       int (*prepare_removal) (znode * empty, carry_plugin_info * info);
62749 +       /* change plugin id of items which are in a node already. Currently it is used in tail conversion for regular
62750 +        * files */
62751 +       int (*set_item_plugin) (coord_t * coord, item_id);
62752 +} node_plugin;
62753 +
62754 +typedef enum {
62755 +       /* standard unified node layout used for both leaf and internal
62756 +           nodes */
62757 +       NODE40_ID,
62758 +       LAST_NODE_ID            /* sentinel: equals the number of ids listed above */
62759 +} reiser4_node_id;
62760 +
62761 +extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
62762 +#if REISER4_DEBUG_OUTPUT      /* debugging output compiled in: real printing functions */
62763 +extern void print_node_content(const char *prefix, const znode * node, __u32 flags);
62764 +extern void print_node_items(const char *prefix /* output prefix */ ,
62765 +                            const znode * node /* node to print */ ,
62766 +                            __u32 flags /* print flags */ ,
62767 +                            unsigned from, unsigned count);
62768 +#else                         /* NOTE(review): only print_node_content() gets a noop fallback, print_node_items() does not */
62769 +#define print_node_content(p,n,f) noop
62770 +#endif                                /* REISER4_DEBUG_OUTPUT */
62771 +
62772 +extern void indent(unsigned indentation);
62773 +extern void indent_znode(const znode * node);
62774 +
62775 +#if REISER4_DEBUG_NODE        /* node checking compiled in */
62776 +extern void node_check(znode * node, __u32 flags);    /* the macros below adjust a counter kept in the current context */
62777 +#define DISABLE_NODE_CHECK                             \
62778 +({                                                     \
62779 +       ++ get_current_context() -> disable_node_check; \
62780 +})
62781 +/* counterpart of DISABLE_NODE_CHECK: decrements the same counter */
62782 +#define ENABLE_NODE_CHECK                              \
62783 +({                                                     \
62784 +       -- get_current_context() -> disable_node_check; \
62785 +})
62786 +
62787 +#else                         /* !REISER4_DEBUG_NODE: all three expand to noop */
62788 +#define node_check( n, f ) noop
62789 +#define DISABLE_NODE_CHECK noop
62790 +#define ENABLE_NODE_CHECK noop
62791 +#endif                                /* REISER4_DEBUG_NODE */
62792 +
62793 +extern void indent_znode(const znode * node); /* NOTE(review): repeats the declaration made a few lines above */
62794 +
62795 +typedef struct common_node_header {   /* only the plugin id is defined at this level */
62796 +       /* identifier of node plugin. Must be located at the very beginning
62797 +          of a node. */
62798 +       d16 plugin_id;
62799 +} common_node_header;
62800 +/* __REISER4_NODE_H__ */
62801 +#endif
62802 +/*
62803 +   Local variables:
62804 +   c-indentation-style: "K&R"
62805 +   mode-name: "LC"
62806 +   c-basic-offset: 8
62807 +   tab-width: 8
62808 +   fill-column: 120
62809 +   scroll-step: 1
62810 +   End:
62811 +*/
62812 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/node/node40.c linux-2.6.8-rc3-a/fs/reiser4/plugin/node/node40.c
62813 --- linux-2.6.8-rc3/fs/reiser4/plugin/node/node40.c     1970-01-01 03:00:00.000000000 +0300
62814 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/node/node40.c   2004-08-05 21:20:53.060665278 +0400
62815 @@ -0,0 +1,2860 @@
62816 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62817 +
62818 +/*#include "../../forward.h"*/
62819 +#include "../../debug.h"
62820 +#include "../../key.h"
62821 +#include "../../coord.h"
62822 +#include "../plugin_header.h"
62823 +#include "../item/item.h"
62824 +#include "node.h"
62825 +#include "node40.h"
62826 +#include "../plugin.h"
62827 +#include "../../jnode.h"
62828 +#include "../../znode.h"
62829 +#include "../../pool.h"
62830 +#include "../../carry.h"
62831 +#include "../../tap.h"
62832 +#include "../../tree.h"
62833 +#include "../../super.h"
62834 +#include "../../reiser4.h"
62835 +
62836 +#include <asm/uaccess.h>
62837 +#include <linux/types.h>
62838 +#include <linux/prefetch.h>
62839 +
62840 +/* leaf 40 format:
62841 +
62842 +  [node header | item 0, item 1, .., item N-1 |  free space | item_head N-1, .. item_head 1, item head 0 ]
62843 +   plugin_id (16)                                                key
62844 +   free_space (16)                                               pluginid (16)
62845 +   free_space_start (16)                                         offset (16)
62846 +   level (8)
62847 +   num_items (16)
62848 +   magic (32)
62849 +   flush_time (32)
62850 +*/
62851 +/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs.  Change to "ReIs". */
62852 +/* magic number that is stored in ->magic field of node header */
62853 +const __u32 REISER4_NODE_MAGIC = 0x52344653;   /* ASCII 'R' '4' 'F' 'S', most significant byte first */
62854 +
62855 +static int prepare_for_update(znode * left, znode * right, carry_plugin_info * info); /* forward declaration */
62856 +
62857 +/* node40 header occupies the very beginning of the node, so it is simply
62858 +   zdata(@node) viewed as node40_header */
62859 +static inline node40_header *node40_node_header(const znode * node)
62860 +{
62861 +       assert("nikita-567", node != NULL);
62862 +       assert("nikita-568", znode_page(node) != NULL);
62863 +       assert("nikita-569", zdata(node) != NULL);
62864 +
62865 +       return (node40_header *) zdata(node);
62866 +}
62867 +
62868 +/* functions to get/set fields of node40_header */
62869 +
62870 +/* read ->magic of node header @nh through d32tocpu() */
62871 +static __u32 nh40_get_magic(node40_header * nh)
62872 +{
62873 +       return d32tocpu(&nh->magic);
62874 +}
62875 +
62876 +/* store @magic into ->magic of node header @nh through cputod32() */
62877 +static void nh40_set_magic(node40_header * nh, __u32 magic)
62878 +{
62879 +       cputod32(magic, &nh->magic);
62880 +}
62881 +
62882 +/* store @value into ->free_space of node header @nh through cputod16();
62883 +   only the header field is written here */
62884 +static void nh40_set_free_space(node40_header * nh, unsigned value)
62885 +{
62886 +       cputod16(value, &nh->free_space);
62887 +}
62888 +
62889 +/* read ->free_space of node header @nh through d16tocpu() */
62890 +static inline unsigned nh40_get_free_space(node40_header * nh)
62891 +{
62892 +       return d16tocpu(&nh->free_space);
62893 +}
62894 +
62895 +/* store @value into ->free_space_start of node header @nh through cputod16() */
62896 +static void nh40_set_free_space_start(node40_header * nh, unsigned value)
62897 +{
62898 +       cputod16(value, &nh->free_space_start);
62899 +}
62900 +
62901 +/* read ->free_space_start of node header @nh through d16tocpu() */
62902 +static inline unsigned nh40_get_free_space_start(node40_header * nh)
62903 +{
62904 +       return d16tocpu(&nh->free_space_start);
62905 +}
62906 +
62907 +/* store @value into ->level of node header @nh through cputod8() */
62908 +static inline void nh40_set_level(node40_header * nh, unsigned value)
62909 +{
62910 +       cputod8(value, &nh->level);
62911 +}
62912 +
62913 +/* read ->level of node header @nh through d8tocpu() */
62914 +static unsigned nh40_get_level(node40_header * nh)
62915 +{
62916 +       return d8tocpu(&nh->level);
62917 +}
62918 +
62919 +/* store @value into ->nr_items of node header @nh through cputod16() */
62920 +static void nh40_set_num_items(node40_header * nh, unsigned value)
62921 +{
62922 +       cputod16(value, &nh->nr_items);
62923 +}
62924 +
62925 +/* read ->nr_items of node header @nh through d16tocpu() */
62926 +static inline unsigned nh40_get_num_items(node40_header * nh)
62927 +{
62928 +       return d16tocpu(&nh->nr_items);
62929 +}
62930 +
62931 +/* store @id into ->mkfs_id of node header @nh through cputod32() */
62932 +static void nh40_set_mkfs_id(node40_header * nh, __u32 id)
62933 +{
62934 +       cputod32(id, &nh->mkfs_id);
62935 +}
62936 +
62937 +/* read ->mkfs_id of node header @nh through d32tocpu() */
62938 +static inline __u32 nh40_get_mkfs_id(node40_header * nh)
62939 +{
62940 +       return d32tocpu(&nh->mkfs_id);
62941 +}
62942 +
62943 +#if 0 /* compiled out: no setter for the flush id is built */
62944 +static void
62945 +nh40_set_flush_id(node40_header * nh, __u64 id)
62946 +{
62947 +       cputod64(id, &nh->flush.flush_id);      /* NOTE(review): nh40_get_flush_id() reads nh->flush_id, not nh->flush.flush_id */
62948 +}
62949 +#endif /* 0 */
62950 +
62951 +/* read ->flush_id of node header @nh through d64tocpu() */
62952 +static inline __u64 nh40_get_flush_id(node40_header * nh)
62953 +{
62954 +       return d64tocpu(&nh->flush_id);
62955 +}
62956 +
62957 +/* plugin field of node header should be read/set by
62958 +   plugin_by_disk_id/save_disk_plugin */
62959 +
62960 +/* item headers are packed at the tail of the node and indexed backwards from its end */
62961 +static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
62962 +{
62963 +       item_header40 *end = (item_header40 *) (zdata(node) + znode_size(node));
62964 +       return end - pos - 1;
62965 +}
62966 +
62967 +/* same address computation as node40_ih_at(), with node and position taken from @coord */
62968 +static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
62969 +{
62970 +       const znode *node = coord->node;
62971 +       item_header40 *end = (item_header40 *) (zdata(node) + znode_size(node));
62972 +       return end - (coord->item_pos) - 1;
62973 +}
62974 +
62975 +/* functions to get/set fields of item_header40 */
62976 +/* store @offset into ->offset of item header @ih through cputod16() */
62977 +static void ih40_set_offset(item_header40 * ih, unsigned offset)
62978 +{
62979 +       cputod16(offset, &ih->offset);
62980 +}
62981 +
62982 +/* read ->offset of item header @ih through d16tocpu() */
62983 +static inline unsigned ih40_get_offset(item_header40 * ih)
62984 +{
62985 +       return d16tocpu(&ih->offset);
62986 +}
62987 +
62988 +/* plugin field of item header should be read/set by
62989 +   plugin_by_disk_id/save_disk_plugin */
62990 +
62991 +/* plugin methods */
62992 +
62993 +/* plugin->u.node.item_overhead: see plugin/node/node.h.
62994 +   Every item costs one item_header40 on top of its body; both arguments
62995 +   are ignored. */
62996 +reiser4_internal size_t item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
62997 +{
62998 +       return sizeof(item_header40);
62999 +}
63000 +
63001 +/* plugin->u.node.free_space: see plugin/node/node.h; the value is read from the node header */
63002 +reiser4_internal size_t free_space_node40(znode * node)
63003 +{
63004 +       node40_header *nh;
63005 +       assert("nikita-577", node != NULL);
63006 +       assert("nikita-578", znode_is_loaded(node));
63007 +       assert("nikita-579", zdata(node) != NULL);
63008 +       trace_stamp(TRACE_NODES);
63009 +       nh = node40_node_header(node);
63010 +       return nh40_get_free_space(nh);
63011 +}
63012 +
63013 +/* inlinable twin of num_of_items_node40() for callers inside this file: the
63014 +   plugin method has its address taken, so it is never inlined itself.
63015 +   Note that the header value is narrowed to short on return. */
63016 +static inline short node40_num_of_items_internal(const znode * node)
63017 +{
63018 +       trace_stamp(TRACE_NODES);
63019 +
63020 +       return nh40_get_num_items(node40_node_header(node));
63021 +}
63022 +
63023 +#if REISER4_DEBUG
63024 +/* debug only: header item count must match node->nr_items, and @node must be write locked */
63025 +static inline void check_num_items(const znode *node)
63026 +{
63027 +       assert("nikita-2749", node40_num_of_items_internal(node) == node->nr_items);
63028 +       assert("nikita-2746", znode_is_write_locked(node));
63029 +}
63030 +#else
63031 +#define check_num_items(node) noop
63032 +#endif
63033 +
63034 +/* plugin->u.node.num_of_items: see plugin/node/node.h.
63035 +   Out-of-line wrapper around node40_num_of_items_internal(). */
63036 +reiser4_internal int num_of_items_node40(const znode * node)
63037 +{
63038 +       trace_stamp(TRACE_NODES);
63039 +
63040 +       return node40_num_of_items_internal(node);
63041 +}
63042 +
63043 +/* set item count of @node to @value both in node header @nh and in
63044 +   node->nr_items; check_num_items() runs before and after the update */
63045 +static void node40_set_num_items(znode * node, node40_header * nh, unsigned value)
63046 +{
63047 +       assert("nikita-2751", node != NULL);
63048 +       assert("nikita-2750", nh == node40_node_header(node));
63049 +       check_num_items(node);
63050 +       nh40_set_num_items(nh, value);
63051 +       node->nr_items = value;
63052 +       check_num_items(node);
63053 +}
63054 +
63055 +/* plugin->u.node.item_by_coord: see plugin/node/node.h.
63056 +   Address of the item body is start of node data plus the offset kept in
63057 +   the item header. */
63058 +reiser4_internal char *
63059 +item_by_coord_node40(const coord_t * coord)
63060 +{
63061 +       unsigned offset;
63062 +
63063 +       /* caller must pass coord of an existing item */
63064 +       assert("nikita-596", coord != NULL);
63065 +       assert("vs-255", coord_is_existing_item(coord));
63066 +
63067 +       offset = ih40_get_offset(node40_ih_at_coord(coord));
63068 +
63069 +       return zdata(coord->node) + offset;
63070 +}
63071 +
63072 +/* plugin->u.node.length_by_coord: see plugin/node/node.h. An item ends where
63073 +   the next item begins or, for the last item, where free space starts. */
63074 +reiser4_internal int
63075 +length_by_coord_node40(const coord_t * coord)
63076 +{
63077 +       item_header40 *ih;
63078 +       unsigned end;
63079 +
63080 +       /* caller must pass coord of an existing item */
63081 +       assert("vs-256", coord != NULL);
63082 +       assert("vs-257", coord_is_existing_item(coord));
63083 +
63084 +       ih = node40_ih_at_coord(coord);
63085 +       if ((int) coord->item_pos != node40_num_of_items_internal(coord->node) - 1)
63086 +               end = ih40_get_offset(ih - 1);  /* header of the next item precedes this one */
63087 +       else
63088 +               end = nh40_get_free_space_start(node40_node_header(coord->node));
63089 +
63090 +       return end - ih40_get_offset(ih);
63091 +}
63092 +
63093 +/* length of the item at @item_pos in @node: same computation as
63094 +   length_by_coord_node40(), but addressed by position instead of coord */
63095 +static pos_in_node_t node40_item_length(const znode *node, pos_in_node_t item_pos)
63096 +{
63097 +       item_header40 *ih;
63098 +       unsigned end;
63099 +
63100 +       /* NOTE(review): labels below repeat those used in length_by_coord_node40() */
63101 +       assert("vs-256", node != NULL);
63102 +       assert("vs-257", node40_num_of_items_internal(node) > item_pos);
63103 +
63104 +       ih = node40_ih_at(node, item_pos);
63105 +       if (item_pos != node40_num_of_items_internal(node) - 1)
63106 +               end = ih40_get_offset(ih - 1);  /* header of the next item precedes this one */
63107 +       else
63108 +               end = nh40_get_free_space_start(node40_node_header(node));
63109 +       return end - ih40_get_offset(ih);
63110 +}
63111 +
63112 +/* plugin->u.node.plugin_by_coord: see plugin/node/node.h.
63113 +   Item plugin is looked up by the plugin id stored in the item header of
63114 +   the item @coord is set to. */
63115 +reiser4_internal item_plugin *
63116 +plugin_by_coord_node40(const coord_t * coord)
63117 +{
63118 +       item_header40 *ih;
63119 +
63120 +       /* caller must pass coord of an existing item */
63121 +       assert("vs-258", coord != NULL);
63122 +       assert("vs-259", coord_is_existing_item(coord));
63123 +
63124 +       ih = node40_ih_at_coord(coord);
63125 +       /* NULL is passed instead of the current tree: this call is time
63126 +          critical */
63127 +       return item_plugin_by_disk_id(NULL, &ih->plugin_id);
63128 +}
63129 +
63130 +/* plugin->u.node.key_at: see plugin/node/node.h.
63131 +   Copies the key kept in the item header of the item at @coord into @key
63132 +   and returns @key. */
63133 +reiser4_internal reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
63134 +{
63135 +       item_header40 *ih;
63136 +
63137 +       /* caller must pass coord of an existing item */
63138 +       assert("nikita-1765", coord_is_existing_item(coord));
63139 +
63140 +       ih = node40_ih_at_coord(coord);
63141 +       xmemcpy(key, &ih->key, sizeof *key);
63142 +       return key;
63143 +}
63144 +
63145 +/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */
63146 +
63147 +#define NODE_INCSTAT(n, counter)                                               \
63148 +       reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter) /* bump node.lookup.<counter> at the level of @n */
63149 +/* like NODE_INCSTAT, but passes @val on to reiser4_stat_add_at_level(); both are #undef'ed after lookup_node40() */
63150 +#define NODE_ADDSTAT(n, counter, val)                                          \
63151 +       reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
63152 +
63153 +/* plugin->u.node.lookup: find the item of @node that may hold @key and set @coord to it;
63154 +   look for description of this method in plugin/node/node.h */
63155 +reiser4_internal node_search_result
63156 +lookup_node40(znode * node /* node to query */ ,
63157 +             const reiser4_key * key /* key to look for */ ,
63158 +             lookup_bias bias /* search bias */ ,
63159 +             coord_t * coord /* resulting coord */ )
63160 +{
63161 +       int left;
63162 +       int right;
63163 +       int found;
63164 +       int items;
63165 +
63166 +       item_header40 *lefth;
63167 +       item_header40 *righth;
63168 +
63169 +       item_plugin *iplug;
63170 +       item_header40 *bstop;
63171 +       item_header40 *ih;
63172 +       cmp_t order;
63173 +
63174 +       assert("nikita-583", node != NULL);
63175 +       assert("nikita-584", key != NULL);
63176 +       assert("nikita-585", coord != NULL);
63177 +       assert("nikita-2693", znode_is_any_locked(node));
63178 +       cassert(REISER4_SEQ_SEARCH_BREAK > 2);
63179 +
63180 +       trace_stamp(TRACE_NODES);
63181 +
63182 +       items = node_num_items(node);
63183 +       NODE_INCSTAT(node, calls);
63184 +       NODE_ADDSTAT(node, items, items);
63185 +
63186 +       node_check(node, REISER4_NODE_DKEYS);
63187 +
63188 +       if (unlikely(items == 0)) {
63189 +               coord_init_first_unit(coord, node);
63190 +               return NS_NOT_FOUND;
63191 +       }
63192 +
63193 +       /* binary search for item that can contain given key */
63194 +       left = 0;
63195 +       right = items - 1;
63196 +       coord->node = node;
63197 +       coord_clear_iplug(coord);
63198 +       found = 0;
63199 +
63200 +       lefth = node40_ih_at(node, left);
63201 +       righth = node40_ih_at(node, right);
63202 +
63203 +       /* It is known that for small arrays sequential search is on average
63204 +          more efficient than binary. This is because sequential search is
63205 +          coded as tight loop that can be better optimized by compilers and
63206 +          for small array size gain from this optimization makes sequential
63207 +          search the winner. Another, maybe more important, reason for this,
63208 +          is that sequential array is more CPU cache friendly, whereas binary
63209 +          search effectively destroys CPU caching.
63210 +
63211 +          Critical here is the notion of "smallness". Reasonable value of
63212 +          REISER4_SEQ_SEARCH_BREAK can be found by playing with code in
63213 +          fs/reiser4/ulevel/ulevel.c:test_search().
63214 +
63215 +          Don't try to further optimize sequential search by scanning from
63216 +          right to left in attempt to use more efficient loop termination
63217 +          condition (comparison with 0). This doesn't work.
63218 +
63219 +       */
63220 +
63221 +       while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
63222 +               int median;
63223 +               item_header40 *medianh;
63224 +
63225 +               median = (left + right) / 2;
63226 +               medianh = node40_ih_at(node, median);
63227 +
63228 +               assert("nikita-1084", median >= 0);
63229 +               assert("nikita-1085", median < items);
63230 +               NODE_INCSTAT(node, binary);
63231 +               switch (keycmp(key, &medianh->key)) {
63232 +               case LESS_THAN:
63233 +                       right = median;
63234 +                       righth = medianh;
63235 +                       break;
63236 +               default:
63237 +                       wrong_return_value("nikita-586", "keycmp");     /* no break: falls through to GREATER_THAN */
63238 +               case GREATER_THAN:
63239 +                       left = median;
63240 +                       lefth = medianh;
63241 +                       break;
63242 +               case EQUAL_TO:
63243 +                       do {    /* walk towards position 0 while keys stay equal to @key */
63244 +                               -- median;
63245 +                               /* headers are ordered from right to left */
63246 +                               ++ medianh;
63247 +                       } while (median >= 0 && keyeq(key, &medianh->key));
63248 +                       right = left = median + 1;      /* lowest position holding an equal key; also ends the while loop */
63249 +                       ih = lefth = righth = medianh - 1;
63250 +                       found = 1;
63251 +                       break;
63252 +               }
63253 +       }
63254 +       /* sequential scan. Item headers, and, therefore, keys are stored at
63255 +          the rightmost part of a node from right to left. We are trying to
63256 +          access memory from left to right, and hence, scan in _descending_
63257 +          order of item numbers.
63258 +       */
63259 +       if (!found) {
63260 +               for (left = right, ih = righth; left >= 0; ++ ih, -- left) {
63261 +                       cmp_t comparison;
63262 +
63263 +                       NODE_INCSTAT(node, seq);
63264 +                       prefetchkey(&(ih + 1)->key);
63265 +                       comparison = keycmp(&ih->key, key);
63266 +                       if (comparison == GREATER_THAN)
63267 +                               continue;
63268 +                       if (comparison == EQUAL_TO) {
63269 +                               found = 1;
63270 +                               do {    /* skip back over the run of equal keys ... */
63271 +                                       -- left;
63272 +                                       ++ ih;
63273 +                               } while (left >= 0 && keyeq(&ih->key, key));
63274 +                               ++ left;        /* ... and step forward onto its first member */
63275 +                               -- ih;
63276 +                       } else {
63277 +                               assert("nikita-1256", comparison == LESS_THAN);
63278 +                       }
63279 +                       break;
63280 +               }
63281 +               if (unlikely(left < 0)) /* every scanned key was greater than @key */
63282 +                       left = 0;
63283 +       }
63284 +
63285 +       assert("nikita-3212", right >= left);
63286 +       assert("nikita-3214",
63287 +              equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
63288 +
63289 +#if REISER4_STATS
63290 +       NODE_ADDSTAT(node, found, !!found);
63291 +       NODE_ADDSTAT(node, pos, left);
63292 +       if (items > 1)
63293 +               NODE_ADDSTAT(node, posrelative, (left << 10) / (items - 1));
63294 +       else
63295 +               NODE_ADDSTAT(node, posrelative, 1 << 10);
63296 +       if (left == node->last_lookup_pos)
63297 +               NODE_INCSTAT(node, samepos);
63298 +       if (left == node->last_lookup_pos + 1)
63299 +               NODE_INCSTAT(node, nextpos);
63300 +       node->last_lookup_pos = left;
63301 +#endif
63302 +
63303 +       coord_set_item_pos(coord, left);
63304 +       coord->unit_pos = 0;
63305 +       coord->between = AT_UNIT;
63306 +
63307 +       /* key < leftmost key in a node or node is corrupted and keys
63308 +          are not sorted  */
63309 +       bstop = node40_ih_at(node, (unsigned) left);
63310 +       order = keycmp(&bstop->key, key);
63311 +       if (unlikely(order == GREATER_THAN)) {
63312 +               if (unlikely(left != 0)) {
63313 +                       /* screw up */
63314 +                       warning("nikita-587", "Key less than %i key in a node", left);
63315 +                       print_key("key", key);
63316 +                       print_key("min", &bstop->key);
63317 +                       print_znode("node", node);
63318 +                       print_coord_content("coord", coord);
63319 +                       return RETERR(-EIO);
63320 +               } else {
63321 +                       coord->between = BEFORE_UNIT;
63322 +                       return NS_NOT_FOUND;
63323 +               }
63324 +       }
63325 +       /* left <= key, ok */
63326 +       iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id);
63327 +
63328 +       if (unlikely(iplug == NULL)) {
63329 +               warning("nikita-588", "Unknown plugin %i", d16tocpu(&bstop->plugin_id));
63330 +               print_key("key", key);
63331 +               print_znode("node", node);
63332 +               print_coord_content("coord", coord);
63333 +               return RETERR(-EIO);
63334 +       }
63335 +
63336 +       coord_set_iplug(coord, iplug);
63337 +
63338 +       /* if exact key from item header was found by binary search, no
63339 +          further checks are necessary. */
63340 +       if (found) {
63341 +               assert("nikita-1259", order == EQUAL_TO);
63342 +               return NS_FOUND;
63343 +       }
63344 +       if (iplug->b.max_key_inside != NULL) {
63345 +               reiser4_key max_item_key;
63346 +
63347 +               /* key > max_item_key --- outside of an item */
63348 +               if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
63349 +                       coord->unit_pos = 0;
63350 +                       coord->between = AFTER_ITEM;
63351 +                       /* FIXME-VS: key we are looking for does not fit into
63352 +                          found item. Return NS_NOT_FOUND then. Without that
63353 +                          the following case does not work: there is extent of
63354 +                          file 10000, 10001. File 10000, 10002 has been just
63355 +                          created. When writing to position 0 in that file -
63356 +                          traverse_tree will stop here on twig level. When we
63357 +                          want it to go down to leaf level
63358 +                       */
63359 +                       return NS_NOT_FOUND;
63360 +               }
63361 +       }
63362 +
63363 +       if (iplug->b.lookup != NULL) {
63364 +               return iplug->b.lookup(key, bias, coord);       /* item plugin refines the position inside the item */
63365 +       } else {
63366 +               assert("nikita-1260", order == LESS_THAN);
63367 +               coord->between = AFTER_UNIT;
63368 +               return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;  /* inexact match counts only when @bias allows it */
63369 +       }
63370 +}
63371 +
63372 +#undef NODE_ADDSTAT
63373 +#undef NODE_INCSTAT
63374 +
63375 +/* plugin->u.node.estimate: free space left for an item body once room for one
63376 +   item_header40 is set aside; 0 when not even the header fits. See plugin/node/node.h */
63377 +reiser4_internal size_t estimate_node40(znode * node)
63378 +{
63379 +       size_t free_space;
63380 +
63381 +       assert("nikita-597", node != NULL);
63382 +
63383 +       free_space = free_space_node40(node);
63384 +       /* size_t is unsigned: compare before subtracting, the difference can never test negative */
63385 +       return (free_space > sizeof(item_header40)) ? free_space - sizeof(item_header40) : 0;
63386 +}
63387 +
63388 +/* plugin->u.node.check
63389 +   look for description of this method in plugin/node/node.h */
63390 +reiser4_internal int
63391 +check_node40(const znode * node /* node to check */ ,
63392 +            __u32 flags /* check flags */ ,
63393 +            const char **error /* where to store error message */ )
63394 +{
63395 +       int nr_items;
63396 +       int i;
63397 +       reiser4_key prev;
63398 +       unsigned old_offset;
63399 +       tree_level level;
63400 +       coord_t coord;
63401 +
63402 +       assert("nikita-580", node != NULL);
63403 +       assert("nikita-581", error != NULL);
63404 +       assert("nikita-2948", znode_is_loaded(node));
63405 +       trace_stamp(TRACE_NODES);
63406 +
63407 +
63408 +       if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
63409 +               return 0;
63410 +
63411 +       assert("nikita-582", zdata(node) != NULL);
63412 +
63413 +       nr_items = node40_num_of_items_internal(node);
63414 +       if (nr_items < 0) {
63415 +               *error = "Negative number of items";
63416 +               return -1;
63417 +       }
63418 +
63419 +       if (flags & REISER4_NODE_DKEYS)
63420 +               prev = *znode_get_ld_key((znode *)node);
63421 +       else
63422 +               prev = *min_key();
63423 +
63424 +       old_offset = 0;
63425 +       coord_init_zero(&coord);
63426 +       coord.node = (znode *) node;
63427 +       coord.unit_pos = 0;
63428 +       coord.between = AT_UNIT;
63429 +       level = znode_get_level(node);
63430 +       for (i = 0; i < nr_items; i++) {
63431 +               item_header40 *ih;
63432 +               reiser4_key unit_key;
63433 +               unsigned j;
63434 +
63435 +               ih = node40_ih_at(node, (unsigned) i);
63436 +               coord_set_item_pos(&coord, i);
63437 +               if ((ih40_get_offset(ih) >=
63438 +                    znode_size(node) - nr_items * sizeof (item_header40)) ||
63439 +                   (ih40_get_offset(ih) < sizeof (node40_header))) {
63440 +                       *error = "Offset is out of bounds";
63441 +                       return -1;
63442 +               }
63443 +               if (ih40_get_offset(ih) <= old_offset) {
63444 +                       *error = "Offsets are in wrong order";
63445 +                       return -1;
63446 +               }
63447 +               if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
63448 +                       *error = "Wrong offset of first item";
63449 +                       return -1;
63450 +               }
63451 +               old_offset = ih40_get_offset(ih);
63452 +
63453 +               if (keygt(&prev, &ih->key)) {
63454 +                       *error = "Keys are in wrong order";
63455 +                       return -1;
63456 +               }
63457 +               if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
63458 +                       *error = "Wrong key of first unit";
63459 +                       return -1;
63460 +               }
63461 +               prev = ih->key;
63462 +               for (j = 0; j < coord_num_units(&coord); ++j) {
63463 +                       coord.unit_pos = j;
63464 +                       unit_key_by_coord(&coord, &unit_key);
63465 +                       if (keygt(&prev, &unit_key)) {
63466 +                               *error = "Unit keys are in wrong order";
63467 +                               return -1;
63468 +                       }
63469 +                       prev = unit_key;
63470 +               }
63471 +               coord.unit_pos = 0;
63472 +               if (level != TWIG_LEVEL &&
63473 +                   item_is_extent(&coord)) {
63474 +                       *error = "extent on the wrong level";
63475 +                       return -1;
63476 +               }
63477 +               if (level == LEAF_LEVEL &&
63478 +                   item_is_internal(&coord)) {
63479 +                       *error = "internal item on the wrong level";
63480 +                       return -1;
63481 +               }
63482 +               if (level != LEAF_LEVEL &&
63483 +                   !item_is_internal(&coord) && !item_is_extent(&coord)) {
63484 +                       *error = "wrong item on the internal level";
63485 +                       return -1;
63486 +               }
63487 +               if (level > TWIG_LEVEL &&
63488 +                   !item_is_internal(&coord)) {
63489 +                       *error = "non-internal item on the internal level";
63490 +                       return -1;
63491 +               }
63492 +#if REISER4_DEBUG
63493 +               if (item_plugin_by_coord(&coord)->b.check && item_plugin_by_coord(&coord)->b.check(&coord, error))
63494 +                       return -1;
63495 +#endif
63496 +               if (i) {
63497 +                       coord_t prev_coord;
63498 +                       /* two neighboring items can not be mergeable */
63499 +                       coord_dup(&prev_coord, &coord);
63500 +                       coord_prev_item(&prev_coord);
63501 +                       if (are_items_mergeable(&prev_coord, &coord)) {
63502 +                               *error = "mergeable items in one node";
63503 +                               return -1;
63504 +                       }
63505 +
63506 +               }
63507 +       }
63508 +
63509 +       RLOCK_DK(current_tree);
63510 +       if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
63511 +               coord_t coord;
63512 +               item_plugin *iplug;
63513 +
63514 +               coord_init_last_unit(&coord, node);
63515 +               iplug = item_plugin_by_coord(&coord);
63516 +               if ((item_is_extent(&coord) || item_is_tail(&coord)) &&
63517 +                   iplug->s.file.append_key != NULL) {
63518 +                       reiser4_key mkey;
63519 +
63520 +                       iplug->s.file.append_key(&coord, &mkey);
63521 +                       set_key_offset(&mkey, get_key_offset(&mkey) - 1);
63522 +                       if (keygt(&mkey, znode_get_rd_key((znode *) node))) {
63523 +                               *error = "key of rightmost item is too large";
63524 +                               return -1;
63525 +                       }
63526 +               }
63527 +       }
63528 +       if (flags & REISER4_NODE_DKEYS) {
63529 +               RLOCK_TREE(current_tree);
63530 +
63531 +               flags |= REISER4_NODE_TREE_STABLE;
63532 +
63533 +               if (keygt(&prev, znode_get_rd_key((znode *)node))) {
63534 +                       reiser4_stat_inc(tree.rd_key_skew);
63535 +                       if (flags & REISER4_NODE_TREE_STABLE) {
63536 +                               *error = "Last key is greater than rdkey";
63537 +                               return -1;
63538 +                       }
63539 +               }
63540 +               if (keygt(znode_get_ld_key((znode *)node), znode_get_rd_key((znode *)node))) {
63541 +                       *error = "ldkey is greater than rdkey";
63542 +                       return -1;
63543 +               }
63544 +               if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
63545 +                   (node->left != NULL) &&
63546 +                   !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
63547 +                   ergo(flags & REISER4_NODE_TREE_STABLE,
63548 +                        !keyeq(znode_get_rd_key(node->left), znode_get_ld_key((znode *)node))) &&
63549 +                   ergo(!(flags & REISER4_NODE_TREE_STABLE), keygt(znode_get_rd_key(node->left), znode_get_ld_key((znode *)node)))) {
63550 +                       *error = "left rdkey or ldkey is wrong";
63551 +                       return -1;
63552 +               }
63553 +               if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
63554 +                   (node->right != NULL) &&
63555 +                   !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
63556 +                   ergo(flags & REISER4_NODE_TREE_STABLE,
63557 +                        !keyeq(znode_get_rd_key((znode *)node), znode_get_ld_key(node->right))) &&
63558 +                   ergo(!(flags & REISER4_NODE_TREE_STABLE), keygt(znode_get_rd_key((znode *)node), znode_get_ld_key(node->right)))) {
63559 +                       *error = "rdkey or right ldkey is wrong";
63560 +                       return -1;
63561 +               }
63562 +
63563 +               RUNLOCK_TREE(current_tree);
63564 +       }
63565 +       RUNLOCK_DK(current_tree);
63566 +
63567 +       return 0;
63568 +}
63569 +
63570 +/* plugin->u.node.parse: cross-check the on-disk node40 header against the znode.
63571 +   look for description of this method in plugin/node/node.h */
63572 +reiser4_internal int
63573 +parse_node40(znode * node /* node to parse */ )
63574 +{
63575 +       node40_header *nh = node40_node_header((znode *) node);
63576 +       int result = -EIO;      /* stays -EIO unless both header checks pass */
63577 +
63578 +       if (unlikely(((__u8) znode_get_level(node)) != nh40_get_level(nh))) {
63579 +               /* level stored in the header disagrees with the level of the znode */
63580 +               warning("nikita-494", "Wrong level found in node: %i != %i",
63581 +                       znode_get_level(node), nh40_get_level(nh));
63582 +       } else if (unlikely(nh40_get_magic(nh) != REISER4_NODE_MAGIC)) {
63583 +               /* header does not carry the node40 magic */
63584 +               warning("nikita-495",
63585 +                       "Wrong magic in tree node: want %x, got %x",
63586 +                       REISER4_NODE_MAGIC, nh40_get_magic(nh));
63587 +       } else {
63588 +               /* header accepted: remember the item count in the znode */
63589 +               node->nr_items = node40_num_of_items_internal(node);
63590 +               result = 0;
63591 +       }
63592 +       /* (dumping the node with print_znode() on failure remains disabled) */
63593 +       return RETERR(result);
63594 +}
63595 +
63596 +/* plugin->u.node.init: write a fresh node40 header (no items) into @node; always returns 0.
63597 +   look for description of this method in plugin/node/node.h */
63598 +reiser4_internal int
63599 +init_node40(znode * node /* node to initialise */ )
63600 +{
63601 +       node40_header *header;
63602 +
63603 +       assert("nikita-570", node != NULL);
63604 +       assert("nikita-572", zdata(node) != NULL);
63605 +       /* clear either the whole node data or just the header, depending on REISER4_ZERO_NEW_NODE */
63606 +       header = node40_node_header(node);
63607 +       if (REISER4_ZERO_NEW_NODE)
63608 +               xmemset(zdata(node), 0, (unsigned int) znode_size(node));
63609 +       else
63610 +               xmemset(header, 0, sizeof (node40_header));
63611 +       nh40_set_free_space(header, znode_size(node) - sizeof (node40_header));
63612 +       nh40_set_free_space_start(header, sizeof (node40_header));
63613 +       /* sane hypothesis: 0 in CPU format is 0 in disk format, */
63614 +       /* so the item count needs no explicit store - items: 0 */
63615 +       save_plugin_id(node_plugin_to_plugin(node->nplug), &header->common_header.plugin_id);
63616 +       nh40_set_level(header, znode_get_level(node));
63617 +       nh40_set_magic(header, REISER4_NODE_MAGIC);
63618 +       node->nr_items = 0;
63619 +       nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb()));
63620 +
63621 +       /* flags: 0 (nothing written after the memset above) */
63622 +       return 0;
63623 +}
63624 +
63625 +reiser4_internal int
63626 +guess_node40(const znode * node /* node to guess plugin of */ )
63627 +{
63628 +       node40_header *hdr;
63629 +
63630 +       assert("nikita-1058", node != NULL);
63631 +       hdr = node40_node_header(node);
63632 +       /* non-zero only if the magic AND the stored node plugin id say node40; plugin id is looked at only after the magic matched */
63633 +       if (nh40_get_magic(hdr) != REISER4_NODE_MAGIC)
63634 +               return 0;
63635 +       /* NOTE(review): plugin_by_disk_id() result is dereferenced unchecked - confirm it cannot be NULL for a bad on-disk id */
63636 +       return plugin_by_disk_id(znode_get_tree(node), REISER4_NODE_PLUGIN_TYPE, &hdr->common_header.plugin_id)->h.id == NODE40_ID;
63637 +}
63637 +
63638 +#if REISER4_DEBUG_OUTPUT
63639 +reiser4_internal void
63640 +print_node40(const char *prefix /* printed first on the line */ , const znode * node /* node to print */ ,
63641 +            __u32 flags UNUSED_ARG /* print flags, not consulted here */ )
63642 +{
63643 +       node40_header *header;
63644 +       /* debugging output: a single printk line with block number and node40 header fields */
63645 +       header = node40_node_header(node);
63646 +       printk("%s: BLOCKNR %Lu FREE_SPACE %u, LEVEL %u, ITEM_NUMBER %u\n",
63647 +              prefix,
63648 +              *znode_get_block(node), nh40_get_free_space(header), nh40_get_level(header), nh40_get_num_items(header));
63649 +}
63650 +#endif
63651 +
63652 +/* plugin->u.node.change_item_size: change the body size of the item at @coord by @by bytes.
63653 +   look for description of this method in plugin/node/node.h */
63654 +reiser4_internal void
63655 +change_item_size_node40(coord_t * coord, int by)
63656 +{
63657 +       node40_header *nh;
63658 +       item_header40 *ih;
63659 +       char *body_end;
63660 +       int item_length;
63661 +       unsigned pos;
63662 +
63663 +       node_check(coord->node, 0);
63664 +
63665 +       /* @coord has to address an existing item */
63666 +       assert("vs-210", coord_is_existing_item(coord));
63667 +
63668 +       nh = node40_node_header(coord->node);
63669 +       ih = node40_ih_at_coord(coord);
63670 +
63671 +       /* first byte past the body of the item being resized */
63672 +       item_length = length_by_coord_node40(coord);
63673 +       body_end = item_by_coord_node40(coord) + item_length;
63674 +
63675 +       /* slide everything between that byte and the start of free space by @by */
63676 +       xmemmove(body_end + by, body_end,
63677 +                nh40_get_free_space_start(nh) - (ih40_get_offset(ih) + item_length));
63678 +
63679 +       /* headers of the items to the right must follow their bodies */
63680 +       for (pos = coord->item_pos + 1; pos < nh40_get_num_items(nh); pos++) {
63681 +               ih = node40_ih_at(coord->node, pos);
63682 +               ih40_set_offset(ih, ih40_get_offset(ih) + by);
63683 +       }
63684 +       /* account for the change in the node header */
63685 +       nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
63686 +       nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
63687 +}
63688 +
63689 +static int
63690 +should_notify_parent(const znode * node)
63691 +{
63692 +       /* non-zero unless @node's block is the tree's root_block. FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */
63693 +       return !disk_addr_eq(znode_get_block(node), &znode_get_tree(node)->root_block);
63694 +}
63695 +
63696 +/* plugin->u.node.create_item: insert a new item with @key, described by @data, at @target; returns 0.
63697 +   look for description of this method in plugin/node/node.h */
63698 +reiser4_internal int
63699 +create_item_node40(coord_t * target, const reiser4_key * key, reiser4_item_data * data, carry_plugin_info * info)
63700 +{
63701 +       node40_header *nh;
63702 +       item_header40 *ih;
63703 +       unsigned offset;
63704 +       unsigned i;
63705 +
63706 +       node_check(target->node, 0);
63707 +
63708 +       nh = node40_node_header(target->node);
63709 +
63710 +       assert("vs-212", coord_is_between_items(target));
63711 +       /* node must have enough free space */
63712 +       assert("vs-254", free_space_node40(target->node) >= data->length + sizeof(item_header40));
63713 +       assert("vs-1410", data->length >= 0);
63714 +
63715 +       if (coord_set_to_right(target))
63716 +               /* there are no items to the right of @target, so the new item
63717 +                  will be inserted after the last one */
63718 +               coord_set_item_pos(target, nh40_get_num_items(nh));
63719 +
63720 +       if (target->item_pos < nh40_get_num_items(nh)) {
63721 +               /* there are items to be moved to prepare space for new
63722 +                  item */
63723 +               ih = node40_ih_at_coord(target);
63724 +               /* new item will start at this offset */
63725 +               offset = ih40_get_offset(ih);
63726 +
63727 +               xmemmove(zdata(target->node) + offset + data->length,
63728 +                        zdata(target->node) + offset, nh40_get_free_space_start(nh) - offset);
63729 +               /* update headers of moved items */
63730 +               for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
63731 +                       ih = node40_ih_at(target->node, i);
63732 +                       ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
63733 +               }
63734 +
63735 +               /* @ih now points at the header of the last item; shift the moved headers down by one slot */
63736 +               xmemmove(ih - 1, ih, sizeof (item_header40) * (nh40_get_num_items(nh) - target->item_pos));
63737 +       } else {
63738 +               /* new item will start at this offset */
63739 +               offset = nh40_get_free_space_start(nh);
63740 +       }
63741 +
63742 +       /* make item header for the new item */
63743 +       ih = node40_ih_at_coord(target);
63744 +       xmemcpy(&ih->key, key, sizeof (reiser4_key));
63745 +       ih40_set_offset(ih, offset);
63746 +       save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
63747 +
63748 +       /* update node header */
63749 +       nh40_set_free_space(nh, nh40_get_free_space(nh) - data->length - sizeof (item_header40));
63750 +       nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + data->length);
63751 +       node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
63752 +
63753 +       /* FIXME: check how does create_item work when between is set to BEFORE_UNIT */
63754 +       target->unit_pos = 0;
63755 +       target->between = AT_UNIT;
63756 +       coord_clear_iplug(target);
63757 +
63758 +       /* initialise item */
63759 +       if (data->iplug->b.init != NULL) {
63760 +               data->iplug->b.init(target, NULL, data);
63761 +       }
63762 +       /* copy item body: the plugin's paste method if it has one, a plain copy of data->data otherwise */
63763 +       if (data->iplug->b.paste != NULL) {
63764 +               data->iplug->b.paste(target, data, info);
63765 +       } else if (data->data != NULL) {
63766 +               if (data->user) {
63767 +                       /* AUDIT: should we not check that the pointer from
63768 +                          userspace was valid and the data bytes were
63769 +                          available? The result of __copy_from_user() is
63770 +                          ignored below, so -EFAULT can not be reported. */
63771 +                       assert("nikita-3038", schedulable());
63772 +                       /* copy data from user space */
63773 +                       __copy_from_user(zdata(target->node) + offset, data->data, (unsigned) data->length);
63774 +               } else
63775 +                       /* copy from kernel space */
63776 +                       xmemcpy(zdata(target->node) + offset, data->data, (unsigned) data->length);
63777 +       }
63778 +
63779 +       if (target->item_pos == 0) {
63780 +               /* left delimiting key has to be updated */
63781 +               prepare_for_update(NULL, target->node, info);
63782 +       }
63783 +
63784 +       if (item_plugin_by_coord(target)->b.create_hook != NULL) {
63785 +               item_plugin_by_coord(target)->b.create_hook(target, data->arg);
63786 +       }
63787 +
63788 +       node_check(target->node, 0);
63789 +       return 0;
63790 +}
63791 +
63792 +/* plugin->u.node.update_item_key: overwrite the key kept in the header of the item at @target.
63793 +   look for description of this method in plugin/node/node.h */
63794 +reiser4_internal void
63795 +update_item_key_node40(coord_t * target, const reiser4_key * key, carry_plugin_info * info)
63796 +{
63797 +       item_header40 *hdr = node40_ih_at_coord(target);
63798 +
63799 +       xmemcpy(&hdr->key, key, sizeof (reiser4_key));
63800 +
63801 +       /* prepare_for_update() is only called when the first item of the node was touched */
63802 +       if (target->item_pos != 0)
63803 +               return;
63804 +       prepare_for_update(NULL, target->node, info);
63805 +}
63806 +
63807 +/* these bits encode the cut mode */
63808 +#define CMODE_TAIL 1
63809 +#define CMODE_WHOLE 2
63810 +#define CMODE_HEAD 4
63811 +
63812 +struct cut40_info {
63813 +       int mode; /* bitwise OR of CMODE_TAIL, CMODE_WHOLE, CMODE_HEAD */
63814 +       pos_in_node_t tail_removed; /* position of the item which gets its tail removed */
63815 +       pos_in_node_t first_removed; /* position of the leftmost item among the items removed completely */
63816 +       pos_in_node_t removed_count; /* number of items removed completely */
63817 +       pos_in_node_t head_removed; /* position of the item which gets its head removed */
63818 +       /* the fields below are consumed by compact(); init_cinfo() sets every position to MAX_POS_IN_NODE */
63819 +       pos_in_node_t freed_space_start; /* offset in node data where the freed gap begins */
63820 +       pos_in_node_t freed_space_end; /* offset in node data where the freed gap ends */
63821 +       pos_in_node_t first_moved; /* position of the first item whose header offset is rewritten */
63822 +       pos_in_node_t head_removed_location; /* new body offset of the head_removed item, if set */
63823 +};
63824 +
63825 +/* put @cinfo into its "nothing decided yet" state */
63826 +static void
63827 +init_cinfo(struct cut40_info *cinfo)
63828 +{
63829 +       /* no CMODE_* bit is set */
63830 +       cinfo->mode = 0;
63831 +
63832 +       /* every position/offset field starts out as MAX_POS_IN_NODE, i.e. "not set" */
63833 +       cinfo->tail_removed = cinfo->first_removed = MAX_POS_IN_NODE;
63834 +       cinfo->removed_count = cinfo->head_removed = MAX_POS_IN_NODE;
63835 +       cinfo->freed_space_start = cinfo->freed_space_end = MAX_POS_IN_NODE;
63836 +       cinfo->first_moved = cinfo->head_removed_location = MAX_POS_IN_NODE;
63837 +}
63838 +
63839 +/* complete cut_node40/kill_node40 by closing the gap described by @cinfo and fixing up the headers */
63840 +static void
63841 +compact(znode *node, struct cut40_info *cinfo)
63842 +{
63843 +       node40_header *nh;
63844 +       item_header40 *ih;
63845 +       pos_in_node_t freed;
63846 +       pos_in_node_t pos, nr_items;
63847 +
63848 +       assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
63849 +                          cinfo->freed_space_end != MAX_POS_IN_NODE &&
63850 +                          cinfo->first_moved != MAX_POS_IN_NODE));
63851 +       assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
63852 +
63853 +       nh = node40_node_header(node);
63854 +       nr_items = nh40_get_num_items(nh);
63855 +
63856 +       /* remove the gap made by the removal: bodies after the gap slide down onto its start */
63857 +       xmemmove(zdata(node) + cinfo->freed_space_start, zdata(node) + cinfo->freed_space_end,
63858 +                nh40_get_free_space_start(nh) - cinfo->freed_space_end);
63859 +
63860 +       /* update item headers of moved items - change their locations */
63861 +       pos = cinfo->first_moved;
63862 +       ih = node40_ih_at(node, pos);
63863 +       if (cinfo->head_removed_location != MAX_POS_IN_NODE) {
63864 +               assert("vs-1580", pos == cinfo->head_removed);
63865 +               ih40_set_offset(ih, cinfo->head_removed_location);
63866 +               pos ++;
63867 +               ih --;
63868 +       }
63869 +       /* header of item pos + 1 sits right below header of item pos, hence ih -- (checked by vs-1581) */
63870 +       freed = cinfo->freed_space_end - cinfo->freed_space_start;
63871 +       for (; pos < nr_items; pos ++, ih --) {
63872 +               assert("vs-1581", ih == node40_ih_at(node, pos));
63873 +               ih40_set_offset(ih, ih40_get_offset(ih) - freed);
63874 +       }
63875 +
63876 +       /* free space now starts @freed bytes earlier */
63877 +       nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
63878 +
63879 +       if (cinfo->removed_count != MAX_POS_IN_NODE) {
63880 +               /* number of items changed. Remove item headers of those items */
63881 +               ih = node40_ih_at(node, nr_items - 1);
63882 +               xmemmove(ih + cinfo->removed_count, ih,
63883 +                        sizeof (item_header40) * (nr_items - cinfo->removed_count - cinfo->first_removed));
63884 +               freed += sizeof (item_header40) * cinfo->removed_count;
63885 +               node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
63886 +       }
63887 +
63888 +       /* total amount of free space increased (bodies plus any removed item headers) */
63889 +       nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
63890 +}
63891 +
63892 +/* cut @delta bytes off the end of the body of the item at @coord; always returns 0 */
63893 +reiser4_internal int
63894 +shrink_item_node40(coord_t *coord, int delta)
63895 +{
63896 +       node40_header *nh;
63897 +       item_header40 *ih;
63898 +       pos_in_node_t pos;
63899 +       pos_in_node_t nr_items;
63900 +       char  *end;
63901 +       znode *node;
63902 +
63903 +       assert("nikita-3487", coord != NULL);
63904 +       assert("nikita-3488", delta >= 0);
63905 +
63906 +       node = coord->node;
63907 +       node_check(node, 0);
63908 +       nh = node40_node_header(node);
63909 +       nr_items = nh40_get_num_items(nh);
63910 +
63911 +       ih = node40_ih_at_coord(coord);
63912 +       assert("nikita-3489", delta <= length_by_coord_node40(coord));
63913 +       end = zdata(node) + ih40_get_offset(ih) + length_by_coord_node40(coord);
63914 +
63915 +       /* remove gap made up by removal: move only the bodies lying between @end and free space start */
63916 +       xmemmove(end - delta, end,
63917 +                nh40_get_free_space_start(nh) - (ih40_get_offset(ih) + length_by_coord_node40(coord)));
63918 +
63919 +       /* update item headers of moved items - change their locations */
63920 +       pos = coord->item_pos + 1;
63921 +       ih = node40_ih_at(node, pos);
63922 +       for (; pos < nr_items; pos ++, ih --) {
63923 +               assert("nikita-3490", ih == node40_ih_at(node, pos));
63924 +               ih40_set_offset(ih, ih40_get_offset(ih) - delta);
63925 +       }
63926 +
63927 +       /* free space start moved to left */
63928 +       nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
63929 +       /* total amount of free space increased */
63930 +       nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
63931 +       /* This method does _not_ change the number of items. Hence, it cannot
63932 +        * make the node empty. It does not remove items either, which means
63933 +        * that no keys have to be updated. */
63934 +       return 0;
63935 +}
63936 +
63937 +
63938 +/* this is used by cut_node40 and kill_node40. It analyses input parameters and fills @cinfo with the cut mode. There
63939 +   are 2 types of cut. First is when units are removed from the middle of a single item: the function returns 1. All
63940 +   the rest fits into the second case: 0 or 1 item getting its tail cut, 0 or more items removed completely and 0 or
63941 +   1 item getting its head cut. Function returns 0 in this case */
63942 +static int
63943 +parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params)
63944 +{
63945 +       reiser4_key left_key, right_key;
63946 +       reiser4_key min_from_key, max_to_key;
63947 +       const reiser4_key *from_key, *to_key;
63948 +
63949 +       init_cinfo(cinfo);
63950 +
63951 +       /* calculate minimal key stored in first item of items to be cut (params->from) */
63952 +       item_key_by_coord(params->from, &min_from_key);
63953 +       /* and max key stored in last item of items to be cut (params->to) */
63954 +       max_item_key_by_coord(params->to, &max_to_key);
63955 +
63956 +       /* if cut key range is not defined in input parameters - define it using cut coord range */
63957 +       if (params->from_key == NULL) {
63958 +               assert("vs-1513", params->to_key == NULL);
63959 +               unit_key_by_coord(params->from, &left_key);
63960 +               from_key = &left_key;
63961 +               max_unit_key_by_coord(params->to, &right_key);
63962 +               to_key = &right_key;
63963 +       } else {
63964 +               from_key = params->from_key;
63965 +               to_key = params->to_key;
63966 +       }
63967 +
63968 +       if (params->from->item_pos == params->to->item_pos) {
63969 +               if (keylt(&min_from_key, from_key) && keylt(to_key, &max_to_key))
63970 +                       return 1;
63971 +
63972 +               if (keygt(from_key, &min_from_key)) {
63973 +                       /* tail of item is to be cut */
63974 +                       cinfo->tail_removed = params->from->item_pos;
63975 +                       cinfo->mode |= CMODE_TAIL;
63976 +               } else if (keylt(to_key, &max_to_key)) {
63977 +                       /* head of item is to be cut */
63978 +                       cinfo->head_removed = params->from->item_pos;
63979 +                       cinfo->mode |= CMODE_HEAD;
63980 +               } else {
63981 +                       /* item is removed completely */
63982 +                       cinfo->first_removed = params->from->item_pos;
63983 +                       cinfo->removed_count = 1;
63984 +                       cinfo->mode |= CMODE_WHOLE;
63985 +               }
63986 +       } else {
63987 +               cinfo->first_removed = params->from->item_pos + 1;
63988 +               cinfo->removed_count = params->to->item_pos - params->from->item_pos - 1;
63989 +
63990 +               if (keygt(from_key, &min_from_key)) {
63991 +                       /* first item is not cut completely */
63992 +                       cinfo->tail_removed = params->from->item_pos;
63993 +                       cinfo->mode |= CMODE_TAIL;
63994 +               } else {
63995 +                       cinfo->first_removed --;
63996 +                       cinfo->removed_count ++;
63997 +               }
63998 +               if (keylt(to_key, &max_to_key)) {
63999 +                       /* last item is not cut completely */
64000 +                       cinfo->head_removed = params->to->item_pos;
64001 +                       cinfo->mode |= CMODE_HEAD;
64002 +               } else {
64003 +                       cinfo->removed_count ++;
64004 +               }
64005 +               if (cinfo->removed_count)
64006 +                       cinfo->mode |= CMODE_WHOLE;
64007 +       }
64008 +
64009 +       return 0;
64010 +}
64011 +
64012 +static void
64013 +call_kill_hooks(znode *node, pos_in_node_t from, pos_in_node_t count, carry_kill_data *kdata)
64014 +{
64015 +       coord_t coord;
64016 +       item_plugin *iplug;
64017 +       pos_in_node_t i;
64018 +
64019 +       /* for items @from .. @from + @count - 1 of @node: call kill_hook (when the plugin has one) over all units */
64020 +       coord.node = node;
64021 +       coord.unit_pos = 0;
64022 +       coord.between = AT_UNIT;
64023 +       for (i = 0; i < count; i ++) {
64024 +               coord_set_item_pos(&coord, from + i);
64025 +               coord.unit_pos = 0;
64026 +               coord.between = AT_UNIT;
64027 +               iplug = item_plugin_by_coord(&coord);
64028 +               if (iplug->b.kill_hook != NULL)
64029 +                       iplug->b.kill_hook(&coord, 0, coord_num_units(&coord), kdata);
64030 +       }
64031 +}
64032 +
64033 +/* kill part of the item at @coord (units @from..@to) through the item plugin's kill_units method */
64034 +static pos_in_node_t
64035 +kill_units(coord_t *coord, pos_in_node_t from, pos_in_node_t to, void *data, reiser4_key *smallest_removed,
64036 +          reiser4_key *new_first_key)
64037 +{
64038 +       struct carry_kill_data *kdata = data;
64039 +       item_plugin *plug = item_plugin_by_coord(coord);
64040 +
64041 +       /* the plugin is required to provide kill_units */
64042 +       assert("vs-1524", plug->b.kill_units);
64043 +
64044 +       /* the result is whatever the plugin method returns */
64045 +       return plug->b.kill_units(coord, from, to, kdata, smallest_removed, new_first_key);
64046 +}
64047 +
64048 +/* call item plugin to kill the tail of the item: units coord->unit_pos through the last one */
64049 +static pos_in_node_t
64050 +kill_tail(coord_t *coord, void *data, reiser4_key *smallest_removed)
64051 +{
64052 +       pos_in_node_t last;
64053 +
64054 +       last = coord_last_unit_pos(coord);
64055 +
64056 +       /* @data is handed on as is; no new_first_key is requested */
64057 +       return kill_units(coord, coord->unit_pos, last, data, smallest_removed, NULL);
64058 +}
64059 +
64060 +/* call item plugin to kill the head of the item: units 0 through coord->unit_pos, via kill_units() */
64061 +static pos_in_node_t
64062 +kill_head(coord_t *coord, void *data, reiser4_key *smallest_removed, reiser4_key *new_first_key)
64063 +{
64064 +       return kill_units(coord, 0, coord->unit_pos, data, smallest_removed, new_first_key);
64065 +}
64066 +
64067 +/* cut part of the item at @coord (units @from..@to) through the item plugin's cut_units method */
64068 +static pos_in_node_t
64069 +cut_units(coord_t *coord, pos_in_node_t from, pos_in_node_t to, void *data,
64070 +         reiser4_key *smallest_removed, reiser4_key *new_first_key)
64071 +{
64072 +       carry_cut_data *cdata = data;
64073 +       item_plugin *plug = item_plugin_by_coord(coord);
64074 +
64075 +       /* the plugin is required to provide cut_units */
64076 +       assert("vs-302", plug->b.cut_units);
64077 +
64078 +       return plug->b.cut_units(coord, from, to, cdata, smallest_removed, new_first_key);
64079 +}
64080 +
64081 +/* call item plugin to cut tail of item at @coord: units from coord->unit_pos to the last unit position. Unlike kill_tail, the last unit position is taken from cdata->params.from rather than from @coord; prepare_for_compact always passes params->from as @coord */
64082 +static pos_in_node_t
64083 +cut_tail(coord_t *coord, void *data, reiser4_key *smallest_removed)
64084 +{
64085 +       carry_cut_data *cdata;
64086 +       pos_in_node_t to;
64087 +
64088 +       cdata = data;
64089 +       to = coord_last_unit_pos(cdata->params.from);
64090 +       return cut_units(coord, coord->unit_pos, to, data, smallest_removed, 0);
64091 +}
64092 +
64093 +/* call item plugin to cut head of item at @coord: units from 0 to coord->unit_pos. @new_first_key is passed through to the plugin; prepare_for_compact afterwards installs it as the key of the item */
64094 +static pos_in_node_t
64095 +cut_head(coord_t *coord, void *data, reiser4_key *smallest_removed, reiser4_key *new_first_key)
64096 +{
64097 +       return cut_units(coord, 0, coord->unit_pos, data, smallest_removed, new_first_key);
64098 +}
64099 +
64100 +/* do the item-level part of a cut (@is_cut != 0) or of a kill (@is_cut == 0) of the range described by @params, and record in @cinfo the freed byte range and the first item to be moved (kill_node40/cut_node40 pass @cinfo on to compact()). @data is handed unchanged to the item plugin helpers; @info is not used here. This returns 1 if key of first item of the node changed, 0 - if it did not */
64101 +static int
64102 +prepare_for_compact(struct cut40_info *cinfo, const struct cut_kill_params *params, int is_cut,
64103 +                   void *data, carry_plugin_info *info)
64104 +{
64105 +       znode *node;
64106 +       item_header40 *ih;
64107 +       pos_in_node_t freed;
64108 +       pos_in_node_t item_pos;
64109 +       coord_t coord;
64110 +       reiser4_key new_first_key;
64111 +       pos_in_node_t (*kill_units_f)(coord_t *, pos_in_node_t, pos_in_node_t, void *, reiser4_key *, reiser4_key *);
64112 +       pos_in_node_t (*kill_tail_f)(coord_t *, void *, reiser4_key *);
64113 +       pos_in_node_t (*kill_head_f)(coord_t *, void *, reiser4_key *, reiser4_key *);
64114 +       int retval;
64115 +
64116 +       retval = 0;
64117 +
64118 +       node = params->from->node;
64119 +
64120 +       assert("vs-184", node == params->to->node);
64121 +       assert("vs-312", !node_is_empty(node));
64122 +       assert("vs-297", coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
64123 +       /* select the cut or the kill flavour of the helpers; both flavours have identical signatures */
64124 +       if (is_cut) {
64125 +               kill_units_f = cut_units;
64126 +               kill_tail_f = cut_tail;
64127 +               kill_head_f = cut_head;
64128 +       } else {
64129 +               kill_units_f = kill_units;
64130 +               kill_tail_f = kill_tail;
64131 +               kill_head_f = kill_head;
64132 +       }
64133 +
64134 +       if (parse_cut(cinfo, params) == 1) {
64135 +               /* cut from the middle of item */
64136 +               freed = kill_units_f(params->from, params->from->unit_pos, params->to->unit_pos, data, params->smallest_removed, NULL);
64137 +               /* the freed range is computed as the @freed bytes ending at (item offset + node40_item_length()) */
64138 +               item_pos = params->from->item_pos;
64139 +               ih = node40_ih_at(node, item_pos);
64140 +               cinfo->freed_space_start = ih40_get_offset(ih) + node40_item_length(node, item_pos) - freed;
64141 +               cinfo->freed_space_end = cinfo->freed_space_start + freed;
64142 +               cinfo->first_moved = item_pos + 1;
64143 +       } else {
64144 +               assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
64145 +                                  cinfo->first_removed != MAX_POS_IN_NODE ||
64146 +                                  cinfo->head_removed != MAX_POS_IN_NODE));
64147 +
64148 +               switch (cinfo->mode) {
64149 +               case CMODE_TAIL:
64150 +                       /* one item gets cut partially from its end */
64151 +                       assert("vs-1562", cinfo->tail_removed == params->from->item_pos);
64152 +
64153 +                       freed = kill_tail_f(params->from, data, params->smallest_removed);
64154 +
64155 +                       item_pos = cinfo->tail_removed;
64156 +                       ih = node40_ih_at(node, item_pos);
64157 +                       cinfo->freed_space_start = ih40_get_offset(ih) + node40_item_length(node, item_pos) - freed;
64158 +                       cinfo->freed_space_end = cinfo->freed_space_start + freed;
64159 +                       cinfo->first_moved = cinfo->tail_removed + 1;
64160 +                       break;
64161 +
64162 +               case CMODE_WHOLE:
64163 +                       /* one or more items get removed completely */
64164 +                       assert("vs-1563", cinfo->first_removed == params->from->item_pos);
64165 +                       assert("vs-1564", cinfo->removed_count > 0 && cinfo->removed_count != MAX_POS_IN_NODE);
64166 +
64167 +                       /* call kill hook for all items removed completely */
64168 +                       if (is_cut == 0)
64169 +                               call_kill_hooks(node, cinfo->first_removed, cinfo->removed_count, data);
64170 +
64171 +                       item_pos = cinfo->first_removed;
64172 +                       ih = node40_ih_at(node, item_pos);
64173 +
64174 +                       if (params->smallest_removed)
64175 +                               xmemcpy(params->smallest_removed, &ih->key, sizeof (reiser4_key));
64176 +
64177 +                       cinfo->freed_space_start = ih40_get_offset(ih);
64178 +                       /* step to the last removed item; the item header pointer moves down while item_pos moves up */
64179 +                       item_pos += (cinfo->removed_count - 1);
64180 +                       ih -= (cinfo->removed_count - 1);
64181 +                       cinfo->freed_space_end = ih40_get_offset(ih) + node40_item_length(node, item_pos);
64182 +                       cinfo->first_moved = item_pos + 1;
64183 +                       if (cinfo->first_removed == 0)
64184 +                               /* key of first item of the node changes */
64185 +                               retval = 1;
64186 +                       break;
64187 +
64188 +               case CMODE_HEAD:
64189 +                       /* one item gets cut partially from its head */
64190 +                       assert("vs-1565", cinfo->head_removed == params->from->item_pos);
64191 +
64192 +                       freed = kill_head_f(params->to, data, params->smallest_removed, &new_first_key);
64193 +
64194 +                       item_pos = cinfo->head_removed;
64195 +                       ih = node40_ih_at(node, item_pos);
64196 +                       cinfo->freed_space_start = ih40_get_offset(ih);
64197 +                       cinfo->freed_space_end = ih40_get_offset(ih) + freed;
64198 +                       cinfo->first_moved = cinfo->head_removed + 1;
64199 +
64200 +                       /* item head is removed, therefore, item key changed */
64201 +                       coord.node = node;
64202 +                       coord_set_item_pos(&coord, item_pos);
64203 +                       coord.unit_pos = 0;
64204 +                       coord.between = AT_UNIT;
64205 +                       update_item_key_node40(&coord, &new_first_key, 0);
64206 +                       if (item_pos == 0)
64207 +                               /* key of first item of the node changes */
64208 +                               retval = 1;
64209 +                       break;
64210 +
64211 +               case CMODE_TAIL | CMODE_WHOLE:
64212 +                       /* one item gets cut from its end and one or more items get removed completely */
64213 +                       assert("vs-1566", cinfo->tail_removed == params->from->item_pos);
64214 +                       assert("vs-1567", cinfo->first_removed == cinfo->tail_removed + 1);
64215 +                       assert("vs-1564", cinfo->removed_count > 0 && cinfo->removed_count != MAX_POS_IN_NODE);
64216 +
64217 +                       freed = kill_tail_f(params->from, data, params->smallest_removed);
64218 +
64219 +                       item_pos = cinfo->tail_removed;
64220 +                       ih = node40_ih_at(node, item_pos);
64221 +                       cinfo->freed_space_start = ih40_get_offset(ih) + node40_item_length(node, item_pos) - freed;
64222 +
64223 +                       /* call kill hook for all items removed completely */
64224 +                       if (is_cut == 0)
64225 +                               call_kill_hooks(node, cinfo->first_removed, cinfo->removed_count, data);
64226 +                       /* step from the item that lost its tail to the last completely removed item */
64227 +                       item_pos += cinfo->removed_count;
64228 +                       ih -= cinfo->removed_count;
64229 +                       cinfo->freed_space_end = ih40_get_offset(ih) + node40_item_length(node, item_pos);
64230 +                       cinfo->first_moved = item_pos + 1;
64231 +                       break;
64232 +
64233 +               case CMODE_WHOLE | CMODE_HEAD:
64234 +                       /* one or more items get removed completely and one item gets cut partially from its head */
64235 +                       assert("vs-1568", cinfo->first_removed == params->from->item_pos);
64236 +                       assert("vs-1564", cinfo->removed_count > 0 && cinfo->removed_count != MAX_POS_IN_NODE);
64237 +                       assert("vs-1569", cinfo->head_removed == cinfo->first_removed + cinfo->removed_count);
64238 +
64239 +                       /* call kill hook for all items removed completely */
64240 +                       if (is_cut == 0)
64241 +                               call_kill_hooks(node, cinfo->first_removed, cinfo->removed_count, data);
64242 +
64243 +                       item_pos = cinfo->first_removed;
64244 +                       ih = node40_ih_at(node, item_pos);
64245 +
64246 +                       if (params->smallest_removed)
64247 +                               xmemcpy(params->smallest_removed, &ih->key, sizeof (reiser4_key));
64248 +                       /* smallest removed key was already stored above, so the head helper is not asked for it */
64249 +                       freed = kill_head_f(params->to, data, 0, &new_first_key);
64250 +
64251 +                       cinfo->freed_space_start = ih40_get_offset(ih);
64252 +
64253 +                       ih = node40_ih_at(node, cinfo->head_removed);
64254 +                       /* this is the most complex case. Item which got head removed and items which are to be moved
64255 +                          intact change their location differently. */
64256 +                       cinfo->freed_space_end = ih40_get_offset(ih) + freed;
64257 +                       cinfo->first_moved = cinfo->head_removed;
64258 +                       cinfo->head_removed_location = cinfo->freed_space_start;
64259 +
64260 +                       /* item head is removed, therefore, item key changed */
64261 +                       coord.node = node;
64262 +                       coord_set_item_pos(&coord, cinfo->head_removed);
64263 +                       coord.unit_pos = 0;
64264 +                       coord.between = AT_UNIT;
64265 +                       update_item_key_node40(&coord, &new_first_key, 0);
64266 +
64267 +                       assert("vs-1579", cinfo->first_removed == 0);
64268 +                       /* key of first item of the node changes */
64269 +                       retval = 1;
64270 +                       break;
64271 +
64272 +               case CMODE_TAIL | CMODE_HEAD:
64273 +                       /* one item gets cut from its end and its neighbor gets cut from its head */
64274 +                       impossible("vs-1576", "this can not happen currently");
64275 +                       break;
64276 +
64277 +               case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
64278 +                       impossible("vs-1577", "this can not happen currently");
64279 +                       break;
64280 +               default:
64281 +                       impossible("vs-1578", "unexpected cut mode");
64282 +                       break;
64283 +               }
64284 +       }
64285 +       return retval;
64286 +}
64287 +
64288 +
64289 +/* plugin->u.node.kill: run prepare_for_compact() in kill mode on @kdata->params, then compact() the node. When @info is not NULL, call prepare_removal_node40() if the node became empty (unless DELETE_RETAIN_EMPTY is set in @kdata->flags), else prepare_for_update() if the first key changed.
64290 +   return value is number of items removed completely (0 when cinfo.removed_count is MAX_POS_IN_NODE) */
64291 +int
64292 +kill_node40(struct carry_kill_data *kdata, carry_plugin_info *info)
64293 +{
64294 +       znode *node;
64295 +       struct cut40_info cinfo;
64296 +       int first_key_changed;
64297 +
64298 +       node = kdata->params.from->node;
64299 +       node_check(node, 0);
64300 +
64301 +       first_key_changed = prepare_for_compact(&cinfo, &kdata->params, 0/* not cut */, kdata, info);
64302 +       compact(node, &cinfo);
64303 +
64304 +       if (info) {
64305 +               /* it is not called by node40_shift, so we have to take care
64306 +                  of changes on upper levels */
64307 +               if (node_is_empty(node) && !(kdata->flags & DELETE_RETAIN_EMPTY))
64308 +                       /* all contents of node is deleted */
64309 +                       prepare_removal_node40(node, info);
64310 +               else if (first_key_changed) {
64311 +                       prepare_for_update(NULL, node, info);
64312 +               }
64313 +       }
64314 +
64315 +       coord_clear_iplug(kdata->params.from);
64316 +       coord_clear_iplug(kdata->params.to);
64317 +
64318 +       node_check(node, 0);
64319 +       znode_make_dirty(node);
64320 +       return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
64321 +}
64322 +
64323 +/* plugin->u.node.cut: run prepare_for_compact() in cut mode on @cdata->params, then compact() the node. When @info is not NULL, call prepare_removal_node40() if the node became empty (there is no DELETE_RETAIN_EMPTY check here, unlike kill_node40), else prepare_for_update() if the first key changed.
64324 +   return value is number of items removed completely (0 when cinfo.removed_count is MAX_POS_IN_NODE) */
64325 +int
64326 +cut_node40(struct carry_cut_data *cdata, carry_plugin_info *info)
64327 +{
64328 +       znode *node;
64329 +       struct cut40_info cinfo;
64330 +       int first_key_changed;
64331 +
64332 +       node = cdata->params.from->node;
64333 +       node_check(node, 0);
64334 +
64335 +       first_key_changed = prepare_for_compact(&cinfo, &cdata->params, 1/* cut */, cdata, info);
64336 +       compact(node, &cinfo);
64337 +
64338 +       if (info) {
64339 +               /* it is not called by node40_shift, so we have to take care
64340 +                  of changes on upper levels */
64341 +               if (node_is_empty(node))
64342 +                       /* all contents of node is deleted */
64343 +                       prepare_removal_node40(node, info);
64344 +               else if (first_key_changed) {
64345 +                       prepare_for_update(NULL, node, info);
64346 +               }
64347 +       }
64348 +
64349 +       coord_clear_iplug(cdata->params.from);
64350 +       coord_clear_iplug(cdata->params.to);
64351 +
64352 +       node_check(node, 0);
64353 +       znode_make_dirty(node);
64354 +       return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count ;
64355 +}
64356 +
64357 +
64358 +/* this structure is used by shift method of node40 plugin: it carries the request (pend, wish_stop, target) and the results computed by estimate_shift() */
64359 +struct shift_params {
64360 +       shift_direction pend;   /* when @pend == append - we are shifting to
64361 +                                  left, when @pend == prepend - to right */
64362 +       coord_t wish_stop;      /* when shifting to left this is last unit we
64363 +                                  want shifted, when shifting to right - this
64364 +                                  is set to unit we want to start shifting
64365 +                                  from */
64366 +       znode *target;          /* node units are shifted into */
64367 +       int everything;         /* it is set to 1 if everything we have to shift is
64368 +                                  shifted, 0 - otherwise */
64369 +
64370 +       /* FIXME-VS: get rid of read_stop (NOTE(review): there is no field of that name; presumably real_stop is meant) */
64371 +
64372 +       /* these are set by estimate_shift */
64373 +       coord_t real_stop;      /* this will be set to last unit which will be
64374 +                                  really shifted */
64375 +
64376 +       /* coordinate in source node before operation of unit which becomes
64377 +          first after shift to left or last after shift to right */
64378 +       union {
64379 +               coord_t future_first;
64380 +               coord_t future_last;
64381 +       } u;
64382 +
64383 +       unsigned merging_units; /* number of units of first item which have to
64384 +                                  be merged with last item of target node */
64385 +       unsigned merging_bytes; /* number of bytes in those units */
64386 +
64387 +       unsigned entire;        /* items shifted in their entirety */
64388 +       unsigned entire_bytes;  /* number of bytes in those items */
64389 +
64390 +       unsigned part_units;    /* number of units of partially copied item */
64391 +       unsigned part_bytes;    /* number of bytes in those units */
64392 +
64393 +       unsigned shift_bytes;   /* total number of bytes in items shifted (item
64394 +                                  headers not included) */
64395 +
64396 +};
64397 +/* per-item overhead of creating a new item in the node of @item, as reported by the node plugin's ->item_overhead method (estimate_shift adds it to the item length when checking whether an item fits) */
64398 +static int
64399 +item_creation_overhead(coord_t * item)
64400 +{
64401 +       return node_plugin_by_coord(item)->item_overhead(item->node, 0);
64402 +}
64403 +
64404 +/* how many units are there in @source starting from source->unit_pos
64405 +   but not further than @stop_coord. @source must be at its first unit when shifting left and at its last unit when shifting right */
64406 +static int
64407 +wanted_units(coord_t * source, coord_t * stop_coord, shift_direction pend)
64408 +{
64409 +       if (pend == SHIFT_LEFT) {
64410 +               assert("vs-181", source->unit_pos == 0);
64411 +       } else {
64412 +               assert("vs-182", source->unit_pos == coord_last_unit_pos(source));
64413 +       }
64414 +
64415 +       if (source->item_pos != stop_coord->item_pos) {
64416 +               /* @source and @stop_coord are different items: all units of @source are wanted */
64417 +               return coord_last_unit_pos(source) + 1;
64418 +       }
64419 +       /* same item: count units between the starting unit and the stop unit, both included */
64420 +       if (pend == SHIFT_LEFT) {
64421 +               return stop_coord->unit_pos + 1;
64422 +       } else {
64423 +               return source->unit_pos - stop_coord->unit_pos + 1;
64424 +       }
64425 +}
64426 +
64427 +/* this calculates what can be copied from @shift->wish_stop.node to
64428 +   @shift->target and stores the result in @shift (real_stop, merging_*, entire*, part_*, shift_bytes, everything). entire, entire_bytes and shift_bytes are only incremented here and merging_*/part_* are assigned only on some paths, so they are presumably zeroed by the caller (TODO confirm). @ctx is not used */
64429 +static void
64430 +estimate_shift(struct shift_params *shift, const reiser4_context *ctx)
64431 +{
64432 +       unsigned target_free_space, size;
64433 +       pos_in_node_t stop_item;        /* item which estimating should not consider */
64434 +       unsigned want;          /* number of units of item we want shifted */
64435 +       coord_t source;         /* item being estimated */
64436 +       item_plugin *iplug;
64437 +
64438 +       /* shifting to left/right starts from first/last units of
64439 +          @shift->wish_stop.node */
64440 +       if (shift->pend == SHIFT_LEFT) {
64441 +               coord_init_first_unit(&source, shift->wish_stop.node);
64442 +       } else {
64443 +               coord_init_last_unit(&source, shift->wish_stop.node);
64444 +       }
64445 +       shift->real_stop = source;
64446 +
64447 +       /* free space in target node */
64448 +       target_free_space = znode_free_space(shift->target);
64449 +
64450 +       shift->everything = 0;
64451 +       if (!node_is_empty(shift->target)) {
64452 +               /* target node is not empty, check for boundary items
64453 +                  mergeability */
64454 +               coord_t to;
64455 +
64456 +               /* item we try to merge @source with */
64457 +               if (shift->pend == SHIFT_LEFT) {
64458 +                       coord_init_last_unit(&to, shift->target);
64459 +               } else {
64460 +                       coord_init_first_unit(&to, shift->target);
64461 +               }
64462 +
64463 +               if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to, &source) : are_items_mergeable(&source, &to)) {
64464 +                       /* how many units of @source do we want to merge to
64465 +                          item @to */
64466 +                       want = wanted_units(&source, &shift->wish_stop, shift->pend);
64467 +
64468 +                       /* how many units of @source we can merge to item
64469 +                          @to */
64470 +                       iplug = item_plugin_by_coord(&source);
64471 +                       if (iplug->b.can_shift != NULL)
64472 +                               shift->merging_units =
64473 +                                   iplug->b.can_shift(target_free_space,
64474 +                                                      &source, shift->target, shift->pend, &size, want);
64475 +                       else {
64476 +                               shift->merging_units = 0;
64477 +                               size = 0;
64478 +                       }
64479 +                       shift->merging_bytes = size;
64480 +                       shift->shift_bytes += size;
64481 +                       /* update stop coord to be set to last unit of @source
64482 +                          we can merge to @target. NOTE(review): the multiplication by shift->pend below presumably relies on SHIFT_LEFT == 1 and SHIFT_RIGHT == -1 - confirm against shift_direction */
64483 +                       if (shift->merging_units)
64484 +                               /* at least one unit can be shifted */
64485 +                               shift->real_stop.unit_pos = (shift->merging_units - source.unit_pos - 1) * shift->pend;
64486 +                       else {
64487 +                               /* nothing can be shifted */
64488 +                               if (shift->pend == SHIFT_LEFT)
64489 +                                       coord_init_before_first_item(&shift->real_stop, source.node);
64490 +                               else
64491 +                                       coord_init_after_last_item(&shift->real_stop, source.node);
64492 +                       }
64493 +                       assert("nikita-2081", shift->real_stop.unit_pos + 1);
64494 +
64495 +                       if (shift->merging_units != want) {
64496 +                               /* we could not copy as many as we want, so,
64497 +                                  there is no reason for estimating any
64498 +                                  longer */
64499 +                               return;
64500 +                       }
64501 +
64502 +                       target_free_space -= size;
64503 +                       coord_add_item_pos(&source, shift->pend);
64504 +               }
64505 +       }
64506 +
64507 +       /* position of the item nothing of which we want to shift */
64508 +       stop_item = shift->wish_stop.item_pos + shift->pend;
64509 +
64510 +       /* calculate how many items can be copied into given free
64511 +          space as whole */
64512 +       for (; source.item_pos != stop_item; coord_add_item_pos(&source, shift->pend)) {
64513 +               if (shift->pend == SHIFT_RIGHT)
64514 +                       source.unit_pos = coord_last_unit_pos(&source);
64515 +
64516 +               /* how many units of @source do we want to copy */
64517 +               want = wanted_units(&source, &shift->wish_stop, shift->pend);
64518 +
64519 +               if (want == coord_last_unit_pos(&source) + 1) {
64520 +                       /* we want this item to be copied entirely */
64521 +                       size = item_length_by_coord(&source) + item_creation_overhead(&source);
64522 +                       if (size <= target_free_space) {
64523 +                               /* item fits into target node as whole */
64524 +                               target_free_space -= size;
64525 +                               shift->shift_bytes += size - item_creation_overhead(&source);
64526 +                               shift->entire_bytes += size - item_creation_overhead(&source);
64527 +                               shift->entire++;
64528 +
64529 +                               /* update shift->real_stop coord to be set to
64530 +                                  last unit of @source we can merge to
64531 +                                  @target */
64532 +                               shift->real_stop = source;
64533 +                               if (shift->pend == SHIFT_LEFT)
64534 +                                       shift->real_stop.unit_pos = coord_last_unit_pos(&shift->real_stop);
64535 +                               else
64536 +                                       shift->real_stop.unit_pos = 0;
64537 +                               continue;
64538 +                       }
64539 +               }
64540 +
64541 +               /* we reach here only for an item which does not fit into
64542 +                  target node in its entirety. This item may be either
64543 +                  partially shifted, or not shifted at all. We will have to
64544 +                  create new item in target node, so decrease amount of free
64545 +                  space by an item creation overhead. We can reach here also
64546 +                  if stop coord is in this item */
64547 +               if (target_free_space >= (unsigned) item_creation_overhead(&source)) {
64548 +                       target_free_space -= item_creation_overhead(&source);
64549 +                       iplug = item_plugin_by_coord(&source);
64550 +                       if (iplug->b.can_shift) {
64551 +                               shift->part_units = iplug->b.can_shift(target_free_space, &source, 0    /*target */
64552 +                                                                      , shift->pend, &size, want);
64553 +                       } else {
64554 +                               target_free_space = 0;
64555 +                               shift->part_units = 0;
64556 +                               size = 0;
64557 +                       }
64558 +               } else {
64559 +                       target_free_space = 0;
64560 +                       shift->part_units = 0;
64561 +                       size = 0;
64562 +               }
64563 +               shift->part_bytes = size;
64564 +               shift->shift_bytes += size;
64565 +
64566 +               /* set @shift->real_stop to last unit of @source we can merge
64567 +                  to @shift->target */
64568 +               if (shift->part_units) {
64569 +                       shift->real_stop = source;
64570 +                       shift->real_stop.unit_pos = (shift->part_units - source.unit_pos - 1) * shift->pend;
64571 +                       assert("nikita-2082", shift->real_stop.unit_pos + 1);
64572 +               }
64573 +
64574 +               if (want != shift->part_units)
64575 +                       /* not everything wanted was shifted */
64576 +                       return;
64577 +               break;
64578 +       }
64579 +       /* reached only when every wanted unit was accounted for */
64580 +       shift->everything = 1;
64581 +}
64582 +/* copy @count units of item @source, starting from unit @from, into item @target by calling the item plugin's ->b.copy_units method; @source and @target must have the same item plugin. When @dir is SHIFT_RIGHT the key of @target is afterwards set to the key returned by unit_key_by_coord(target) */
64583 +static void
64584 +copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count, shift_direction dir, unsigned free_space)
64585 +{
64586 +       item_plugin *iplug;
64587 +
64588 +       assert("nikita-1463", target != NULL);
64589 +       assert("nikita-1464", source != NULL);
64590 +       assert("nikita-1465", from + count <= coord_num_units(source));
64591 +
64592 +       IF_TRACE(TRACE_COORDS, print_coord("copy_units source:", source, 0));
64593 +
64594 +       iplug = item_plugin_by_coord(source);
64595 +       assert("nikita-1468", iplug == item_plugin_by_coord(target));
64596 +       iplug->b.copy_units(target, source, from, count, dir, free_space);
64597 +
64598 +       if (dir == SHIFT_RIGHT) {
64599 +               /* FIXME-VS: this does not look necessary. update_item_key was
64600 +                  called already by copy_units method */
64601 +               reiser4_key split_key;
64602 +
64603 +               assert("nikita-1469", target->unit_pos == 0);
64604 +
64605 +               unit_key_by_coord(target, &split_key);
64606 +               node_plugin_by_coord(target)->update_item_key(target, &split_key, 0);
64607 +       }
64608 +}
64609 +
64610 +/* copy part of @shift->real_stop.node starting either from its beginning or
64611 +   from its end and ending at @shift->real_stop to either the end or the
64612 +   beginning of @shift->target */
64613 +static void
64614 +copy(struct shift_params *shift)
64615 +{
64616 +       node40_header *nh;
64617 +       coord_t from;
64618 +       coord_t to;
64619 +       item_header40 *from_ih, *to_ih;
64620 +       int free_space_start;
64621 +       int new_items;
64622 +       unsigned old_items;
64623 +       int old_offset;
64624 +       unsigned i;
64625 +
64626 +       nh = node40_node_header(shift->target);
64627 +       free_space_start = nh40_get_free_space_start(nh);
64628 +       old_items = nh40_get_num_items(nh);
64629 +       new_items = shift->entire + (shift->part_units ? 1 : 0);
64630 +       assert("vs-185", shift->shift_bytes == shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
64631 +
64632 +       from = shift->wish_stop;
64633 +
64634 +       IF_TRACE(TRACE_COORDS, print_coord("node40_copy from:", &from, 0));
64635 +
64636 +       coord_init_first_unit(&to, shift->target);
64637 +
64638 +       /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty,
64639 +          hence to.between is set to EMPTY_NODE above. Looks like we want it
64640 +          to be AT_UNIT.
64641 +
64642 +          Oh, wonders of ->betweeness...
64643 +
64644 +       */
64645 +       to.between = AT_UNIT;
64646 +
64647 +       if (shift->pend == SHIFT_LEFT) {
64648 +               /* copying to left */
64649 +
64650 +               coord_set_item_pos(&from, 0);
64651 +               from_ih = node40_ih_at(from.node, 0);
64652 +
64653 +               coord_set_item_pos(&to, node40_num_of_items_internal(to.node) - 1);
64654 +               if (shift->merging_units) {
64655 +                       /* expand last item, so that plugin methods will see
64656 +                          correct data */
64657 +                       free_space_start += shift->merging_bytes;
64658 +                       nh40_set_free_space_start(nh, (unsigned) free_space_start);
64659 +                       nh40_set_free_space(nh, nh40_get_free_space(nh) - shift->merging_bytes);
64660 +
64661 +                       IF_TRACE(TRACE_COORDS, print_coord("before copy_units from:", &from, 0));
64662 +                       IF_TRACE(TRACE_COORDS, print_coord("before copy_units to:", &to, 0));
64663 +
64664 +                       /* appending last item of @target */
64665 +                       copy_units(&to, &from, 0,       /* starting from 0-th unit */
64666 +                                  shift->merging_units, SHIFT_LEFT, shift->merging_bytes);
64667 +                       coord_inc_item_pos(&from);
64668 +                       from_ih--;
64669 +                       coord_inc_item_pos(&to);
64670 +               }
64671 +
64672 +               to_ih = node40_ih_at(shift->target, old_items);
64673 +               if (shift->entire) {
64674 +                       /* copy @entire items entirely */
64675 +
64676 +                       /* copy item headers */
64677 +                       xmemcpy(to_ih - shift->entire + 1,
64678 +                               from_ih - shift->entire + 1, shift->entire * sizeof (item_header40));
64679 +                       /* update item header offset */
64680 +                       old_offset = ih40_get_offset(from_ih);
64681 +                       /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */
64682 +                       for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
64683 +                               ih40_set_offset(to_ih, ih40_get_offset(from_ih) - old_offset + free_space_start);
64684 +
64685 +                       /* copy item bodies */
64686 +                       xmemcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */
64687 +                               shift->entire_bytes);
64688 +
64689 +                       coord_add_item_pos(&from, (int) shift->entire);
64690 +                       coord_add_item_pos(&to, (int) shift->entire);
64691 +               }
64692 +
64693 +               nh40_set_free_space_start(nh, free_space_start + shift->shift_bytes - shift->merging_bytes);
64694 +               nh40_set_free_space(nh,
64695 +                                   nh40_get_free_space(nh) -
64696 +                                   (shift->shift_bytes - shift->merging_bytes + sizeof (item_header40) * new_items));
64697 +
64698 +               /* update node header */
64699 +               node40_set_num_items(shift->target, nh, old_items + new_items);
64700 +               assert("vs-170", nh40_get_free_space(nh) < znode_size(shift->target));
64701 +
64702 +               if (shift->part_units) {
64703 +                       /* copy heading part (@part units) of @source item as
64704 +                          a new item into @target->node */
64705 +
64706 +                       /* copy item header of partially copied item */
64707 +                       coord_set_item_pos(&to, node40_num_of_items_internal(to.node)
64708 +                                          - 1);
64709 +                       xmemcpy(to_ih, from_ih, sizeof (item_header40));
64710 +                       ih40_set_offset(to_ih, nh40_get_free_space_start(nh) - shift->part_bytes);
64711 +                       if (item_plugin_by_coord(&to)->b.init)
64712 +                               item_plugin_by_coord(&to)->b.init(&to, &from, 0);
64713 +                       copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT, shift->part_bytes);
64714 +               }
64715 +
64716 +       } else {
64717 +               /* copying to right */
64718 +
64719 +               coord_set_item_pos(&from, node40_num_of_items_internal(from.node) - 1);
64720 +               from_ih = node40_ih_at_coord(&from);
64721 +
64722 +               coord_set_item_pos(&to, 0);
64723 +
64724 +               /* prepare space for new items */
64725 +               xmemmove(zdata(to.node) + sizeof (node40_header) +
64726 +                        shift->shift_bytes,
64727 +                        zdata(to.node) + sizeof (node40_header), free_space_start - sizeof (node40_header));
64728 +               /* update item headers of moved items */
64729 +               to_ih = node40_ih_at(to.node, 0);
64730 +               /* first item gets @merging_bytes longer. free space appears
64731 +                  at its beginning */
64732 +               if (!node_is_empty(to.node))
64733 +                       ih40_set_offset(to_ih, ih40_get_offset(to_ih) + shift->shift_bytes - shift->merging_bytes);
64734 +
64735 +               for (i = 1; i < old_items; i++)
64736 +                       ih40_set_offset(to_ih - i, ih40_get_offset(to_ih - i) + shift->shift_bytes);
64737 +
64738 +               /* move item headers to make space for new items */
64739 +               xmemmove(to_ih - old_items + 1 - new_items, to_ih - old_items + 1, sizeof (item_header40) * old_items);
64740 +               to_ih -= (new_items - 1);
64741 +
64742 +               nh40_set_free_space_start(nh, free_space_start + shift->shift_bytes);
64743 +               nh40_set_free_space(nh,
64744 +                                   nh40_get_free_space(nh) -
64745 +                                   (shift->shift_bytes + sizeof (item_header40) * new_items));
64746 +
64747 +               /* update node header */
64748 +               node40_set_num_items(shift->target, nh, old_items + new_items);
64749 +               assert("vs-170", nh40_get_free_space(nh) < znode_size(shift->target));
64750 +
64751 +               if (shift->merging_units) {
64752 +                       coord_add_item_pos(&to, new_items);
64753 +                       to.unit_pos = 0;
64754 +                       to.between = AT_UNIT;
64755 +                       /* prepend first item of @to */
64756 +                       copy_units(&to, &from,
64757 +                                  coord_last_unit_pos(&from) -
64758 +                                  shift->merging_units + 1, shift->merging_units, SHIFT_RIGHT, shift->merging_bytes);
64759 +                       coord_dec_item_pos(&from);
64760 +                       from_ih++;
64761 +               }
64762 +
64763 +               if (shift->entire) {
64764 +                       /* copy @entire items entirely */
64765 +
64766 +                       /* copy item headers */
64767 +                       xmemcpy(to_ih, from_ih, shift->entire * sizeof (item_header40));
64768 +
64769 +                       /* update item header offset */
64770 +                       old_offset = ih40_get_offset(from_ih + shift->entire - 1);
64771 +                       /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. */
64772 +                       for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
64773 +                               ih40_set_offset(to_ih,
64774 +                                               ih40_get_offset(from_ih) -
64775 +                                               old_offset + sizeof (node40_header) + shift->part_bytes);
64776 +                       /* copy item bodies */
64777 +                       coord_add_item_pos(&from, -(int) (shift->entire - 1));
64778 +                       xmemcpy(zdata(to.node) + sizeof (node40_header) +
64779 +                               shift->part_bytes, item_by_coord_node40(&from),
64780 +                               shift->entire_bytes);
64781 +                       coord_dec_item_pos(&from);
64782 +               }
64783 +
64784 +               if (shift->part_units) {
64785 +                       coord_set_item_pos(&to, 0);
64786 +                       to.unit_pos = 0;
64787 +                       to.between = AT_UNIT;
64788 +                       /* copy heading part (@part units) of @source item as
64789 +                          a new item into @target->node */
64790 +
64791 +                       /* copy item header of partially copied item */
64792 +                       xmemcpy(to_ih, from_ih, sizeof (item_header40));
64793 +                       ih40_set_offset(to_ih, sizeof (node40_header));
64794 +                       if (item_plugin_by_coord(&to)->b.init)
64795 +                               item_plugin_by_coord(&to)->b.init(&to, &from, 0);
64796 +                       copy_units(&to, &from,
64797 +                                  coord_last_unit_pos(&from) -
64798 +                                  shift->part_units + 1, shift->part_units, SHIFT_RIGHT, shift->part_bytes);
64799 +               }
64800 +       }
64801 +}
64802 +
64803 +/* Cut from the source node what was already copied to the target: the range between one end of @shift->real_stop.node and @shift->real_stop. Also records in @shift->u the old coord of the unit that becomes first (shift left) or last (shift right) in the source.
64804 +   Returns the result of cut_node40(); per the original note this is the number of items removed completely */
64805 +static int
64806 +delete_copied(struct shift_params *shift)
64807 +{
64808 +       coord_t from;
64809 +       coord_t to;
64810 +       struct carry_cut_data cdata;
64811 +
64812 +       if (shift->pend == SHIFT_LEFT) {
64813 +               /* we were shifting to left, remove everything from the
64814 +                  beginning of @shift->real_stop.node up to
64815 +                  @shift->real_stop */
64816 +               coord_init_first_unit(&from, shift->real_stop.node);
64817 +               to = shift->real_stop;
64818 +
64819 +               /* store old coordinate of unit which will be first after
64820 +                  shift to left (read back by adjust_coord2) */
64821 +               shift->u.future_first = to;
64822 +               coord_next_unit(&shift->u.future_first);
64823 +       } else {
64824 +               /* we were shifting to right, remove everything from
64825 +                  @shift->real_stop up to the end of
64826 +                  @shift->real_stop.node */
64827 +               from = shift->real_stop;
64828 +               coord_init_last_unit(&to, from.node);
64829 +
64830 +               /* store old coordinate of unit which will be last after
64831 +                  shift to right */
64832 +               shift->u.future_last = from;
64833 +               coord_prev_unit(&shift->u.future_last);
64834 +       }
64835 +
64836 +       cdata.params.from = &from; /* cut range is given by the two coords only */
64837 +       cdata.params.to = &to;
64838 +       cdata.params.from_key = 0; /* no keys are supplied */
64839 +       cdata.params.to_key = 0;
64840 +       cdata.params.smallest_removed = 0; /* NOTE(review): presumably "do not report the smallest removed key" - confirm in cut_node40 */
64841 +       return cut_node40(&cdata, 0);
64842 +}
64843 +
64844 +/* Something was moved between @left and @right: post a COP_UPDATE carry operation to @info so that carry updates the delimiting key between them.
64845 +   Returns 0 on success or when nothing is posted (@info is NULL, or should_notify_parent(@right) is false), a negative error code otherwise */
64846 +static int
64847 +prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
64848 +{
64849 +       carry_op *op;
64850 +       carry_node *cn;
64851 +
64852 +       if (info == NULL)
64853 +               /* nowhere to send operation to. */
64854 +               return 0;
64855 +
64856 +       if (!should_notify_parent(right))
64857 +               return 0;
64858 +
64859 +       op = node_post_carry(info, COP_UPDATE, right, 1);
64860 +       if (IS_ERR(op) || op == NULL)
64861 +               return op ? PTR_ERR(op) : -EIO; /* a NULL op is reported as -EIO */
64862 +
64863 +       if (left != NULL) {
64864 +               carry_node *reference; /* existing carry node the new one for @left is placed relative to */
64865 +
64866 +               if (info->doing)
64867 +                       reference = insert_carry_node(info->doing,
64868 +                                                     info->todo, left);
64869 +               else
64870 +                       reference = op->node;
64871 +               assert("nikita-2992", reference != NULL);
64872 +               cn = add_carry(info->todo, POOLO_BEFORE, reference); /* NOTE(review): POOLO_BEFORE presumably queues the new node ahead of @reference - confirm */
64873 +               if (IS_ERR(cn))
64874 +                       return PTR_ERR(cn);
64875 +               cn->parent = 1;
64876 +               cn->node = left;
64877 +               if (ZF_ISSET(left, JNODE_ORPHAN))
64878 +                       cn->left_before = 1;
64879 +               op->u.update.left = cn; /* the update operation refers to @left through its carry node */
64880 +       } else
64881 +               op->u.update.left = NULL; /* no left node to pair @right with */
64882 +       return 0;
64883 +}
64884 +
64885 +/* plugin->u.node.prepare_removal
64886 +   To delete the pointer to @empty from the tree, post a COP_DELETE carry operation to @info and mark @empty with JNODE_HEARD_BANSHEE.
64887 +   Returns 0 on success or when nothing has to be posted, a negative error code (wrapped in RETERR) otherwise */
64888 +reiser4_internal int
64889 +prepare_removal_node40(znode * empty, carry_plugin_info * info)
64890 +{
64891 +       carry_op *op;
64892 +
64893 +       if (!should_notify_parent(empty))
64894 +               return 0;
64895 +       /* already on a road to Styx: JNODE_HEARD_BANSHEE is set, which this function itself does after posting the delete */
64896 +       if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
64897 +               return 0;
64898 +       op = node_post_carry(info, COP_DELETE, empty, 1);
64899 +       if (IS_ERR(op) || op == NULL)
64900 +               return RETERR(op ? PTR_ERR(op) : -EIO); /* a NULL op is reported as -EIO */
64901 +
64902 +       op->u.delete.child = 0;
64903 +       op->u.delete.flags = 0;
64904 +
64905 +       /* fare thee well: flag the node so that a repeated call returns early above */
64906 +       ZF_SET(empty, JNODE_HEARD_BANSHEE);
64907 +       return 0;
64908 +}
64909 +
64910 +/* Something was shifted from @insert_coord->node to @shift->target: update @insert_coord correspondingly. @removed is the number of source items removed entirely (0 if none); it is subtracted from the item position.
64911 +   @including_insert_coord selects, when everything wanted was shifted, whether @insert_coord is placed in @shift->target or stays in the source node */
64912 +static void
64913 +adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed, int including_insert_coord)
64914 +{
64915 +       /* item plugin was invalidated by shifting */
64916 +       coord_clear_iplug(insert_coord);
64917 +
64918 +       if (node_is_empty(shift->wish_stop.node)) {
64919 +               assert("vs-242", shift->everything);
64920 +               if (including_insert_coord) {
64921 +                       if (shift->pend == SHIFT_RIGHT) {
64922 +                               /* set @insert_coord before first unit of
64923 +                                  @shift->target node */
64924 +                               coord_init_before_first_item(insert_coord, shift->target);
64925 +                       } else {
64926 +                               /* set @insert_coord after last in target node */
64927 +                               coord_init_after_last_item(insert_coord, shift->target);
64928 +                       }
64929 +               } else {
64930 +                       /* set @insert_coord inside of empty node. There is
64931 +                          only one possible coord within an empty
64932 +                          node. init_first_unit will set that coord */
64933 +                       coord_init_first_unit(insert_coord, shift->wish_stop.node);
64934 +               }
64935 +               return;
64936 +       }
64937 +
64938 +       if (shift->pend == SHIFT_RIGHT) {
64939 +               /* there was shifting to right */
64940 +               if (shift->everything) {
64941 +                       /* everything wanted was shifted */
64942 +                       if (including_insert_coord) {
64943 +                               /* @insert_coord is set before first unit of
64944 +                                  @to node */
64945 +                               coord_init_before_first_item(insert_coord, shift->target);
64946 +                               insert_coord->between = BEFORE_UNIT;
64947 +                       } else {
64948 +                               /* @insert_coord is set after last unit of
64949 +                                  the source node (@shift->wish_stop.node) */
64950 +                               coord_init_last_unit(insert_coord, shift->wish_stop.node);
64951 +                               insert_coord->between = AFTER_UNIT;
64952 +                       }
64953 +               }
64954 +               return; /* after a partial shift to right only the item plugin of @insert_coord was cleared */
64955 +       }
64956 +
64957 +       /* there was shifting to left */
64958 +       if (shift->everything) {
64959 +               /* everything wanted was shifted */
64960 +               if (including_insert_coord) {
64961 +                       /* @insert_coord is set after last unit in @to node */
64962 +                       coord_init_after_last_item(insert_coord, shift->target);
64963 +               } else {
64964 +                       /* @insert_coord is set before first unit in the same
64965 +                          node */
64966 +                       coord_init_before_first_item(insert_coord, shift->wish_stop.node);
64967 +               }
64968 +               return;
64969 +       }
64970 +
64971 +       /* FIXME-VS: the code below is complicated because with between ==
64972 +          AFTER_ITEM unit_pos is set to 0 */
64973 +
64974 +       if (!removed) {
64975 +               /* no items were shifted entirely */
64976 +               assert("vs-195", shift->merging_units == 0 || shift->part_units == 0);
64977 +
64978 +               if (shift->real_stop.item_pos == insert_coord->item_pos) {
64979 +                       if (shift->merging_units) {
64980 +                               if (insert_coord->between == AFTER_UNIT) {
64981 +                                       assert("nikita-1441", insert_coord->unit_pos >= shift->merging_units);
64982 +                                       insert_coord->unit_pos -= shift->merging_units;
64983 +                               } else if (insert_coord->between == BEFORE_UNIT) {
64984 +                                       assert("nikita-2090", insert_coord->unit_pos > shift->merging_units);
64985 +                                       insert_coord->unit_pos -= shift->merging_units;
64986 +                               }
64987 +
64988 +                               assert("nikita-2083", insert_coord->unit_pos + 1); /* intended to catch unit_pos having wrapped to -1 */
64989 +                       } else {
64990 +                               if (insert_coord->between == AFTER_UNIT) {
64991 +                                       assert("nikita-1442", insert_coord->unit_pos >= shift->part_units);
64992 +                                       insert_coord->unit_pos -= shift->part_units;
64993 +                               } else if (insert_coord->between == BEFORE_UNIT) {
64994 +                                       assert("nikita-2089", insert_coord->unit_pos > shift->part_units);
64995 +                                       insert_coord->unit_pos -= shift->part_units;
64996 +                               }
64997 +
64998 +                               assert("nikita-2084", insert_coord->unit_pos + 1); /* intended to catch unit_pos having wrapped to -1 */
64999 +                       }
65000 +               }
65001 +               return;
65002 +       }
65003 +
65004 +       /* we shifted to left and there was not enough space for everything */
65005 +       switch (insert_coord->between) {
65006 +       case AFTER_UNIT:
65007 +       case BEFORE_UNIT:
65008 +               if (shift->real_stop.item_pos == insert_coord->item_pos)
65009 +                       insert_coord->unit_pos -= shift->part_units;
65010 +       case AFTER_ITEM: /* AFTER_UNIT and BEFORE_UNIT fall through to here: the item position is adjusted in all three cases */
65011 +               coord_add_item_pos(insert_coord, -removed);
65012 +               break;
65013 +       default:
65014 +               impossible("nikita-2087", "not ready");
65015 +       }
65016 +       assert("nikita-2085", insert_coord->unit_pos + 1); /* intended to catch unit_pos having wrapped to -1 */
65017 +}
65018 +/* Call the shift_hook method, for item plugins that have one, on every item of @shift->target touched by the shift, telling it which units arrived and from which node. Always returns 0 */
65019 +static int
65020 +call_shift_hooks(struct shift_params *shift)
65021 +{
65022 +       unsigned i, shifted;
65023 +       coord_t coord;
65024 +       item_plugin *iplug;
65025 +
65026 +       assert("vs-275", !node_is_empty(shift->target));
65027 +
65028 +       /* number of items shift touches */
65029 +       shifted = shift->entire + (shift->merging_units ? 1 : 0) + (shift->part_units ? 1 : 0);
65030 +
65031 +       if (shift->pend == SHIFT_LEFT) {
65032 +               /* moved items are at the end */
65033 +               coord_init_last_unit(&coord, shift->target);
65034 +               coord.unit_pos = 0;
65035 +
65036 +               assert("vs-279", shift->pend == 1);
65037 +               for (i = 0; i < shifted; i++) { /* walks target items from the last one backwards */
65038 +                       unsigned from, count;
65039 +
65040 +                       iplug = item_plugin_by_coord(&coord);
65041 +                       if (i == 0 && shift->part_units) {
65042 +                               assert("vs-277", coord_num_units(&coord) == shift->part_units);
65043 +                               count = shift->part_units; /* last target item is the partially copied one: all its units are new */
65044 +                               from = 0;
65045 +                       } else if (i == shifted - 1 && shift->merging_units) {
65046 +                               count = shift->merging_units;
65047 +                               from = coord_num_units(&coord) - count; /* merged units were appended: they are the trailing @count units */
65048 +                       } else {
65049 +                               count = coord_num_units(&coord); /* item was moved entirely */
65050 +                               from = 0;
65051 +                       }
65052 +
65053 +                       if (iplug->b.shift_hook) {
65054 +                               iplug->b.shift_hook(&coord, from, count, shift->wish_stop.node);
65055 +                       }
65056 +                       coord_add_item_pos(&coord, -shift->pend); /* pend is 1 here, so this steps to the previous item */
65057 +               }
65058 +       } else {
65059 +               /* moved items are at the beginning */
65060 +               coord_init_first_unit(&coord, shift->target);
65061 +
65062 +               assert("vs-278", shift->pend == -1);
65063 +               for (i = 0; i < shifted; i++) { /* walks target items from the first one forwards */
65064 +                       unsigned from, count;
65065 +
65066 +                       iplug = item_plugin_by_coord(&coord);
65067 +                       if (i == 0 && shift->part_units) {
65068 +                               assert("vs-277", coord_num_units(&coord) == shift->part_units);
65069 +                               count = coord_num_units(&coord); /* equals part_units, as asserted above */
65070 +                               from = 0;
65071 +                       } else if (i == shifted - 1 && shift->merging_units) {
65072 +                               count = shift->merging_units; /* merged units are reported as the leading @count units */
65073 +                               from = 0;
65074 +                       } else {
65075 +                               count = coord_num_units(&coord); /* item was moved entirely */
65076 +                               from = 0;
65077 +                       }
65078 +
65079 +                       if (iplug->b.shift_hook) {
65080 +                               iplug->b.shift_hook(&coord, from, count, shift->wish_stop.node);
65081 +                       }
65082 +                       coord_add_item_pos(&coord, -shift->pend); /* pend is -1 here, so this steps to the next item */
65083 +               }
65084 +       }
65085 +
65086 +       return 0;
65087 +}
65088 +
65089 +/* Shift to left is completed. Return 1 if unit @old ended up in the left neighbor, 0 if it stayed in the source node */
65090 +static int
65091 +unit_moved_left(const struct shift_params *shift, const coord_t * old)
65092 +{
65093 +       const coord_t *stop = &shift->real_stop;
65094 +
65095 +       assert("vs-944", shift->real_stop.node == old->node);
65096 +
65097 +       /* different items: @old moved iff its item precedes the item of @stop */
65098 +       if (stop->item_pos != old->item_pos)
65099 +               return stop->item_pos > old->item_pos;
65100 +       /* same item: @old moved iff its unit is not beyond the unit of @stop */
65101 +       return stop->unit_pos >= old->unit_pos;
65102 +}
65103 +
65104 +/* Shift to right is completed. Return 1 if unit @old ended up in the right
65105 +   neighbor, 0 if it stayed in the source node */
65106 +static int
65107 +unit_moved_right(const struct shift_params *shift, const coord_t * old)
65108 +{
65109 +       const coord_t *stop = &shift->real_stop;
65110 +
65111 +       assert("vs-944", shift->real_stop.node == old->node);
65112 +
65113 +       /* different items: @old moved iff its item follows the item of @stop */
65114 +       if (stop->item_pos != old->item_pos)
65115 +               return stop->item_pos < old->item_pos;
65116 +       /* same item: @old moved iff its unit is not before the unit of @stop */
65117 +       return stop->unit_pos <= old->unit_pos;
65118 +}
65119 +
65120 +/* @old is a coord that was set, before the shift described by @shift, in one of the two nodes taking part in it (the source @shift->wish_stop.node or @shift->target).
65121 +   Fill @new with the coord of the same unit after the shift and return @new */
65122 +static coord_t *
65123 +adjust_coord2(const struct shift_params *shift, const coord_t * old, coord_t * new)
65124 +{
65125 +       /* whatever item plugin @new had cached is not valid for the new position */
65126 +       coord_clear_iplug(new);
65127 +       new->between = old->between;
65128 +
65129 +       if (old->node == shift->target) {
65130 +               if (shift->pend == SHIFT_LEFT) {
65131 +                       /* coord which is set inside of left neighbor does not
65132 +                          change during shift to left */
65133 +                       coord_dup(new, old);
65134 +                       return new;
65135 +               }
65136 +               new->node = old->node;
65137 +               coord_set_item_pos(new,
65138 +                                  old->item_pos + shift->entire +
65139 +                                  (shift->part_units ? 1 : 0));
65140 +               new->unit_pos = old->unit_pos;
65141 +               if (old->item_pos == 0 && shift->merging_units)
65142 +                       new->unit_pos += shift->merging_units;
65143 +               return new;
65144 +       }
65145 +
65146 +       assert("vs-977", old->node == shift->wish_stop.node);
65147 +       if (shift->pend == SHIFT_LEFT) {
65148 +               if (unit_moved_left(shift, old)) {
65149 +                       /* unit @old moved to left neighbor. Calculate its
65150 +                          coordinate there */
65151 +                       new->node = shift->target;
65152 +                       coord_set_item_pos(new,
65153 +                                          node_num_items(shift->target) -
65154 +                                          shift->entire -
65155 +                                          (shift->part_units ? 1 : 0) +
65156 +                                          old->item_pos);
65157 +
65158 +                       new->unit_pos = old->unit_pos;
65159 +                       if (shift->merging_units) {
65160 +                               coord_dec_item_pos(new);
65161 +                               if (old->item_pos == 0) {
65162 +                                       /* unit_pos only changes if item got
65163 +                                          merged */
65164 +                                       new->unit_pos = coord_num_units(new) - (shift->merging_units - old->unit_pos);
65165 +                               }
65166 +                       }
65167 +               } else {
65168 +                       /* unit @old did not move to left neighbor.
65169 +
65170 +                          Use _nocheck, because @old is outside of its node.
65171 +                       */
65172 +                       coord_dup_nocheck(new, old);
65173 +                       coord_add_item_pos(new, -shift->u.future_first.item_pos);
65174 +                       if (new->item_pos == 0)
65175 +                               new->unit_pos -= shift->u.future_first.unit_pos;
65176 +               }
65177 +       } else {
65178 +               if (unit_moved_right(shift, old)) {
65179 +                       /* unit @old moved to right neighbor */
65180 +                       new->node = shift->target;
65181 +                       coord_set_item_pos(new,
65182 +                                          old->item_pos -
65183 +                                          shift->real_stop.item_pos);
65184 +                       /* a unit keeps its position within the item, except in
65185 +                          the first item of target: units of that item lying
65186 +                          before @real_stop stayed in the source node */
65187 +                       new->unit_pos = old->unit_pos;
65188 +                       if (new->item_pos == 0)
65189 +                               new->unit_pos -= shift->real_stop.unit_pos;
65190 +               } else {
65191 +                       /* unit @old did not move to right neighbor, therefore
65192 +                          it did not change */
65193 +                       coord_dup(new, old);
65194 +               }
65195 +       }
65196 +       coord_set_iplug(new, item_plugin_by_coord(new));
65197 +       return new;
65198 +}
65199 +
65200 +/* Called once a shift is complete (data copied to target and deleted in
65201 +   source): re-point every tap of the current context that is set to one of
65202 +   the two nodes participating in the shift */
65203 +static void
65204 +update_taps(const struct shift_params *shift)
65205 +{
65206 +       coord_t adjusted;
65207 +       tap_t *tap;
65208 +
65209 +       for_all_taps(tap) {
65210 +               const znode *where = tap->coord->node;
65211 +               /* taps set to any other node are left alone */
65212 +               if (where == shift->target || where == shift->wish_stop.node)
65213 +                       tap_to_coord(tap, adjust_coord2(shift, tap->coord, &adjusted));
65214 +       }
65215 +}
65215 +
65216 +#if REISER4_DEBUG
65217 +/* per-item snapshot recorded by shift_check_prepare() and compared against the nodes by shift_check() */
65218 +struct shift_check {
65219 +       reiser4_key key; /* key copied from the item header */
65220 +       __u16 plugin_id; /* item plugin id from the item header, converted with d16tocpu */
65221 +       union {
65222 +               __u64 bytes; /* CTAIL_ID, FORMATTING_ID: number of units; EXTENT_POINTER_ID: extent_size() of the item */
65223 +               __u64 entries; /* COMPOUND_DIR_ID: number of units */
65224 +               void *unused; /* any other plugin id: set to NULL */
65225 +       } u;
65226 +};
65227 +/* Debugging aid (REISER4_DEBUG only): record key, plugin id and size of every item of @left followed by @right, counting a mergeable boundary pair as one item. Returns the array, or NULL if allocation failed. NOTE(review): the array is not freed here - confirm who releases it */
65228 +void *
65229 +shift_check_prepare(const znode *left, const znode *right)
65230 +{
65231 +       pos_in_node_t i, nr_items;
65232 +       int mergeable;
65233 +       struct shift_check *data;
65234 +       item_header40 *ih;
65235 +
65236 +
65237 +       if (node_is_empty(left) || node_is_empty(right))
65238 +               mergeable = 0;
65239 +       else {
65240 +               coord_t l, r;
65241 +
65242 +               coord_init_last_unit(&l, left); /* last item of @left against first item of @right */
65243 +               coord_init_first_unit(&r, right);
65244 +               mergeable = are_items_mergeable(&l, &r);
65245 +       }
65246 +       nr_items = node40_num_of_items_internal(left) + node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
65247 +       data = reiser4_kmalloc(sizeof(struct shift_check) * nr_items, GFP_KERNEL);
65248 +       if (data != NULL) {
65249 +               coord_t coord;
65250 +               pos_in_node_t item_pos;
65251 +
65252 +               coord_init_first_unit(&coord, left);
65253 +               i = 0; /* index of the next free slot in @data */
65254 +
65255 +               for (item_pos = 0; item_pos < node40_num_of_items_internal(left); item_pos ++) { /* pass 1: every item of @left */
65256 +
65257 +                       coord_set_item_pos(&coord, item_pos);
65258 +                       ih = node40_ih_at_coord(&coord);
65259 +
65260 +                       data[i].key = ih->key;
65261 +                       data[i].plugin_id = d16tocpu(&ih->plugin_id);
65262 +                       switch(data[i].plugin_id) {
65263 +                       case CTAIL_ID:
65264 +                       case FORMATTING_ID:
65265 +                               data[i].u.bytes = coord_num_units(&coord);
65266 +                               break;
65267 +                       case EXTENT_POINTER_ID:
65268 +                               data[i].u.bytes = extent_size(&coord, coord_num_units(&coord));
65269 +                               break;
65270 +                       case COMPOUND_DIR_ID:
65271 +                               data[i].u.entries = coord_num_units(&coord);
65272 +                               break;
65273 +                       default:
65274 +                               data[i].u.unused = NULL; /* no size is tracked for other item types */
65275 +                               break;
65276 +                       }
65277 +                       i ++;
65278 +               }
65279 +
65280 +               coord_init_first_unit(&coord, right);
65281 +
65282 +               if (mergeable) { /* fold the first item of @right into the slot of the last item of @left */
65283 +                       assert("vs-1609", i != 0);
65284 +
65285 +                       ih = node40_ih_at_coord(&coord);
65286 +
65287 +                       assert("vs-1589", data[i - 1].plugin_id == d16tocpu(&ih->plugin_id));
65288 +                       switch(data[i - 1].plugin_id) {
65289 +                       case CTAIL_ID:
65290 +                       case FORMATTING_ID:
65291 +                               data[i - 1].u.bytes += coord_num_units(&coord);
65292 +                               break;
65293 +                       case EXTENT_POINTER_ID:
65294 +                               data[i - 1].u.bytes += extent_size(&coord, coord_num_units(&coord));
65295 +                               break;
65296 +                       case COMPOUND_DIR_ID:
65297 +                               data[i - 1].u.entries += coord_num_units(&coord);
65298 +                               break;
65299 +                       default:
65300 +                               impossible("vs-1605", "wrong mergeable item");
65301 +                               break;
65302 +                       }
65303 +                       item_pos = 1; /* first item of @right is already accounted for */
65304 +               } else
65305 +                       item_pos = 0;
65306 +               for (; item_pos < node40_num_of_items_internal(right); item_pos ++) { /* pass 2: remaining items of @right */
65307 +
65308 +                       assert("vs-1604", i < nr_items);
65309 +                       coord_set_item_pos(&coord, item_pos);
65310 +                       ih = node40_ih_at_coord(&coord);
65311 +
65312 +                       data[i].key = ih->key;
65313 +                       data[i].plugin_id = d16tocpu(&ih->plugin_id);
65314 +                       switch(data[i].plugin_id) {
65315 +                       case CTAIL_ID:
65316 +                       case FORMATTING_ID:
65317 +                               data[i].u.bytes = coord_num_units(&coord);
65318 +                               break;
65319 +                       case EXTENT_POINTER_ID:
65320 +                               data[i].u.bytes = extent_size(&coord, coord_num_units(&coord));
65321 +                               break;
65322 +                       case COMPOUND_DIR_ID:
65323 +                               data[i].u.entries = coord_num_units(&coord);
65324 +                               break;
65325 +                       default:
65326 +                               data[i].u.unused = NULL; /* no size is tracked for other item types */
65327 +                               break;
65328 +                       }
65329 +                       i ++;
65330 +               }
65331 +               assert("vs-1606", i == nr_items); /* every slot of @data was filled */
65332 +       }
65333 +       return data;
65334 +}
65335 +/* debug: check that items of @left and @right still match snapshot @vp (from shift_check_prepare()); frees @vp */
65336 +void
65337 +shift_check(void *vp, const znode *left, const znode *right)
65338 +{
65339 +       pos_in_node_t i, nr_items;
65340 +       coord_t coord;
65341 +       __u64 last_bytes;
65342 +       int mergeable;
65343 +       item_header40 *ih;
65344 +       pos_in_node_t item_pos;
65345 +       struct shift_check *data;
65346 +
65347 +       data = (struct shift_check *)vp;
65348 +       /* nothing to verify without a snapshot */
65349 +       if (data == NULL)
65350 +               return;
65351 +       /* last item of @left and first item of @right, when mergeable, are verified as one snapshot entry */
65352 +       if (node_is_empty(left) || node_is_empty(right))
65353 +               mergeable = 0;
65354 +       else {
65355 +               coord_t l, r;
65356 +
65357 +               coord_init_last_unit(&l, left);
65358 +               coord_init_first_unit(&r, right);
65359 +               mergeable = are_items_mergeable(&l, &r);
65360 +       }
65361 +
65362 +       nr_items = node40_num_of_items_internal(left) + node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
65363 +
65364 +       i = 0;
65365 +       last_bytes = 0;
65366 +
65367 +       coord_init_first_unit(&coord, left);
65368 +
65369 +       for (item_pos = 0; item_pos < node40_num_of_items_internal(left); item_pos ++) {
65370 +
65371 +               coord_set_item_pos(&coord, item_pos);
65372 +               ih = node40_ih_at_coord(&coord);
65373 +
65374 +               assert("vs-1611", i == item_pos);
65375 +               assert("vs-1590", keyeq(&ih->key, &data[i].key));
65376 +               assert("vs-1591", d16tocpu(&ih->plugin_id) == data[i].plugin_id);
65377 +               if ((i < (node40_num_of_items_internal(left) - 1)) || !mergeable) {
65378 +                       switch(data[i].plugin_id) {
65379 +                       case CTAIL_ID:
65380 +                       case FORMATTING_ID:
65381 +                               assert("vs-1592", data[i].u.bytes == coord_num_units(&coord));
65382 +                               break;
65383 +                       case EXTENT_POINTER_ID:
65384 +                               assert("vs-1593", data[i].u.bytes == extent_size(&coord, coord_num_units(&coord)));
65385 +                               break;
65386 +                       case COMPOUND_DIR_ID:
65387 +                               assert("vs-1594", data[i].u.entries == coord_num_units(&coord));
65388 +                               break;
65389 +                       default:
65390 +                               break;
65391 +                       }
65392 +               }
65393 +               if (item_pos == (node40_num_of_items_internal(left) - 1) && mergeable) {
65394 +                       switch(data[i].plugin_id) {
65395 +                       case CTAIL_ID:
65396 +                       case FORMATTING_ID:
65397 +                               last_bytes = coord_num_units(&coord);
65398 +                               break;
65399 +                       case EXTENT_POINTER_ID:
65400 +                               last_bytes = extent_size(&coord, coord_num_units(&coord));
65401 +                               break;
65402 +                       case COMPOUND_DIR_ID:
65403 +                               last_bytes = coord_num_units(&coord);
65404 +                               break;
65405 +                       default:
65406 +                               impossible("vs-1595", "wrong mergeable item");
65407 +                               break;
65408 +                       }
65409 +               }
65410 +               i ++;
65411 +       }
65412 +       /* first item of @right: when mergeable it shares snapshot entry i - 1 with the last item of @left */
65413 +       coord_init_first_unit(&coord, right);
65414 +       if (mergeable) {
65415 +               ih = node40_ih_at_coord(&coord);
65416 +
65417 +               assert("vs-1589", data[i - 1].plugin_id == d16tocpu(&ih->plugin_id));
65418 +               assert("vs-1608", last_bytes != 0);
65419 +               switch(data[i - 1].plugin_id) {
65420 +               case CTAIL_ID:
65421 +               case FORMATTING_ID:
65422 +                       assert("vs-1596", data[i - 1].u.bytes == last_bytes + coord_num_units(&coord));
65423 +                       break;
65424 +
65425 +               case EXTENT_POINTER_ID:
65426 +                       assert("vs-1597", data[i - 1].u.bytes == last_bytes + extent_size(&coord, coord_num_units(&coord)));
65427 +                       break;
65428 +               /* directory items are recorded in u.entries (cf. vs-1594, vs-1602), so compare that member */
65429 +               case COMPOUND_DIR_ID:
65430 +                       assert("vs-1598", data[i - 1].u.entries == last_bytes + coord_num_units(&coord));
65431 +                       break;
65432 +               default:
65433 +                       impossible("vs-1599", "wrong mergeable item");
65434 +                       break;
65435 +               }
65436 +               item_pos = 1;
65437 +       } else
65438 +               item_pos = 0;
65439 +       /* remaining items of @right are compared one-to-one with the rest of the snapshot */
65440 +       for (; item_pos < node40_num_of_items_internal(right); item_pos ++) {
65441 +
65442 +               coord_set_item_pos(&coord, item_pos);
65443 +               ih = node40_ih_at_coord(&coord);
65444 +
65445 +               assert("vs-1612", keyeq(&ih->key, &data[i].key));
65446 +               assert("vs-1613", d16tocpu(&ih->plugin_id) == data[i].plugin_id);
65447 +               switch(data[i].plugin_id) {
65448 +               case CTAIL_ID:
65449 +               case FORMATTING_ID:
65450 +                       assert("vs-1600", data[i].u.bytes == coord_num_units(&coord));
65451 +                       break;
65452 +               case EXTENT_POINTER_ID:
65453 +                       assert("vs-1601", data[i].u.bytes == extent_size(&coord, coord_num_units(&coord)));
65454 +                       break;
65455 +               case COMPOUND_DIR_ID:
65456 +                       assert("vs-1602", data[i].u.entries == coord_num_units(&coord));
65457 +                       break;
65458 +               default:
65459 +                       break;
65460 +               }
65461 +               i ++;
65462 +       }
65463 +
65464 +       assert("vs-1603", i == nr_items);
65465 +       reiser4_kfree(data);
65466 +}
65467 +
65468 +#endif
65469 +
65470 +ON_DEBUG_MODIFY(extern __u32 znode_checksum(const znode * node);)
65471 +
65472 +/* plugin->u.node.shift
65473 +   look for description of this method in plugin/node/node.h */
65474 +reiser4_internal int
65475 +shift_node40(coord_t *from, znode *to, shift_direction pend,
65476 +            int delete_child,  /* if @from->node becomes empty - it will be deleted from the tree if this is set to
65477 +                                  1 */
65478 +            int including_stop_coord /* */ ,
65479 +            carry_plugin_info *info)
65480 +{
65481 +       struct shift_params shift;
65482 +       int result;
65483 +       znode *left, *right;
65484 +       znode *source;
65485 +       int target_empty;
65486 +#if REISER4_DEBUG
65487 +       struct shift_check *check_data;
65488 +#endif
65489 +
65490 +       assert("nikita-2161", coord_check(from));
65491 +
65492 +       ON_DEBUG_MODIFY(znode_set_checksum(ZJNODE(to), 0));
65493 +
65494 +       xmemset(&shift, 0, sizeof (shift));
65495 +       shift.pend = pend;
65496 +       shift.wish_stop = *from;
65497 +       shift.target = to;
65498 +
65499 +       assert("nikita-1473", znode_is_write_locked(from->node));
65500 +       assert("nikita-1474", znode_is_write_locked(to));
65501 +       node_check(from->node, 0);
65502 +       node_check(to, 0);
65503 +
65504 +       source = from->node;
65505 +
65506 +       /* set @shift.wish_stop to rightmost/leftmost unit among units we want
65507 +          shifted */
65508 +       if (pend == SHIFT_LEFT) {
65509 +               result = coord_set_to_left(&shift.wish_stop);
65510 +               left = to;
65511 +               right = from->node;
65512 +       } else {
65513 +               result = coord_set_to_right(&shift.wish_stop);
65514 +               left = from->node;
65515 +               right = to;
65516 +       }
65517 +
65518 +       if (result) {
65519 +               /* move insertion coord even if there is nothing to move */
65520 +               if (including_stop_coord) {
65521 +                       /* move insertion coord (@from) */
65522 +                       if (pend == SHIFT_LEFT) {
65523 +                               /* after last item in target node */
65524 +                               coord_init_after_last_item(from, to);
65525 +                       } else {
65526 +                               /* before first item in target node */
65527 +                               coord_init_before_first_item(from, to);
65528 +                       }
65529 +               }
65530 +
65531 +               if (delete_child && node_is_empty(shift.wish_stop.node))
65532 +                       result = prepare_removal_node40(shift.wish_stop.node, info);
65533 +               else
65534 +                       result = 0;
65535 +               /* there is nothing to shift */
65536 +               assert("nikita-2078", coord_check(from));
65537 +               return result;
65538 +       }
65539 +
65540 +       target_empty = node_is_empty(to);
65541 +
65542 +       ON_DEBUG_MODIFY(assert("nikita-3427", to->cksum == znode_checksum(to)));
65543 +
65544 +       /* when first node plugin with item body compression is implemented,
65545 +          this must be changed to call node specific plugin */
65546 +
65547 +       /* shift->stop_coord is updated to last unit which really will be
65548 +          shifted */
65549 +       estimate_shift(&shift, get_current_context());
65550 +       if (!shift.shift_bytes) {
65551 +               /* we could not shift anything */
65552 +               assert("nikita-2079", coord_check(from));
65553 +               ON_DEBUG_MODIFY(assert("nikita-3433",
65554 +                                      to->cksum == znode_checksum(to)));
65555 +               return 0;
65556 +       }
65557 +
65558 +       ON_DEBUG(check_data = shift_check_prepare(left, right));
65559 +
65560 +       IF_TRACE(TRACE_COORDS, print_coord("shift->wish_stop before copy:", &shift.wish_stop, 0));
65561 +
65562 +       copy(&shift);
65563 +
65564 +       /* result value of this is important. It is used by adjust_coord below */
65565 +       result = delete_copied(&shift);
65566 +
65567 +       assert("vs-1610", result >= 0);
65568 +       assert("vs-1471", ((reiser4_context *) current->journal_info)->magic == context_magic);
65569 +
65570 +       /* item which has been moved from one node to another might want to do
65571 +          something on that event. This can be done by item's shift_hook
65572 +          method, which will be now called for every moved items */
65573 +       call_shift_hooks(&shift);
65574 +
65575 +       assert("vs-1472", ((reiser4_context *) current->journal_info)->magic == context_magic);
65576 +
65577 +       update_taps(&shift);
65578 +
65579 +       assert("vs-1473", ((reiser4_context *) current->journal_info)->magic == context_magic);
65580 +
65581 +       /* adjust @from pointer in accordance with @including_stop_coord flag
65582 +          and amount of data which was really shifted */
65583 +       adjust_coord(from, &shift, result, including_stop_coord);
65584 +
65585 +       if (target_empty)
65586 +               /*
65587 +                * items were shifted into empty node. Update delimiting key.
65588 +                */
65589 +               result = prepare_for_update(NULL, left, info);
65590 +
65591 +       /* add update operation to @info, which is the list of operations to
65592 +          be performed on a higher level */
65593 +       result = prepare_for_update(left, right, info);
65594 +       if (!result && node_is_empty(source) && delete_child) {
65595 +               /* all contents of @from->node is moved to @to and @from->node
65596 +                  has to be removed from the tree, so, on higher level we
65597 +                  will be removing the pointer to node @from->node */
65598 +               result = prepare_removal_node40(source, info);
65599 +       }
65600 +
65601 +#ifdef DEBUGGING_SHIFT
65602 +       dinfo("SHIFT TO %s: merging %d, entire %d, part %d, size %d\n",
65603 +             shift.pend == SHIFT_LEFT ? "LEFT" : "RIGHT",
65604 +             shift.merging_units, shift.entire, shift.part_units, shift.shift_bytes);
65605 +#endif
65606 +       ON_TRACE(TRACE_SHIFT, "shift: [%Li] %s--%s [%Li]: %i\n",
65607 +                *znode_get_block(left),
65608 +                (shift.pend == SHIFT_LEFT) ? "<" : "",
65609 +                (shift.pend == SHIFT_LEFT) ? "" : ">", *znode_get_block(right), shift.shift_bytes);
65610 +
65611 +       node_check(source, 0);
65612 +       node_check(to, 0);
65613 +       assert("nikita-2080", coord_check(from));
65614 +
65615 +       ON_DEBUG(shift_check(check_data, left, right));
65616 +
65617 +       return result ? result : (int) shift.shift_bytes;
65618 +}
65619 +
65620 +/* plugin->u.node.fast_insert(): look for description of this method in
65621 +   plugin/node/node.h. For node40 @coord is ignored and 1 is always returned */
65622 +reiser4_internal int
65623 +fast_insert_node40(const coord_t * coord UNUSED_ARG /* coord to query; ignored */ )
65624 +{
65625 +       return 1;
65626 +}
65627 +
65628 +/* plugin->u.node.fast_paste(): look for description of this method in
65629 +   plugin/node/node.h. For node40 @coord is ignored and 1 is always returned */
65630 +reiser4_internal int
65631 +fast_paste_node40(const coord_t * coord UNUSED_ARG /* coord to query; ignored */ )
65632 +{
65633 +       return 1;
65634 +}
65635 +
65636 +/* plugin->u.node.fast_cut(): look for description of this method in
65637 +   plugin/node/node.h. For node40 @coord is ignored and 1 is always returned */
65638 +reiser4_internal int
65639 +fast_cut_node40(const coord_t * coord UNUSED_ARG /* coord to query; ignored */ )
65640 +{
65641 +       return 1;
65642 +}
65643 +
65644 +/* plugin->u.node.modify - not defined */
65645 +
65646 +/* plugin->u.node.max_item_size: block size of current super block less one node40_header and one item_header40 */
65647 +reiser4_internal int
65648 +max_item_size_node40(void)
65649 +{
65650 +       return reiser4_get_current_sb()->s_blocksize - sizeof (node40_header) - sizeof (item_header40);
65651 +}
65652 +
65653 +/* plugin->u.node.set_item_plugin: record @id in the header of item at @coord and in @coord->iplugid; returns 0 */
65654 +reiser4_internal int
65655 +set_item_plugin_node40(coord_t *coord, item_id id)
65656 +{
65657 +       item_header40 *ih;
65658 +
65659 +       ih = node40_ih_at_coord(coord);
65660 +       cputod16(id, &ih->plugin_id);
65661 +       coord->iplugid = id;
65662 +       return 0;
65663 +}
65664 +
65665 +
65666 +/*
65667 +   Local variables:
65668 +   c-indentation-style: "K&R"
65669 +   mode-name: "LC"
65670 +   c-basic-offset: 8
65671 +   tab-width: 8
65672 +   fill-column: 120
65673 +   scroll-step: 1
65674 +   End:
65675 +*/
65676 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/node/node40.h linux-2.6.8-rc3-a/fs/reiser4/plugin/node/node40.h
65677 --- linux-2.6.8-rc3/fs/reiser4/plugin/node/node40.h     1970-01-01 03:00:00.000000000 +0300
65678 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/node/node40.h   2004-08-05 21:20:53.296615511 +0400
65679 @@ -0,0 +1,118 @@
65680 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
65681 +
65682 +#if !defined( __REISER4_NODE40_H__ )
65683 +#define __REISER4_NODE40_H__
65684 +
65685 +#include "../../forward.h"
65686 +#include "../../dformat.h"
65687 +#include "node.h"
65688 +
65689 +#include <linux/types.h>
65690 +
65691 +
65692 +/* format of node header for 40 node layouts. Keep bloat out of this struct.  */
65693 +typedef struct node40_header {
65694 +       /* identifier of node plugin. Must be located at the very beginning
65695 +          of a node. */
65696 +       common_node_header common_header;       /* this is 16 bits */
65697 +       /* number of items. Should be the first element after common_header,
65698 +          because we haven't yet finally decided whether it shouldn't go into
65699 +          common_header.
65700 +       */
65701 +/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
65702 + * node format at compile time, and it is this one, accesses do not involve a function pointer dereference when
65703 + * accessing these fields (and otherwise they do).  Probably 80% of users will only have one node format at a time throughout the life of reiser4.  */
65704 +       d16 nr_items;
65705 +       /* free space in node measured in bytes */
65706 +       d16 free_space;
65707 +       /* offset to start of free space in node */
65708 +       d16 free_space_start;
65709 +       /* for reiser4_fsck.  When information about what is a free
65710 +           block is corrupted, and we try to recover everything even
65711 +           if marked as freed, then old versions of data may
65712 +           duplicate newer versions, and this field allows us to
65713 +           restore the newer version.  Also useful for when users
65714 +           who don't have the new trashcan installed on their linux distro
65715 +           delete the wrong files and send us desperate emails
65716 +           offering $25 for them back.  */
65717 +       /* NOTE(review): no field directly follows the comment above; it presumably describes the flush stamp below - confirm */
65718 +       /* magic field we need to tell formatted nodes NIKITA-FIXME-HANS: improve this comment*/
65719 +       d32 magic;
65720 +       /* flushstamp is made of mkfs_id and flush_id (called mk_id and
65721 +          write_counter in older text). mkfs_id is an id generated randomly at
65722 +          mkreiserfs time. So we can just skip all nodes with different mkfs_id.
65723 +          flush_id is d64 incrementing counter of writes on disk. It is used for
65724 +          choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */
65725 +
65726 +       d32 mkfs_id;
65727 +       d64 flush_id;
65728 +       /* node flags to be used by fsck (reiser4ck or reiser4fsck?)
65729 +          and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */
65730 +       d16 flags;
65731 +
65732 +       /* 1 is leaf level, 2 is twig level, root is the numerically
65733 +          largest level */
65734 +       d8 level;
65735 +
65736 +       d8 pad;
65737 +} PACKED node40_header;
65738 +
65739 +/* item headers are not standard across all node layouts, pass
65740 +   pos_in_node to functions instead. Numbers in comments are byte offsets of fields within the header */
65741 +typedef struct item_header40 {
65742 +       /* key of item */
65743 +       /*  0 */ reiser4_key key;
65744 +       /* offset from start of a node measured in 8-byte chunks */
65745 +       /* 24 */ d16 offset;
65746 +       /* 26 */ d16 flags;
65747 +       /* 28 */ d16 plugin_id; /* item plugin id; written by set_item_plugin_node40() via cputod16() */
65748 +} PACKED item_header40;
65749 +
65750 +size_t item_overhead_node40(const znode * node, flow_t * aflow);
65751 +size_t free_space_node40(znode * node);
65752 +node_search_result lookup_node40(znode * node, const reiser4_key * key, lookup_bias bias, coord_t * coord);
65753 +int num_of_items_node40(const znode * node);
65754 +char *item_by_coord_node40(const coord_t * coord);
65755 +int length_by_coord_node40(const coord_t * coord);
65756 +item_plugin *plugin_by_coord_node40(const coord_t * coord);
65757 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
65758 +size_t estimate_node40(znode * node);
65759 +int check_node40(const znode * node, __u32 flags, const char **error);
65760 +int parse_node40(znode * node);
65761 +#if REISER4_DEBUG_OUTPUT
65762 +void print_node40(const char *prefix, const znode * node, __u32 flags);
65763 +#endif
65764 +int init_node40(znode * node);
65765 +int guess_node40(const znode * node);
65766 +void change_item_size_node40(coord_t * coord, int by);
65767 +int create_item_node40(coord_t * target, const reiser4_key * key, reiser4_item_data * data, carry_plugin_info * info);
65768 +void update_item_key_node40(coord_t * target, const reiser4_key * key, carry_plugin_info * info);
65769 +int kill_node40(struct carry_kill_data *, carry_plugin_info *);
65770 +int cut_node40(struct carry_cut_data *, carry_plugin_info *);
65771 +int shift_node40(coord_t * from, znode * to, shift_direction pend,
65772 +                /* if @from->node becomes
65773 +                   empty - it will be deleted from
65774 +                   the tree if this is set to 1
65775 +                */
65776 +                int delete_child, int including_stop_coord, carry_plugin_info * info);
65777 +
65778 +int fast_insert_node40(const coord_t * coord);
65779 +int fast_paste_node40(const coord_t * coord);
65780 +int fast_cut_node40(const coord_t * coord);
65781 +int max_item_size_node40(void);
65782 +int prepare_removal_node40(znode * empty, carry_plugin_info * info);
65783 +int set_item_plugin_node40(coord_t * coord, item_id id);
65784 +int shrink_item_node40(coord_t *coord, int delta);
65785 +
65786 +/* __REISER4_NODE40_H__ */
65787 +#endif
65788 +/*
65789 +   Local variables:
65790 +   c-indentation-style: "K&R"
65791 +   mode-name: "LC"
65792 +   c-basic-offset: 8
65793 +   tab-width: 8
65794 +   fill-column: 120
65795 +   scroll-step: 1
65796 +   End:
65797 +*/
65798 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/object.c linux-2.6.8-rc3-a/fs/reiser4/plugin/object.c
65799 --- linux-2.6.8-rc3/fs/reiser4/plugin/object.c  1970-01-01 03:00:00.000000000 +0300
65800 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/object.c        2004-08-05 21:20:53.155645245 +0400
65801 @@ -0,0 +1,1650 @@
65802 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
65803 + * reiser4/README */
65804 +
65805 +/* Examples of object plugins: file, directory, symlink, special file */
65806 +/* Plugins associated with inode:
65807 +
65808 +   Plugin of inode is plugin referenced by plugin-id field of on-disk
65809 +   stat-data. How we store this plugin in in-core inode is not
65810 +   important. Currently pointers are used, another variant is to store
65811 +   offsets and do array lookup on each access.
65812 +
65813 +   Now, each inode has one selected plugin: object plugin that
65814 +   determines what type of file this object is: directory, regular etc.
65815 +
65816 +   This main plugin can use other plugins that are thus subordinated to
65817 +   it. Directory instance of object plugin uses hash; regular file
65818 +   instance uses tail policy plugin.
65819 +
65820 +   Object plugin is either taken from id in stat-data or guessed from
65821 +   i_mode bits. Once it is established we ask it to install its
65822 +   subordinate plugins, by looking again in stat-data or inheriting them
65823 +   from parent.
65824 +*/
65825 +/* How new inode is initialized during ->read_inode():
65826 +    1 read stat-data and initialize inode fields: i_size, i_mode,
65827 +      i_generation, capabilities etc.
65828 +    2 read plugin id from stat data or try to guess plugin id
65829 +      from inode->i_mode bits if plugin id is missing.
65830 +    3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
65831 +
65832 +NIKITA-FIXME-HANS: can you say a little about 1 being done before 3?  What if stat data does contain i_size, etc., due to it being an unusual plugin?
65833 +    4 Call ->activate() method of object's plugin. Plugin is either read
65834 +      from stat-data or guessed from mode bits
65835 +    5 Call ->inherit() method of object plugin to inherit as yet
65836 +      uninitialized plugins from parent.
65837 +      (NIKITA-FIXME-HANS asked whether an "un" was missing in "initialized"
65838 +      here; it was: only uninitialized plugins are left to inherit.)
65839 +
65840 +   Easy induction proves that on last step all plugins of inode would be
65841 +   initialized.
65842 +
65843 +   When creating new object:
65844 +    1 obtain object plugin id (see next period)
65845 +NIKITA-FIXME-HANS: period?
65846 +    2 ->install() this plugin
65847 +    3 ->inherit() the rest from the parent
65848 +
65849 +*/
65850 +/* We need some examples of creating an object with default and
65851 +  non-default plugin ids.  Nikita, please create them.
65852 +*/
65853 +
65854 +#include "../forward.h"
65855 +#include "../debug.h"
65856 +#include "../key.h"
65857 +#include "../kassign.h"
65858 +#include "../coord.h"
65859 +#include "../seal.h"
65860 +#include "plugin_header.h"
65861 +#include "item/static_stat.h"
65862 +#include "file/file.h"
65863 +#include "file/pseudo.h"
65864 +#include "symlink.h"
65865 +#include "dir/dir.h"
65866 +#include "item/item.h"
65867 +#include "plugin.h"
65868 +#include "object.h"
65869 +#include "../znode.h"
65870 +#include "../tap.h"
65871 +#include "../tree.h"
65872 +#include "../vfs_ops.h"
65873 +#include "../inode.h"
65874 +#include "../super.h"
65875 +#include "../reiser4.h"
65876 +#include "../prof.h"
65877 +#include "../safe_link.h"
65878 +
65879 +#include <linux/types.h>
65880 +#include <linux/fs.h>
65881 +#include <linux/dcache.h>
65882 +#include <linux/quotaops.h>
65883 +#include <linux/security.h> /* security_inode_delete() */
65884 +#include <linux/writeback.h> /* wake_up_inode() */
65885 +#include <linux/xattr_acl.h>
65886 +#include <linux/xattr.h>
65887 +
65888 +/* helper function to print errors: warns about @code and dumps @key and @inode; prints nothing for -ENOMEM */
65889 +static void
65890 +key_warning(const reiser4_key * key /* key to print */,
65891 +           const struct inode *inode /* inode to print */,
65892 +           int code /* error code to print */)
65893 +{
65894 +       assert("nikita-716", key != NULL);
65895 +
65896 +       if (code != -ENOMEM) {
65897 +               warning("nikita-717", "Error for inode %llu (%i)",
65898 +                       get_key_objectid(key), code);
65899 +               print_key("for key", key);
65900 +               print_inode("inode", inode);
65901 +       }
65902 +}
65903 +
65904 +/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? Debug-only: asserts that @key matches the unit key at @coord and that its objectid is the oid of @inode */
65905 +#if REISER4_DEBUG
65906 +static void
65907 +check_inode_seal(const struct inode *inode,
65908 +                const coord_t *coord, const reiser4_key *key)
65909 +{
65910 +       reiser4_key unit_key;
65911 +
65912 +       unit_key_by_coord(coord, &unit_key);
65913 +       assert("nikita-2752",
65914 +              WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
65915 +       assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
65916 +}
65917 +/* debug-only: check that @coord is an existing stat-data unit on the leaf level whose key is @key */
65918 +static void
65919 +check_sd_coord(coord_t *coord, const reiser4_key *key)
65920 +{
65921 +       reiser4_key ukey;
65922 +
65923 +       coord_clear_iplug(coord);
65924 +       if (zload(coord->node))
65925 +               return;
65926 +       /* reached only when zload() returned 0; any failed condition below is reported and treated as impossible */
65927 +       if (!coord_is_existing_unit(coord) ||
65928 +           !item_plugin_by_coord(coord) ||
65929 +           !keyeq(unit_key_by_coord(coord, &ukey), key) ||
65930 +           (znode_get_level(coord->node) != LEAF_LEVEL) ||
65931 +           !item_is_statdata(coord)) {
65932 +               warning("nikita-1901", "Conspicuous seal");
65933 +               print_key("key", key);
65934 +               print_coord("coord", coord, 1);
65935 +               impossible("nikita-2877", "no way");
65936 +       }
65937 +       zrelse(coord->node);
65938 +}
65939 +
65940 +#else
65941 +#define check_inode_seal(inode, coord, key) noop
65942 +#define check_sd_coord(coord, key) noop
65943 +#endif
65944 +
65945 +/* find sd of inode in a tree, deal with errors. Returns result of coord_by_key(); warns on non-zero result unless @silent */
65946 +reiser4_internal int
65947 +lookup_sd(struct inode *inode /* inode to look sd for */ ,
65948 +         znode_lock_mode lock_mode /* lock mode */ ,
65949 +         coord_t * coord /* resulting coord */ ,
65950 +         lock_handle * lh /* resulting lock handle */ ,
65951 +         const reiser4_key * key /* key of stat-data to look for */,
65952 +         int silent /* if set, no warning is printed on failure */)
65953 +{
65954 +       int result;
65955 +       __u32 flags;
65956 +
65957 +       assert("nikita-1692", inode != NULL);
65958 +       assert("nikita-1693", coord != NULL);
65959 +       assert("nikita-1694", key != NULL);
65960 +
65961 +       /* look for the object's stat data in a tree.
65962 +          This fills @coord and @lh (the "node" and "pos" of older text);
65963 +          presumably both are only valid if 0 is returned - confirm against
65964 +          coord_by_key(). */
65965 +       flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
65966 +       flags |= CBK_UNIQUE;
65967 +       /*
65968 +        * traverse tree to find stat data. We cannot use vroot here, because
65969 +        * it only covers _body_ of the file, and stat data don't belong
65970 +        * there.
65971 +        */
65972 +       result = coord_by_key(tree_by_inode(inode),
65973 +                              key,
65974 +                              coord,
65975 +                              lh,
65976 +                              lock_mode,
65977 +                              FIND_EXACT,
65978 +                              LEAF_LEVEL,
65979 +                              LEAF_LEVEL,
65980 +                              flags,
65981 +                              0);
65982 +       if (REISER4_DEBUG && result == 0)
65983 +               check_sd_coord(coord, key);
65984 +
65985 +       if (result != 0 && !silent)
65986 +               key_warning(key, inode, result);
65987 +       return result;
65988 +}
65989 +
65990 +/* insert new stat-data into tree. Called with inode state locked. Return inode state locked.
65991 +    NOTE(review): the body takes and drops spin_lock_inode() itself - confirm which lock is meant above. */
65992 +static int
65993 +insert_new_sd(struct inode *inode /* inode to create sd for */ )
65994 +{
65995 +       int result;
65996 +       reiser4_key key;
65997 +       coord_t coord;
65998 +       reiser4_item_data data;
65999 +       char *area;
66000 +       reiser4_inode *ref;
66001 +       lock_handle lh;
66002 +       oid_t oid;
66003 +
66004 +       assert("nikita-723", inode != NULL);
66005 +       assert("nikita-3406", inode_get_flag(inode, REISER4_NO_SD));
66006 +
66007 +       ref = reiser4_inode_data(inode);
66008 +       spin_lock_inode(inode);
66009 +
66010 +       /*
66011 +        * prepare specification of new item to be inserted
66012 +        */
66013 +
66014 +       data.iplug = inode_sd_plugin(inode);
66015 +       data.length = data.iplug->s.sd.save_len(inode);
66016 +       spin_unlock_inode(inode);
66017 +
66018 +       data.data = NULL;
66019 +       data.user = 0;
66020 +/* could be optimized for case where there is only one node format in
66021 + * use in the filesystem, probably there are lots of such
66022 + * places we could optimize for only one node layout.... -Hans */
66023 +       if (data.length > tree_by_inode(inode)->nplug->max_item_size()) {
66024 +               /* This is silly check, but we don't know actual node where
66025 +                  insertion will go into. */
66026 +               return RETERR(-ENAMETOOLONG);
66027 +       }
66028 +       oid = oid_allocate(inode->i_sb);
66029 +/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be encapsulated into oid_allocate? */
66030 +       if (oid == ABSOLUTE_MAX_OID)
66031 +               return RETERR(-EOVERFLOW);
66032 +
66033 +       set_inode_oid(inode, oid);
66034 +       /* NOTE(review): no release of @oid is visible in this function if the insertion below fails - confirm */
66035 +       coord_init_zero(&coord);
66036 +       init_lh(&lh);
66037 +
66038 +       result = insert_by_key(tree_by_inode(inode),
66039 +                              build_sd_key(inode, &key),
66040 +                              &data,
66041 +                              &coord,
66042 +                              &lh,
66043 +                              /* stat data lives on a leaf level */
66044 +                              LEAF_LEVEL,
66045 +                              CBK_UNIQUE);
66046 +
66047 +       /* we don't want to re-check that somebody didn't insert
66048 +          stat-data while we were doing io, because if it did,
66049 +          insert_by_key() returned error. */
66050 +       /* but what _is_ possible is that plugin for inode's stat-data,
66051 +          list of non-standard plugins or their state would change
66052 +          during io, so that stat-data wouldn't fit into sd. To avoid
66053 +          this race we keep inode_state lock. This lock has to be
66054 +          taken each time you access inode in a way that would cause
66055 +          changes in sd size: changing plugins etc.
66056 +       */
66057 +
66058 +       if (result == IBK_INSERT_OK) {
66059 +               write_current_logf(WRITE_TREE_LOG, "..sd i %#llx %#llx",
66060 +                                  get_inode_oid(inode), ref->locality_id);
66061 +
66062 +               coord_clear_iplug(&coord);
66063 +               result = zload(coord.node);
66064 +               if (result == 0) {
66065 +                       /* have we really inserted stat data? */
66066 +                       assert("nikita-725", item_is_statdata(&coord));
66067 +
66068 +                       /* inode was just created. It is inserted into hash
66069 +                          table, but no directory entry was yet inserted into
66070 +                          parent. So, inode is inaccessible through
66071 +                          ->lookup(). All places that directly grab inode
66072 +                          from hash-table (like old knfsd), should check
66073 +                          IMMUTABLE flag that is set by common_create_child.
66074 +                       */
66075 +                       assert("nikita-3240", data.iplug != NULL);
66076 +                       assert("nikita-3241", data.iplug->s.sd.save != NULL);
66077 +                       area = item_body_by_coord(&coord);
66078 +                       result = data.iplug->s.sd.save(inode, &area);
66079 +                       znode_make_dirty(coord.node);
66080 +                       if (result == 0) {
66081 +                               /* object has stat-data now */
66082 +                               inode_clr_flag(inode, REISER4_NO_SD);
66083 +                               inode_set_flag(inode, REISER4_SDLEN_KNOWN);
66084 +                               /* initialise stat-data seal */
66085 +                               seal_init(&ref->sd_seal, &coord, &key);
66086 +                               ref->sd_coord = coord;
66087 +                               check_inode_seal(inode, &coord, &key);
66088 +                       } else if (result != -ENOMEM)
66089 +                               /*
66090 +                                * convert any other error code to -EIO to
66091 +                                * avoid confusing user level with unexpected
66092 +                                * errors.
66093 +                                */
66094 +                               result = RETERR(-EIO);
66095 +                       zrelse(coord.node);
66096 +               }
66097 +       }
66098 +       done_lh(&lh);
66099 +
66100 +       if (result != 0)
66101 +               key_warning(&key, inode, result);
66102 +       else
66103 +               oid_count_allocated();
66104 +
66105 +       return result;
66106 +}
66107 +
66108 +/* Update the stat-data item at @coord (item key @key) from in-memory @inode.
66109 +   @lh is handed through to resize_item(). Returns 0 or an error code. */
66110 +static int
66111 +update_sd_at(struct inode * inode, coord_t * coord, reiser4_key * key,
66112 +            lock_handle * lh)
66113 +{
66114 +       int                result;
66115 +       reiser4_item_data  data;
66116 +       char              *area;
66117 +       reiser4_inode     *state;
66118 +       znode             *loaded;
66119 +
66120 +       state = reiser4_inode_data(inode);
66121 +
66122 +       coord_clear_iplug(coord);
66123 +       result = zload(coord->node);
66124 +       if (result != 0)
66125 +               return result;
66126 +       loaded = coord->node; /* remembered so that it can be zrelse()d later */
66127 +
66128 +       spin_lock_inode(inode);
66129 +       assert("nikita-728", inode_sd_plugin(inode) != NULL);
66130 +       data.iplug = inode_sd_plugin(inode);
66131 +
66132 +       /* if inode has non-standard plugins, add appropriate stat data
66133 +        * extension */
66134 +       if (state->plugin_mask != 0)
66135 +               inode_set_extension(inode, PLUGIN_STAT);
66136 +
66137 +       /* data.length is the size change of the sd item: how much space
66138 +          to add to it (or to remove from it, if negative) */
66139 +       if (!inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
66140 +               /* recalculate stat-data length */
66141 +               data.length =
66142 +                       data.iplug->s.sd.save_len(inode) -
66143 +                       item_length_by_coord(coord);
66144 +               inode_set_flag(inode, REISER4_SDLEN_KNOWN);
66145 +       } else
66146 +               data.length = 0;
66147 +       spin_unlock_inode(inode);
66148 +
66149 +       /* if on-disk stat data is of different length than required
66150 +          for this inode, resize it */
66151 +       if (data.length != 0) {
66152 +               data.data = NULL;
66153 +               data.user = 0;
66154 +
66155 +               /* insertion code requires that the insertion point (coord)
66156 +                * is between units. */
66157 +               coord->between = AFTER_UNIT;
66158 +               result = resize_item(coord,
66159 +                                    &data, key, lh, COPI_DONT_SHIFT_LEFT);
66160 +               if (result != 0) {
66161 +                       key_warning(key, inode, result);
66162 +                       zrelse(loaded);
66163 +                       return result;
66164 +               }
66165 +               if (loaded != coord->node) {
66166 +                       /* resize_item moved coord to another node: release
                                  the old node and zload the new one */
66167 +                       zrelse(loaded);
66168 +                       coord_clear_iplug(coord);
66169 +                       result = zload(coord->node);
66170 +                       if (result != 0)
66171 +                               return result; /* nothing is loaded at this point */
66172 +                       loaded = coord->node;
66173 +               }
66174 +       }
66175 +
66176 +       area = item_body_by_coord(coord);
66177 +       spin_lock_inode(inode);
66178 +       result = data.iplug->s.sd.save(inode, &area);
66179 +       znode_make_dirty(coord->node); /* done whether or not ->save() succeeded */
66180 +
66181 +       /* re-initialise stat-data seal */
66182 +
66183 +       /*
66184 +        * coord.between was possibly skewed from AT_UNIT when stat-data size
66185 +        * was changed and new extensions were pasted into item.
66186 +        */
66187 +       coord->between = AT_UNIT;
66188 +       seal_init(&state->sd_seal, coord, key);
66189 +       state->sd_coord = *coord;
66190 +       spin_unlock_inode(inode);
66191 +       check_inode_seal(inode, coord, key);
66192 +       zrelse(loaded);
66193 +       return result; /* result of ->save() */
66194 +}
66195 +/* Locate stat-data of @inode: fill @key, then set @coord/@lh via the cached seal or, failing that, lookup_sd(). */
66196 +reiser4_internal int
66197 +locate_inode_sd(struct inode *inode,
66198 +               reiser4_key *key,
66199 +               coord_t *coord,
66200 +               lock_handle *lh)
66201 +{
66202 +       reiser4_inode *state;
66203 +       seal_t seal;
66204 +       int result;
66205 +
66206 +       assert("nikita-3483", inode != NULL);
66207 +
66208 +       state = reiser4_inode_data(inode);
66209 +       /* take private copies of cached coord and seal under the inode spin lock */
66210 +       spin_lock_inode(inode);
66210 +       *coord = state->sd_coord;
66211 +       coord_clear_iplug(coord);
66212 +       seal = state->sd_seal;
66213 +       spin_unlock_inode(inode);
66214 +
66215 +       build_sd_key(inode, key);
66216 +       if (seal_is_set(&seal)) {
66217 +               /* first, try to use seal */
66218 +               result = seal_validate(&seal,
66219 +                                      coord,
66220 +                                      key,
66221 +                                      LEAF_LEVEL,
66222 +                                      lh,
66223 +                                      FIND_EXACT,
66224 +                                      ZNODE_WRITE_LOCK,
66225 +                                      ZNODE_LOCK_LOPRI);
66226 +               if (result == 0)
66227 +                       check_sd_coord(coord, key);
66228 +       } else
66229 +               result = -E_REPEAT; /* no seal: force the lookup below */
66230 +
66231 +       if (result != 0) {
66232 +               coord_init_zero(coord);
66233 +               result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
66234 +       }
66235 +       return result;
66236 +}
66237 +
66238 +/* Update existing stat-data of @inode in the tree: locate the item, then rewrite it. NOTE(review): originally
66239 +   documented as "called with inode state locked, returns inode state locked" - not visible here, confirm. */
66240 +static int
66241 +update_sd(struct inode *inode /* inode to update sd for */ )
66242 +{
66243 +       int result;
66244 +       reiser4_key key;
66245 +       coord_t coord;
66246 +       lock_handle lh;
66247 +
66248 +       assert("nikita-726", inode != NULL);
66249 +
66250 +       /* no stat-data, nothing to update?! */
66251 +       assert("nikita-3482", !inode_get_flag(inode, REISER4_NO_SD));
66252 +
66253 +       init_lh(&lh);
66254 +
66255 +       result = locate_inode_sd(inode, &key, &coord, &lh);
66256 +       if (result == 0)
66257 +               result = update_sd_at(inode, &coord, &key, &lh);
66258 +       done_lh(&lh); /* lock handle is released on success and on failure */
66259 +
66260 +       return result;
66261 +}
66262 +/* NIKITA-FIXME-HANS: the distinction between writing and updating made in the function names seems muddled, please adopt a better function naming strategy */
66263 +/* Save object's stat-data: insert a new item if the inode has none yet (REISER4_NO_SD), update the existing one otherwise. */
66264 +reiser4_internal int
66265 +write_sd_by_inode_common(struct inode *inode /* object to save */)
66266 +{
66267 +       int result;
66268 +
66269 +       assert("nikita-730", inode != NULL);
66270 +
66271 +       mark_inode_update(inode, 1);
66272 +
66273 +       if (inode_get_flag(inode, REISER4_NO_SD))
66274 +               /* object doesn't have stat-data yet */
66275 +               result = insert_new_sd(inode);
66276 +       else
66277 +               result = update_sd(inode);
66278 +       if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
66279 +               /* Don't issue warnings about "name is too long" or -ENOMEM */
66280 +               warning("nikita-2221", "Failed to save sd for %llu: %i",
66281 +                       get_inode_oid(inode), result);
66282 +       return result;
66283 +}
66284 +
66285 +/* Checks whether one more hard link to this object can be added; returns non-zero if it can. */
66286 +reiser4_internal int
66287 +can_add_link_common(const struct inode *object /* object to check */ )
66288 +{
66289 +       assert("nikita-732", object != NULL);
66290 +
66291 +       /* inode->i_nlink is unsigned int, so just check that incrementing
66292 +        * it would not wrap around to zero */
66293 +       return object->i_nlink + 1 != 0;
66294 +}
66295 +
66296 +/* Remove object's stat-data, release its oid and delete its SAFE_UNLINK safe-link. Space must be reserved by the caller beforehand. */
66297 +reiser4_internal int
66298 +common_object_delete_no_reserve(struct inode *inode /* object to remove */,
66299 +                               int mode /* cut_mode, passed to cut_tree() */)
66300 +{
66301 +       int result;
66302 +
66303 +       assert("nikita-1477", inode != NULL);
66304 +
66305 +       if (!inode_get_flag(inode, REISER4_NO_SD)) {
66306 +               reiser4_key sd_key;
66307 +
66308 +               DQUOT_FREE_INODE(inode);
66309 +               DQUOT_DROP(inode);
66310 +
66311 +               build_sd_key(inode, &sd_key);
66312 +               write_current_logf(WRITE_TREE_LOG, "..sd k %#llx", get_inode_oid(inode));
66313 +               result = cut_tree(tree_by_inode(inode), &sd_key, &sd_key, NULL, mode);
66314 +               if (result == 0) {
66315 +                       inode_set_flag(inode, REISER4_NO_SD);
66316 +                       result = oid_release(inode->i_sb, get_inode_oid(inode));
66317 +                       if (result == 0) {
66318 +                               oid_count_released();
66319 +
66320 +                               result = safe_link_del(inode, SAFE_UNLINK);
66321 +                       }
66322 +               }
66323 +       } else
66324 +               result = 0; /* no stat-data on disk: nothing to remove */
66325 +       return result;
66326 +}
66327 +
66328 +/* Delete object's stat-data, reserving the space needed for it first. To be used when file deletion turns into stat-data removal. */
66329 +reiser4_internal int
66330 +delete_object(struct inode *inode /* object to remove */, int mode /* cut mode */)
66331 +{
66332 +       int result;
66333 +
66334 +       assert("nikita-1477", inode != NULL); /* NOTE(review): same label as in common_object_delete_no_reserve() */
66335 +       assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
66336 +       assert("nikita-3421", inode->i_nlink == 0);
66337 +
66338 +       if (!inode_get_flag(inode, REISER4_NO_SD)) {
66339 +               reiser4_block_nr reserve;
66340 +
66341 +               /* grab space which is needed to remove 2 items from the tree:
66342 +                  stat data and safe-link */
66343 +               reserve = 2 * estimate_one_item_removal(tree_by_inode(inode));
66344 +               if (reiser4_grab_space_force(reserve,
66345 +                                            BA_RESERVED | BA_CAN_COMMIT))
66346 +                       return RETERR(-ENOSPC);
66347 +               result = common_object_delete_no_reserve(inode, mode);
66348 +       } else
66349 +               result = 0; /* no stat-data on disk: nothing to delete */
66350 +       return result;
66351 +}
66352 +/* Delete a file's stat-data: delete_object() with cut mode 1. */
66353 +reiser4_internal int
66354 +delete_file_common(struct inode * inode)
66355 +{
66356 +       return delete_object(inode, 1);
66357 +}
66358 +
66359 +/* Delete a common directory, which consists of two items: stat data and one item containing "." and "..". */
66360 +static int delete_directory_common(struct inode *inode)
66361 +{
66362 +       int result;
66363 +       dir_plugin *dplug;
66364 +
66365 +       dplug = inode_dir_plugin(inode);
66366 +       assert("vs-1101", dplug && dplug->done);
66367 +
66368 +       /* grab enough space for removing two items */
66369 +       if (reiser4_grab_space(2 * estimate_one_item_removal(tree_by_inode(inode)), BA_RESERVED | BA_CAN_COMMIT))
66370 +               return RETERR(-ENOSPC);
66371 +
66372 +       result = dplug->done(inode);
66373 +       if (!result)
66374 +               result = common_object_delete_no_reserve(inode, 1);
66375 +       all_grabbed2free(); /* called whether or not the removal succeeded */
66376 +       return result;
66377 +}
66378 +
66379 +/* ->set_plug_in_inode() default method: set up mode, owner, times, nlink, flags, operations and sd extension mask of new @object. Always returns 0. */
66380 +static int
66381 +set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
66382 +                        struct inode *parent /* parent object */ ,
66383 +                        reiser4_object_create_data * data      /* creation
66384 +                                                                * data */ )
66385 +{
66386 +       __u64 mask;
66387 +
66388 +       object->i_mode = data->mode;
66389 +       /* this should be plugin decision */
66390 +       object->i_uid = current->fsuid;
66391 +       object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
66392 +/* NIKITA-FIXME-HANS: which is defined as what where? */
66393 +       /* support for BSD style group-id assignment. */
66394 +       if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
66395 +               object->i_gid = parent->i_gid;
66396 +       else if (parent->i_mode & S_ISGID) {
66397 +               /* parent directory has sgid bit */
66398 +               object->i_gid = parent->i_gid;
66399 +               if (S_ISDIR(object->i_mode))
66400 +                       /* sgid is inherited by sub-directories */
66401 +                       object->i_mode |= S_ISGID;
66402 +       } else
66403 +               object->i_gid = current->fsgid;
66404 +
66405 +       /* this object doesn't have stat-data yet */
66406 +       inode_set_flag(object, REISER4_NO_SD);
66407 +       /* setup inode and file-operations for this inode */
66408 +       setup_inode_ops(object, data);
66409 +       object->i_nlink = 0;
66410 +       seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
66411 +       mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
66412 +       if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
66413 +               mask |= (1 << LARGE_TIMES_STAT);
66414 +
66415 +       reiser4_inode_data(object)->extmask = mask; /* stat-data extensions selected above */
66416 +       return 0;
66417 +}
66418 +
66419 +/* Determine object plugin for @inode based on i_mode.
66420 +
66421 +   Many objects in reiser4 file system are controlled by standard object
66422 +   plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on.
66423 +
66424 +   For such files we don't explicitly store plugin id in object stat
66425 +   data. Rather, the required plugin is guessed from mode bits, where file
66426 +   "type" is encoded (see stat(2)).
66427 +   Returns 0 on success, -EIO (through RETERR) when the file type in i_mode is not recognised. */
66428 +reiser4_internal int
66429 +guess_plugin_by_mode(struct inode *inode       /* object to guess plugins
66430 +                                                * for */ )
66431 +{
66432 +       int fplug_id;
66433 +       int dplug_id;
66434 +       reiser4_inode *info;
66435 +
66436 +       assert("nikita-736", inode != NULL);
66437 +
66438 +       dplug_id = fplug_id = -1; /* -1 means "no plugin" */
66439 +
66440 +       switch (inode->i_mode & S_IFMT) {
66441 +       case S_IFSOCK:
66442 +       case S_IFBLK:
66443 +       case S_IFCHR:
66444 +       case S_IFIFO:
66445 +               fplug_id = SPECIAL_FILE_PLUGIN_ID;
66446 +               break;
66447 +       case S_IFLNK:
66448 +               fplug_id = SYMLINK_FILE_PLUGIN_ID;
66449 +               break;
66450 +       case S_IFDIR:
66451 +               fplug_id = DIRECTORY_FILE_PLUGIN_ID;
66452 +               dplug_id = HASHED_DIR_PLUGIN_ID;
66453 +               break;
66454 +       default: /* note: placed before the S_IFREG case; it returns, so no fall-through */
66455 +               warning("nikita-737", "wrong file mode: %o", inode->i_mode);
66456 +               return RETERR(-EIO);
66457 +       case S_IFREG:
66458 +               fplug_id = UNIX_FILE_PLUGIN_ID;
66459 +               break;
66460 +       }
66461 +       info = reiser4_inode_data(inode);
66462 +       plugin_set_file(&info->pset,
66463 +                       (fplug_id >= 0) ? file_plugin_by_id(fplug_id) : NULL);
66464 +       plugin_set_dir(&info->pset, /* dir plugin stays NULL unless this is a directory */
66465 +                      (dplug_id >= 0) ? dir_plugin_by_id(dplug_id) : NULL);
66466 +       return 0;
66467 +}
66468 +
66469 +/* this common implementation of the create estimation function may be used when object creation involves insertion
66470 +   of one item (usually stat data) into the tree */
66471 +static reiser4_block_nr estimate_create_file_common(struct inode *object)
66472 +{
66473 +       return estimate_one_insert_item(tree_by_inode(object));
66474 +}
66475 +
66476 +/* this common implementation of the create directory estimation function may be used when directory creation involves
66477 +   insertion of two items (usually stat data and item containing "." and "..") into the tree */
66478 +static reiser4_block_nr estimate_create_dir_common(struct inode *object)
66479 +{
66480 +       return 2 * estimate_one_insert_item(tree_by_inode(object));
66481 +}
66482 +
66483 +/* ->create method of object plugin: reserve space for one item insertion, then write @object's stat-data. Returns -ENOSPC if space can't be grabbed. */
66484 +static int
66485 +create_common(struct inode *object, struct inode *parent UNUSED_ARG,
66486 +             reiser4_object_create_data * data UNUSED_ARG)
66487 +{
66488 +       reiser4_block_nr reserve;
66489 +       assert("nikita-744", object != NULL);
66490 +       assert("nikita-745", parent != NULL);
66491 +       assert("nikita-747", data != NULL);
66492 +       assert("nikita-748", inode_get_flag(object, REISER4_NO_SD)); /* object must not have stat-data yet */
66493 +
66494 +       reserve = estimate_create_file_common(object);
66495 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
66496 +               return RETERR(-ENOSPC);
66497 +       return write_sd_by_inode_common(object);
66498 +}
66499 +
66500 +/* standard implementation of ->owns_item() plugin method: true when @coord is an existing item whose key has
66501 +    the same objectid as the stat-data key of @inode */
66502 +reiser4_internal int
66503 +owns_item_common(const struct inode *inode     /* object to check
66504 +                                                * against */ ,
66505 +                const coord_t * coord /* coord to check */ )
66506 +{
66507 +       reiser4_key item_key;
66508 +       reiser4_key file_key;
66509 +
66510 +       assert("nikita-760", inode != NULL);
66511 +       assert("nikita-761", coord != NULL);
66512 +
66513 +       return                  /*coord_is_in_node( coord ) && */
66514 +           coord_is_existing_item(coord) &&
66515 +           (get_key_objectid(build_sd_key(inode, &file_key)) == get_key_objectid(item_key_by_coord(coord, &item_key)));
66516 +}
66517 +
66518 +/* @count bytes of flow @f got written: advance f->data (when set) and the
66519 +   offset in f->key by @count, and shrink f->length by @count */
66520 +reiser4_internal void
66521 +move_flow_forward(flow_t * f, unsigned count)
66522 +{
66523 +       if (f->data)
66524 +               f->data += count;
66525 +       f->length -= count;
66526 +       set_key_offset(&f->key, get_key_offset(&f->key) + count);
66527 +}
66528 +
66529 +/* default ->add_link() method of file plugin; @parent is unused. Always returns 0. */
66530 +static int
66531 +add_link_common(struct inode *object, struct inode *parent UNUSED_ARG)
66532 +{
66533 +       /*
66534 +        * increment ->i_nlink and set ->i_ctime to the current time
66535 +        */
66536 +
66537 +       INODE_INC_FIELD(object, i_nlink);
66538 +       object->i_ctime = CURRENT_TIME;
66539 +       return 0;
66540 +}
66541 +
66542 +/* default ->rem_link() method of file plugin; @parent is unused. Always returns 0. */
66543 +static int
66544 +rem_link_common(struct inode *object, struct inode *parent UNUSED_ARG)
66545 +{
66546 +       assert("nikita-2021", object != NULL);
66547 +       assert("nikita-2163", object->i_nlink > 0);
66548 +
66549 +       /*
66550 +        * decrement ->i_nlink and set ->i_ctime to the current time
66551 +        */
66552 +
66553 +       INODE_DEC_FIELD(object, i_nlink);
66554 +       object->i_ctime = CURRENT_TIME;
66555 +       return 0;
66556 +}
66557 +
66558 +/* ->not_linked() method for file plugins: true when the link count is zero */
66559 +static int
66560 +not_linked_common(const struct inode *inode)
66561 +{
66562 +       assert("nikita-2007", inode != NULL);
66563 +       return (inode->i_nlink == 0);
66564 +}
66565 +
66566 +/* ->not_linked() method for the directory file plugin */
66567 +static int
66568 +not_linked_dir(const struct inode *inode)
66569 +{
66570 +       assert("nikita-2008", inode != NULL);
66571 +       /* the one remaining link is the one from dot */
66572 +       return (inode->i_nlink == 1);
66573 +}
66574 +
66575 +/* ->adjust_to_parent() method for regular files; @root is used when @parent is NULL. Always returns 0. */
66576 +static int
66577 +adjust_to_parent_common(struct inode *object /* new object */ ,
66578 +                       struct inode *parent /* parent directory */ ,
66579 +                       struct inode *root /* root directory */ )
66580 +{
66581 +       assert("nikita-2165", object != NULL);
66582 +       if (parent == NULL)
66583 +               parent = root;
66584 +       assert("nikita-2069", parent != NULL);
66585 +
66586 +       /*
66587 +        * inherit missing plugins from parent
66588 +        * NOTE(review): grab_plugin() results are ignored here, unlike in adjust_to_parent_dir()
66589 +        */
66590 +       grab_plugin(object, parent, PSET_FILE);
66591 +       grab_plugin(object, parent, PSET_SD);
66592 +       grab_plugin(object, parent, PSET_FORMATTING);
66593 +       grab_plugin(object, parent, PSET_PERM);
66594 +       return 0;
66595 +}
66596 +
66597 +/* ->adjust_to_parent() method for directory files; @root is used when @parent is NULL. Returns first grab_plugin() error, or 0. */
66598 +static int
66599 +adjust_to_parent_dir(struct inode *object /* new object */ ,
66600 +                    struct inode *parent /* parent directory */ ,
66601 +                    struct inode *root /* root directory */ )
66602 +{
66603 +       int result = 0;
66604 +       pset_member memb;
66605 +
66606 +       assert("nikita-2166", object != NULL);
66607 +       if (parent == NULL)
66608 +               parent = root;
66609 +       assert("nikita-2167", parent != NULL);
66610 +
66611 +       /*
66612 +        * inherit missing plugins from parent: every plugin set member is tried
66613 +        */
66614 +       for (memb = 0; memb < PSET_LAST; ++ memb) {
66615 +               result = grab_plugin(object, parent, memb);
66616 +               if (result != 0)
66617 +                       break;
66618 +       }
66619 +       return result;
66620 +}
66621 +
66622 +/* simplest implementation of ->getattr() method: fills @stat from fields of the in-memory inode only. Always returns 0. */
66623 +static int
66624 +getattr_common(struct vfsmount *mnt UNUSED_ARG, struct dentry *dentry, struct kstat *stat)
66625 +{
66626 +       struct inode *obj;
66627 +
66628 +       assert("nikita-2298", dentry != NULL);
66629 +       assert("nikita-2299", stat != NULL);
66630 +       assert("nikita-2300", dentry->d_inode != NULL);
66631 +
66632 +       obj = dentry->d_inode;
66633 +
66634 +       stat->dev = obj->i_sb->s_dev;
66635 +       stat->ino = oid_to_uino(get_inode_oid(obj));
66636 +       stat->mode = obj->i_mode;
66637 +       /* don't confuse userland with huge nlink: clamp it to 0x7fff. This is
66638 +        * not entirely correct, because nlink_t is not necessarily 16 bit signed. */
66639 +       stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink))0x7fff);
66640 +       stat->uid = obj->i_uid;
66641 +       stat->gid = obj->i_gid;
66642 +       stat->rdev = obj->i_rdev;
66643 +       stat->atime = obj->i_atime;
66644 +       stat->mtime = obj->i_mtime;
66645 +       stat->ctime = obj->i_ctime;
66646 +       stat->size = obj->i_size;
66647 +       stat->blocks = (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
66648 +       /* "preferred" blocksize for efficient file system I/O */
66649 +       stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
66650 +
66651 +       return 0;
66652 +}
66653 +
66654 +/* plugin->u.file.release for directories. Always returns 0. */
66655 +static int
66656 +release_dir(struct inode *inode, struct file *file)
66657 +{
66658 +       /* this is called when directory file descriptor is closed. */
66659 +       spin_lock_inode(inode);
66660 +       /* remove directory from readdir list. See comment before
66661 +        * readdir_common() for details. */
66662 +       if (file->private_data != NULL)
66663 +               readdir_list_remove_clean(reiser4_get_file_fsdata(file));
66664 +       spin_unlock_inode(inode);
66665 +       return 0;
66666 +}
66667 +
66668 +/* default implementation of ->bind() method of file plugin: does nothing and returns 0 */
66669 +static int
66670 +bind_common(struct inode *child UNUSED_ARG, struct inode *parent UNUSED_ARG)
66671 +{
66672 +       return 0;
66673 +}
66674 +
66675 +#define detach_common bind_common /* alias of the no-op bind_common(); presumably the default ->detach() - confirm */
66676 +#define cannot ((void *)bind_common) /* bind_common() as void *; NOTE(review): presumably a filler for method slots - confirm */
66677 +/* detach for directory files: forwards to ->detach() of @child's directory plugin */
66678 +static int
66679 +detach_dir(struct inode *child, struct inode *parent)
66680 +{
66681 +       dir_plugin *dplug;
66682 +
66683 +       dplug = inode_dir_plugin(child);
66684 +       assert("nikita-2883", dplug != NULL);
66685 +       assert("nikita-2884", dplug->detach != NULL);
66686 +       return dplug->detach(child, parent);
66687 +}
66688 +
66689 +
66690 +/* this common implementation of the update estimation function may be used when stat data update does no more than
66691 +   insert a unit into a stat data item, which is probably true for most cases */
66692 +reiser4_internal reiser4_block_nr
66693 +estimate_update_common(const struct inode *inode)
66694 +{
66695 +       return estimate_one_insert_into_item(tree_by_inode(inode));
66696 +}
66697 +/* unlink estimation that reserves nothing: ignores both arguments and returns 0 */
66698 +static reiser4_block_nr
66699 +estimate_unlink_common(struct inode *object UNUSED_ARG,
66700 +                      struct inode *parent UNUSED_ARG)
66701 +{
66702 +       return 0;
66703 +}
66704 +/* unlink estimation for directory files: forwards to ->estimate.unlink() of @object's directory plugin */
66705 +static reiser4_block_nr
66706 +estimate_unlink_dir_common(struct inode *object, struct inode *parent)
66707 +{
66708 +       dir_plugin *dplug;
66709 +
66710 +       dplug = inode_dir_plugin(object);
66711 +       assert("nikita-2888", dplug != NULL);
66712 +       assert("nikita-2887", dplug->estimate.unlink != NULL);
66713 +       return dplug->estimate.unlink(object, parent);
66714 +}
66715 +
66716 +/* implementation of ->bind() method for file plugin of directory file: forwards to ->attach() of the directory plugin */
66717 +static int
66718 +bind_dir(struct inode *child, struct inode *parent)
66719 +{
66720 +       dir_plugin *dplug;
66721 +
66722 +       dplug = inode_dir_plugin(child);
66723 +       assert("nikita-2646", dplug != NULL);
66724 +       return dplug->attach(child, parent); /* NOTE(review): ->attach is not asserted non-NULL, unlike ->detach in detach_dir() */
66725 +}
66726 +/* grab the space estimated for one insertion into an item of @tree; returns the result of reiser4_grab_space() */
66727 +reiser4_internal int
66728 +setattr_reserve_common(reiser4_tree *tree)
66729 +{
66730 +       assert("vs-1096", is_grab_enabled(get_current_context()));
66731 +       return reiser4_grab_space(estimate_one_insert_into_item(tree),
66732 +                                 BA_CAN_COMMIT);
66733 +}
66734 +
66735 +/* ->setattr() method. This is called when an inode attribute other than
66736 + * ->i_size is modified (ATTR_SIZE is asserted to be absent). */
66737 +reiser4_internal int
66738 +setattr_common(struct inode *inode /* Object to change attributes */,
66739 +              struct iattr *attr /* change description */)
66740 +{
66741 +       int   result;
66742 +
66743 +       assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
66744 +
66745 +       result = inode_change_ok(inode, attr);
66746 +       if (result)
66747 +               return result;
66748 +
66749 +       /*
66750 +        * grab disk space and call standard inode_setattr().
66751 +        */
66752 +       result = setattr_reserve_common(tree_by_inode(inode));
66753 +       if (!result) {
66754 +               if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
66755 +                   (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
66756 +                       result = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; /* owner or group changes */
66757 +                       if (result) {
66758 +                               all_grabbed2free();
66759 +                               return result;
66760 +                       }
66761 +               }
66762 +               result = inode_setattr(inode, attr);
66763 +               if (!result)
66764 +                       reiser4_update_sd(inode); /* NOTE(review): return value is ignored here */
66765 +       }
66766 +
66767 +       all_grabbed2free();
66768 +       return result;
66769 +}
66770 +
66771 +/* inode_lock doesn't seem to be declared in exported headers, so it is declared here. */
66772 +extern spinlock_t inode_lock;
66773 +
66774 +/* ->delete_inode() method. This is called by
66775 + * iput()->iput_final()->drop_inode() when last reference to inode is released
66776 + * and inode has no names. NOTE(review): presumably entered with inode_lock held, as it is unlocked below - confirm. */
66777 +static void delete_inode_common(struct inode *object)
66778 +{
66779 +       /* create context here.
66780 +        *
66781 +        * removal of inode from the hash table (done at the very beginning of
66782 +        * generic_delete_inode()), truncate of pages, and removal of file's
66783 +        * extents have to be performed in the same atom. Otherwise, it may so
66784 +        * happen that a twig node with unallocated extent will be flushed to
66785 +        * the disk.
66786 +        */
66787 +       reiser4_context ctx;
66788 +
66789 +       /*
66790 +        * FIXME: this resembles generic_delete_inode
66791 +        */
66792 +       list_del_init(&object->i_list);
66793 +       list_del_init(&object->i_sb_list);
66794 +       object->i_state |= I_FREEING;
66795 +       inodes_stat.nr_inodes--;
66796 +       spin_unlock(&inode_lock);
66797 +
66798 +       init_context(&ctx, object->i_sb);
66799 +
66800 +       kill_cursors(object);
66801 +
66802 +       if (!is_bad_inode(object)) {
66803 +               file_plugin *fplug;
66804 +
66805 +               /* truncate object body */
66806 +               fplug = inode_file_plugin(object);
66807 +               if (fplug->pre_delete != NULL && fplug->pre_delete(object) != 0)
66808 +                       warning("vs-1216", "Failed to delete file body %llu",
66809 +                               get_inode_oid(object));
66810 +               else
66811 +                       assert("vs-1430",
66812 +                              reiser4_inode_data(object)->anonymous_eflushed == 0 &&
66813 +                              reiser4_inode_data(object)->captured_eflushed == 0);
66814 +       }
66815 +
66816 +       if (object->i_data.nrpages) {
66817 +               warning("vs-1434", "nrpages %ld\n", object->i_data.nrpages);
66818 +               truncate_inode_pages(&object->i_data, 0);
66819 +       }
66820 +       security_inode_delete(object);
66821 +       if (!is_bad_inode(object))
66822 +               DQUOT_INIT(object);
66823 +
66824 +       object->i_sb->s_op->delete_inode(object);
66825 +
66826 +       spin_lock(&inode_lock);
66827 +       hlist_del_init(&object->i_hash);
66828 +       spin_unlock(&inode_lock);
66829 +       wake_up_inode(object);
66830 +       if (object->i_state != I_CLEAR)
66831 +               BUG();
66832 +       destroy_inode(object);
66833 +       reiser4_exit_context(&ctx); /* context is left only after the inode is destroyed */
66834 +}
66835 +
66836 +/*
66837 + * ->forget_inode() method. Called by iput()->iput_final()->drop_inode() when
66838 + * last reference to inode with names is released. Delegates to generic_forget_inode().
66839 + */
66840 +static void forget_inode_common(struct inode *object)
66841 +{
66842 +       generic_forget_inode(object);
66843 +}
66844 +
66845 +/* ->drop_inode() method. Called by iput()->iput_final() when last reference
66846 + * to inode is released: deletes an unlinked object, otherwise forgets it. */
66847 +static void drop_common(struct inode * object)
66848 +{
66849 +       file_plugin *fplug;
66850 +
66851 +       assert("nikita-2643", object != NULL);
66852 +       /* -not- creating context in this method, because it is frequently
66853 +          called and all existing ->not_linked() methods are one liners. */
66854 +       fplug = inode_file_plugin(object);
66855 +       if (fplug == NULL) {
66856 +               /* fake inode has no file plugin, hence no ->forget_inode() to
66857 +                  dereference: make the call forget_inode_common() makes */
66858 +               generic_forget_inode(object);
66859 +       } else if (fplug->not_linked(object)) {
66860 +               assert("nikita-3231", fplug->delete_inode != NULL);
66861 +               fplug->delete_inode(object);
66862 +       } else {
66863 +               assert("nikita-3232", fplug->forget_inode != NULL);
66864 +               fplug->forget_inode(object);
66865 +       }
66866 +}
66866 +
66867 +/* stub failing with -EISDIR; eisdir fills method slots of the directory plugin */
66868 +static ssize_t isdir(void)
66869 +{
66870 +       return RETERR(-EISDIR);
66871 +}
66872 +
66873 +#define eisdir ((void *)isdir)
66874 +
66875 +/* stub failing with -EPERM; eperm fills method slots a plugin does not support */
66876 +static ssize_t perm(void)
66877 +{
66878 +       return RETERR(-EPERM);
66879 +}
66880 +
66881 +#define eperm ((void *)perm)
66882 +
66883 +/* ->can_rem_link() of directory plugin: non-zero iff is_dir_empty() returns 0 */
66884 +static int can_rem_dir(const struct inode * inode)
66885 +{
66886 +       /* is_dir_empty() returns 0 if dir is empty */
66887 +       return is_dir_empty(inode) == 0;
66888 +}
66889 +
66890 +/* set size of @inode to @size through ->setattr() of its file plugin */
66891 +static int process_truncate(struct inode *inode, __u64 size)
66892 +{
66893 +       int ret;
66894 +       struct iattr attr;
66895 +       file_plugin *plug;
66896 +       reiser4_context ctx;
66897 +
66898 +       init_context(&ctx, inode->i_sb);
66899 +
66900 +       plug = inode_file_plugin(inode);
66901 +       attr.ia_size = size;
66902 +       attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
66903 +       /* ->setattr() is called with ->i_sem of @inode held */
66904 +       down(&inode->i_sem);
66905 +       ret = plug->setattr(inode, &attr);
66906 +       up(&inode->i_sem);
66907 +
66908 +       context_set_commit_async(&ctx);
66909 +       reiser4_exit_context(&ctx);
66910 +
66911 +       return ret;
66912 +}
66913 +
66914 +/* ->safelink() method: dispatch on safe-link type @link found for @object;
66915 + * @value is handed to process_truncate() as the new size for SAFE_TRUNCATE */
66916 +reiser4_internal int
66917 +safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
66918 +{
66919 +       switch (link) {
66920 +       case SAFE_UNLINK:
66921 +               /* nothing to do. iput() in the caller (process_safelink) will
66922 +                * finish with file */
66923 +               return 0;
66924 +       case SAFE_TRUNCATE:
66925 +               return process_truncate(object, value);
66926 +       default:
66927 +               warning("nikita-3438", "Unrecognized safe-link type: %i", link);
66928 +               return RETERR(-EIO);
66929 +       }
66930 +}
66931 +
66932 +/* common ->prepare_write() method: unless whole @page is going to be
66933 + * overwritten or it is already up to date, read it in through ->readpage()
66934 + * of the file plugin. @page is locked on entry and on return. */
66935 +reiser4_internal int prepare_write_common (
66936 +       struct file * file, struct page * page, unsigned from, unsigned to)
66937 +{
66938 +       int ret;
66939 +       file_plugin *plug;
66940 +
66941 +       assert("umka-3099", file != NULL);
66942 +       assert("umka-3100", page != NULL);
66943 +       assert("umka-3095", PageLocked(page));
66944 +
66945 +       if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
66946 +               return 0;
66947 +
66948 +       plug = inode_file_plugin(page->mapping->host);
66949 +       if (plug->readpage == NULL)
66950 +               return RETERR(-EINVAL);
66951 +
66952 +       ret = plug->readpage(file, page);
66953 +       if (ret == 0) {
66954 +               /*
66955 +                * ->readpage() either:
66956 +                *
66957 +                *     1. starts IO against @page. @page is locked for IO in
66958 +                *     this case.
66959 +                *
66960 +                *     2. doesn't start IO. @page is unlocked.
66961 +                *
66962 +                * In either case, page should be locked.
66963 +                */
66964 +               lock_page(page);
66965 +               /*
66966 +                * IO (if any) is completed at this point. Check for IO
66967 +                * errors.
66968 +                */
66969 +               if (!PageUptodate(page))
66970 +                       ret = RETERR(-EIO);
66971 +       } else {
66972 +               SetPageError(page);
66973 +               ClearPageUptodate(page);
66974 +               /* All reiser4 readpage() implementations should return the
66975 +                * page locked in case of error. */
66976 +               assert("nikita-3472", PageLocked(page));
66977 +       }
66978 +       assert("umka-3098", PageLocked(page));
66979 +       return ret;
66980 +}
66981 +
66982 +reiser4_internal int
66983 +key_by_inode_and_offset_common(struct inode *inode, loff_t off, reiser4_key *key)
66984 +{      /* fill every component of @key from @inode and offset @off */
66985 +       key_init(key);
66986 +       set_key_locality(key, reiser4_inode_data(inode)->locality_id);
66987 +       set_key_ordering(key, get_inode_ordering(inode));
66988 +       set_key_objectid(key, get_inode_oid(inode));/*FIXME: inode->i_ino */
66989 +       set_key_type(key, KEY_BODY_MINOR);
66990 +       set_key_offset(key, (__u64) off);
66991 +       return 0; /* this implementation has no failure path */
66992 +}
66993 +
66994 +/* default implementation of ->sync() method: commit all transactions of the
66995 + * super block @inode belongs to; @datasync is not looked at */
66996 +static int sync_common(struct inode *inode, int datasync)
66997 +{
66998 +       return txnmgr_force_commit_all(inode->i_sb, 0);
66999 +}
67000 +
67001 +/* ->wire.size() method: delegates to inode_onwire_size() */
67002 +static int wire_size_common(struct inode *inode)
67003 +{
67004 +       return inode_onwire_size(inode);
67005 +}
67006 +
67007 +/* ->wire.write() method: delegates to build_inode_onwire() at @start */
67008 +static char *wire_write_common(struct inode *inode, char *start)
67009 +{
67010 +       return build_inode_onwire(inode, start);
67011 +}
67012 +
67013 +/* ->wire.read() method: extract key id found at @addr into @obj->u.std.key_id */
67014 +static char *wire_read_common(char *addr, reiser4_object_on_wire *obj)
67015 +{
67016 +       return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id);
67017 +}
67018 +
67019 +/* ->wire.done() method: intentionally empty */
67020 +static void wire_done_common(reiser4_object_on_wire *obj)
67021 +{
67022 +       /* nothing to do */
67023 +}
67024 +
67025 +/* ->wire.get() method: dentry for the object whose key id is stored in @obj */
67026 +static struct dentry *
67027 +wire_get_common(struct super_block *sb, reiser4_object_on_wire *obj)
67028 +{
67029 +       struct inode *inode;
67030 +       struct dentry *dentry;
67031 +       reiser4_key key;
67032 +
67033 +       extract_key_from_id(&obj->u.std.key_id, &key);
67034 +       inode = reiser4_iget(sb, &key, 1);
67035 +       if (IS_ERR(inode)) {
67036 +               /* inode wasn't found at the key encoded in the file
67037 +                * handle. Hence, file handle is stale. */
67038 +               if (PTR_ERR(inode) == -ENOENT)
67039 +                       return ERR_PTR(RETERR(-ESTALE));
67040 +               /* any other error is handed to the caller as is */
67041 +               return (void *)inode;
67042 +       }
67043 +       reiser4_iget_complete(inode);
67044 +       dentry = d_alloc_anon(inode);
67045 +       if (dentry == NULL) {
67046 +               iput(inode);
67047 +               return ERR_PTR(-ENOMEM);
67048 +       }
67049 +       dentry->d_op = &get_super_private(sb)->ops.dentry;
67050 +       return dentry;
67051 +}
67052 +
67053 +
67054 +/* ->change() method of file_plugin_ops: cannot change object plugin of
67055 + * already existing object, so always fail with -EINVAL */
67056 +static int change_file(struct inode * inode, reiser4_plugin * plugin)
67057 +{
67058 +       return RETERR(-EINVAL);
67059 +}
67060 +
67061 +static reiser4_plugin_ops file_plugin_ops = { /* ->h.pops of plugins below */
67062 +       .init     = NULL,
67063 +       .load     = NULL,
67064 +       .save_len = NULL,
67065 +       .save     = NULL,
67066 +       .change   = change_file /* always fails with -EINVAL */
67067 +};
67068 +
67069 +
67070 +/* Definitions of object plugins, indexed by file plugin id. Slots set to
67071 + * eisdir or eperm are stubs (isdir(), perm() cast to void *) that always
67072 + * fail with -EISDIR or -EPERM respectively. */
67073 +
67074 +file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
67075 +       [UNIX_FILE_PLUGIN_ID] = {
67076 +               .h = {
67077 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
67078 +                       .id = UNIX_FILE_PLUGIN_ID,
67079 +                       .pops = &file_plugin_ops,
67080 +                       .label = "reg",
67081 +                       .desc = "regular file",
67082 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
67083 +               },
67084 +               .open = NULL,
67085 +               .truncate = truncate_unix_file,
67086 +               .write_sd_by_inode = write_sd_by_inode_common,
67087 +               .capturepage = capturepage_unix_file,
67088 +               .readpage = readpage_unix_file,
67089 +               .capture = capture_unix_file,
67090 +               .read = read_unix_file,
67091 +               .write = write_unix_file,
67092 +               .release = release_unix_file,
67093 +               .ioctl = ioctl_unix_file,
67094 +               .mmap = mmap_unix_file,
67095 +               .get_block = get_block_unix_file,
67096 +               .flow_by_inode = flow_by_inode_unix_file,
67097 +               .key_by_inode = key_by_inode_unix_file,
67098 +               .set_plug_in_inode = set_plug_in_inode_common,
67099 +               .adjust_to_parent = adjust_to_parent_common,
67100 +               .create = create_common,
67101 +               .delete = delete_file_common,
67102 +               .sync = sync_unix_file,
67103 +               .add_link = add_link_common,
67104 +               .rem_link = rem_link_common,
67105 +               .owns_item = owns_item_unix_file,
67106 +               .can_add_link = can_add_link_common,
67107 +               .can_rem_link = NULL,
67108 +               .not_linked = not_linked_common,
67109 +               .setattr = setattr_unix_file,
67110 +               .getattr = getattr_common,
67111 +               .seek = NULL,
67112 +               .detach = detach_common,
67113 +               .bind = bind_common,
67114 +               .safelink = safelink_common,
67115 +               .estimate = {
67116 +                       .create = estimate_create_file_common,
67117 +                       .update = estimate_update_common,
67118 +                       .unlink = estimate_unlink_common
67119 +               },
67120 +               .wire = {
67121 +                        .write = wire_write_common,
67122 +                        .read  = wire_read_common,
67123 +                        .get   = wire_get_common,
67124 +                        .size  = wire_size_common,
67125 +                        .done  = wire_done_common
67126 +                },
67127 +               .readpages = readpages_unix_file,
67128 +               .init_inode_data = init_inode_data_unix_file,
67129 +               .pre_delete = pre_delete_unix_file,
67130 +               .drop = drop_common,
67131 +               .delete_inode = delete_inode_common,
67132 +               .destroy_inode = NULL,
67133 +               .forget_inode = forget_inode_common,
67134 +               .sendfile = sendfile_unix_file,
67135 +               .prepare_write = prepare_write_unix_file
67136 +       },
67137 +       [DIRECTORY_FILE_PLUGIN_ID] = {
67138 +               .h = {
67139 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
67140 +                       .id = DIRECTORY_FILE_PLUGIN_ID,
67141 +                       .pops = &file_plugin_ops,
67142 +                       .label = "dir",
67143 +                       .desc = "directory",
67144 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO},
67145 +               .open = NULL,
67146 +               .truncate = eisdir,
67147 +               .write_sd_by_inode = write_sd_by_inode_common,
67148 +               .capturepage = NULL,
67149 +               .readpage = eisdir,
67150 +               .capture = NULL,
67151 +               .read = eisdir,
67152 +               .write = eisdir,
67153 +               .release = release_dir,
67154 +               .ioctl = eisdir,
67155 +               .mmap = eisdir,
67156 +               .get_block = NULL,
67157 +               .flow_by_inode = NULL,
67158 +               .key_by_inode = NULL,
67159 +               .set_plug_in_inode = set_plug_in_inode_common,
67160 +               .adjust_to_parent = adjust_to_parent_dir,
67161 +               .create = create_common,
67162 +               .delete = delete_directory_common,
67163 +               .sync = sync_common,
67164 +               .add_link = add_link_common,
67165 +               .rem_link = rem_link_common,
67166 +               .owns_item = owns_item_hashed,
67167 +               .can_add_link = can_add_link_common,
67168 +               .can_rem_link = can_rem_dir,
67169 +               .not_linked = not_linked_dir,
67170 +               .setattr = setattr_common,
67171 +               .getattr = getattr_common,
67172 +               .seek = seek_dir,
67173 +               .detach = detach_dir,
67174 +               .bind = bind_dir,
67175 +               .safelink = safelink_common,
67176 +               .estimate = {
67177 +                       .create = estimate_create_dir_common,
67178 +                       .update = estimate_update_common,
67179 +                       .unlink = estimate_unlink_dir_common
67180 +               },
67181 +               .wire = {
67182 +                        .write = wire_write_common,
67183 +                        .read  = wire_read_common,
67184 +                        .get   = wire_get_common,
67185 +                        .size  = wire_size_common,
67186 +                        .done  = wire_done_common
67187 +                },
67188 +               .readpages = NULL,
67189 +               .init_inode_data = init_inode_ordering,
67190 +               .pre_delete = NULL,
67191 +               .drop = drop_common,
67192 +               .delete_inode = delete_inode_common,
67193 +               .destroy_inode = NULL,
67194 +               .forget_inode = forget_inode_common,
67195 +       },
67196 +       [SYMLINK_FILE_PLUGIN_ID] = {
67197 +               .h = {
67198 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
67199 +                       .id = SYMLINK_FILE_PLUGIN_ID,
67200 +                       .pops = &file_plugin_ops,
67201 +                       .label = "symlink",
67202 +                       .desc = "symbolic link",
67203 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
67204 +               },
67205 +               .open = NULL,
67206 +               .truncate = eperm,
67207 +               .write_sd_by_inode = write_sd_by_inode_common,
67208 +               .capturepage = NULL,
67209 +               .readpage = eperm,
67210 +               .capture = NULL,
67211 +               .read = eperm,
67212 +               .write = eperm,
67213 +               .release = NULL,
67214 +               .ioctl = eperm,
67215 +               .mmap = eperm,
67216 +               .sync = sync_common,
67217 +               .get_block = NULL,
67218 +               .flow_by_inode = NULL,
67219 +               .key_by_inode = NULL,
67220 +               .set_plug_in_inode = set_plug_in_inode_common,
67221 +               .adjust_to_parent = adjust_to_parent_common,
67222 +               .create = create_symlink,
67223 +               /* FIXME-VS: symlink should probably have its own destroy
67224 +                * method */
67225 +               .delete = delete_file_common,
67226 +               .add_link = add_link_common,
67227 +               .rem_link = rem_link_common,
67228 +               .owns_item = NULL,
67229 +               .can_add_link = can_add_link_common,
67230 +               .can_rem_link = NULL,
67231 +               .not_linked = not_linked_common,
67232 +               .setattr = setattr_common,
67233 +               .getattr = getattr_common,
67234 +               .seek = NULL,
67235 +               .detach = detach_common,
67236 +               .bind = bind_common,
67237 +               .safelink = safelink_common,
67238 +               .estimate = {
67239 +                       .create = estimate_create_file_common,
67240 +                       .update = estimate_update_common,
67241 +                       .unlink = estimate_unlink_common
67242 +               },
67243 +               .wire = {
67244 +                        .write = wire_write_common,
67245 +                        .read  = wire_read_common,
67246 +                        .get   = wire_get_common,
67247 +                        .size  = wire_size_common,
67248 +                        .done  = wire_done_common
67249 +                },
67250 +               .readpages = NULL,
67251 +               .init_inode_data = init_inode_ordering,
67252 +               .pre_delete = NULL,
67253 +               .drop = drop_common,
67254 +               .delete_inode = delete_inode_common,
67255 +               .destroy_inode = destroy_inode_symlink,
67256 +               .forget_inode = forget_inode_common,
67257 +       },
67258 +       [SPECIAL_FILE_PLUGIN_ID] = {
67259 +               .h = {
67260 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
67261 +                       .id = SPECIAL_FILE_PLUGIN_ID,
67262 +                       .pops = &file_plugin_ops,
67263 +                       .label = "special",
67264 +                       .desc = "special: fifo, device or socket",
67265 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO}
67266 +               ,
67267 +               .open = NULL,
67268 +               .truncate = eperm,
67269 +               .create = create_common,
67270 +               .write_sd_by_inode = write_sd_by_inode_common,
67271 +               .capturepage = NULL,
67272 +               .readpage = eperm,
67273 +               .capture = NULL,
67274 +               .read = eperm,
67275 +               .write = eperm,
67276 +               .release = NULL,
67277 +               .ioctl = eperm,
67278 +               .mmap = eperm,
67279 +               .sync = sync_common,
67280 +               .get_block = NULL,
67281 +               .flow_by_inode = NULL,
67282 +               .key_by_inode = NULL,
67283 +               .set_plug_in_inode = set_plug_in_inode_common,
67284 +               .adjust_to_parent = adjust_to_parent_common,
67285 +               .delete = delete_file_common,
67286 +               .add_link = add_link_common,
67287 +               .rem_link = rem_link_common,
67288 +               .owns_item = owns_item_common,
67289 +               .can_add_link = can_add_link_common,
67290 +               .can_rem_link = NULL,
67291 +               .not_linked = not_linked_common,
67292 +               .setattr = setattr_common,
67293 +               .getattr = getattr_common,
67294 +               .seek = NULL,
67295 +               .detach = detach_common,
67296 +               .bind = bind_common,
67297 +               .safelink = safelink_common,
67298 +               .estimate = {
67299 +                       .create = estimate_create_file_common,
67300 +                       .update = estimate_update_common,
67301 +                       .unlink = estimate_unlink_common
67302 +               },
67303 +               .wire = {
67304 +                        .write = wire_write_common,
67305 +                        .read  = wire_read_common,
67306 +                        .get   = wire_get_common,
67307 +                        .size  = wire_size_common,
67308 +                        .done  = wire_done_common
67309 +                },
67310 +               .readpages = NULL,
67311 +               .init_inode_data = init_inode_ordering,
67312 +               .pre_delete = NULL,
67313 +               .drop = drop_common,
67314 +               .delete_inode = delete_inode_common,
67315 +               .destroy_inode = NULL,
67316 +               .forget_inode = forget_inode_common,
67317 +       },
67318 +       [PSEUDO_FILE_PLUGIN_ID] = {
67319 +               .h = {
67320 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
67321 +                       .id = PSEUDO_FILE_PLUGIN_ID,
67322 +                       .pops = &file_plugin_ops,
67323 +                       .label = "pseudo",
67324 +                       .desc = "pseudo file",
67325 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
67326 +               },
67327 +               .open =              open_pseudo,
67328 +               .truncate          = eperm,
67329 +               .write_sd_by_inode = eperm,
67330 +               .readpage          = eperm,
67331 +               .capturepage       = NULL,
67332 +               .capture           = NULL,
67333 +               .read              = read_pseudo,
67334 +               .write             = write_pseudo,
67335 +               .release           = release_pseudo,
67336 +               .ioctl             = eperm,
67337 +               .mmap              = eperm,
67338 +               .sync = sync_common,
67339 +               .get_block         = eperm,
67340 +               .flow_by_inode     = NULL,
67341 +               .key_by_inode      = NULL,
67342 +               .set_plug_in_inode = set_plug_in_inode_common,
67343 +               .adjust_to_parent  = NULL,
67344 +               .create            = NULL,
67345 +               .delete            = eperm,
67346 +               .add_link          = NULL,
67347 +               .rem_link          = NULL,
67348 +               .owns_item         = NULL,
67349 +               .can_add_link      = cannot,
67350 +               .can_rem_link      = cannot,
67351 +               .not_linked        = NULL,
67352 +               .setattr           = inode_setattr,
67353 +               .getattr           = getattr_common,
67354 +               .seek              = seek_pseudo,
67355 +               .detach            = detach_common,
67356 +               .bind              = bind_common,
67357 +               .safelink = NULL,
67358 +               .estimate = {
67359 +                       .create = NULL,
67360 +                       .update = NULL,
67361 +                       .unlink = NULL
67362 +               },
67363 +               .wire = {
67364 +                        .write = wire_write_pseudo,
67365 +                        .read  = wire_read_pseudo,
67366 +                        .get   = wire_get_pseudo,
67367 +                        .size  = wire_size_pseudo,
67368 +                        .done  = wire_done_pseudo
67369 +                },
67370 +               .readpages = NULL,
67371 +               .init_inode_data = NULL,
67372 +               .pre_delete = NULL,
67373 +               .drop = drop_pseudo,
67374 +               .delete_inode = NULL,
67375 +               .destroy_inode = NULL,
67376 +               .forget_inode = NULL,
67377 +       },
67378 +       [CRC_FILE_PLUGIN_ID] = {
67379 +               .h = {
67380 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
67381 +                       .id = CRC_FILE_PLUGIN_ID,
67382 +                       .pops = &cryptcompress_plugin_ops,
67383 +                       .label = "cryptcompress",
67384 +                       .desc = "cryptcompress file",
67385 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
67386 +               },
67387 +               /* FIXME: check which of these are really needed */
67388 +               .open = open_cryptcompress,
67389 +               .truncate = truncate_cryptcompress,
67390 +               .write_sd_by_inode = write_sd_by_inode_common,
67391 +               .readpage = readpage_cryptcompress,
67392 +               .capturepage = NULL,
67393 +               .capture = capture_cryptcompress,
67394 +               .read = generic_file_read,
67395 +               .write = write_cryptcompress,
67396 +               .release = NULL,
67397 +               .ioctl = NULL,
67398 +               .mmap = generic_file_mmap,
67399 +               .get_block = get_block_cryptcompress,
67400 +               .sync = sync_common,
67401 +               .flow_by_inode = flow_by_inode_cryptcompress,
67402 +               .key_by_inode = key_by_inode_cryptcompress,
67403 +               .set_plug_in_inode = set_plug_in_inode_common,
67404 +               .adjust_to_parent = adjust_to_parent_common,
67405 +               .create = create_cryptcompress,
67406 +               .delete = delete_cryptcompress,
67407 +               .add_link = add_link_common,
67408 +               .rem_link = rem_link_common,
67409 +               .owns_item = owns_item_common,
67410 +               .can_add_link = can_add_link_common,
67411 +               .can_rem_link = NULL,
67412 +               .not_linked = not_linked_common,
67413 +               .setattr = setattr_cryptcompress,
67414 +               .getattr = getattr_common,
67415 +               .seek = NULL,
67416 +               .detach = detach_common,
67417 +               .bind = bind_common,
67418 +               .safelink = safelink_common,
67419 +               .estimate = {
67420 +                       .create = estimate_create_file_common,
67421 +                       .update = estimate_update_common,
67422 +                       .unlink = estimate_unlink_common
67423 +               },
67424 +               .wire = {
67425 +                        .write = wire_write_common,
67426 +                        .read  = wire_read_common,
67427 +                        .get   = wire_get_common,
67428 +                        .size  = wire_size_common,
67429 +                        .done  = wire_done_common
67430 +                },
67431 +               .readpages = readpages_cryptcompress,
67432 +               .init_inode_data = init_inode_data_cryptcompress,
67433 +               .pre_delete = pre_delete_cryptcompress,
67434 +               .drop = drop_common,
67435 +               .delete_inode = delete_inode_common,
67436 +               .destroy_inode = destroy_inode_cryptcompress,
67437 +               .forget_inode = forget_inode_common,
67438 +               .sendfile = sendfile_common,
67439 +               .prepare_write = prepare_write_common
67440 +       }
67441 +};
67442 +
67443 +/* Make Linus happy.
67444 +   Local variables:
67445 +   c-indentation-style: "K&R"
67446 +   mode-name: "LC"
67447 +   c-basic-offset: 8
67448 +   tab-width: 8
67449 +   fill-column: 120
67450 +   End:
67451 +*/
67452 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/object.h linux-2.6.8-rc3-a/fs/reiser4/plugin/object.h
67453 --- linux-2.6.8-rc3/fs/reiser4/plugin/object.h  1970-01-01 03:00:00.000000000 +0300
67454 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/object.h        2004-08-05 21:20:52.968684679 +0400
67455 @@ -0,0 +1,45 @@
67456 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
67457 + * reiser4/README */
67458 +
67459 +/* Declaration of object plugin functions. */
67460 +
67461 +#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ )
67462 +#define __FS_REISER4_PLUGIN_OBJECT_H__
67463 +
67464 +#include "../forward.h"
67465 +
67466 +#include <linux/fs.h>          /* for struct inode */
67467 +#include <linux/types.h>
67468 +
67469 +extern int locate_inode_sd(struct inode *inode,
67470 +                          reiser4_key *key, coord_t *coord, lock_handle *lh);
67471 +extern int lookup_sd(struct inode *inode, znode_lock_mode lock_mode,
67472 +                    coord_t * coord, lock_handle * lh, const reiser4_key * key,
67473 +                    int silent);
67474 +extern int guess_plugin_by_mode(struct inode *inode);
67475 +/* "common" implementations; most are installed as methods in file_plugins[] */
67476 +extern int write_sd_by_inode_common(struct inode *inode);
67477 +extern int owns_item_common(const struct inode *inode,
67478 +                           const coord_t * coord);
67479 +extern reiser4_block_nr estimate_update_common(const struct inode *inode);
67480 +extern int safelink_common(struct inode *object,
67481 +                          reiser4_safe_link_t link, __u64 value);
67482 +extern int prepare_write_common (struct file *, struct page *, unsigned, unsigned);
67483 +extern int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *);
67484 +extern int setattr_reserve_common(reiser4_tree *);
67485 +extern int setattr_common(struct inode *, struct iattr *);
67486 +/* plugin operations (->h.pops) of the cryptcompress file plugin */
67487 +extern reiser4_plugin_ops cryptcompress_plugin_ops;
67488 +
67489 +/* __FS_REISER4_PLUGIN_OBJECT_H__ */
67490 +#endif
67491 +
67492 +/* Make Linus happy.
67493 +   Local variables:
67494 +   c-indentation-style: "K&R"
67495 +   mode-name: "LC"
67496 +   c-basic-offset: 8
67497 +   tab-width: 8
67498 +   fill-column: 120
67499 +   End:
67500 +*/
67501 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/plugin.c linux-2.6.8-rc3-a/fs/reiser4/plugin/plugin.c
67502 --- linux-2.6.8-rc3/fs/reiser4/plugin/plugin.c  1970-01-01 03:00:00.000000000 +0300
67503 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/plugin.c        2004-08-05 21:20:52.979682360 +0400
67504 @@ -0,0 +1,650 @@
67505 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
67506 + * reiser4/README */
67507 +
67508 +/* Basic plugin infrastructure, lookup etc. */
67509 +
67510 +/* PLUGINS:
67511 +
67512 +   Plugins are internal Reiser4 "modules" or "objects" used to increase
67513 +   extensibility and allow external users to easily adapt reiser4 to
67514 +   their needs.
67515 +
67516 +   Plugins are classified into several disjoint "types". Plugins
67517 +   belonging to the particular plugin type are termed "instances" of
67518 +   this type. Currently the following types are present:
67519 +
67520 +    . object plugin
67521 +    . hash plugin
67522 +    . tail plugin
67523 +    . perm plugin
67524 +    . item plugin
67525 +    . node layout plugin
67526 +
67527 +NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
67528 +
67529 +   Object (file) plugin determines how given file-system object serves
67530 +   standard VFS requests for read, write, seek, mmap etc. Instances of
67531 +   file plugins are: regular file, directory, symlink. Another example
67532 +   of file plugin is audit plugin, that optionally records accesses to
67533 +   underlying object and forwards requests to it.
67534 +
67535 +   Hash plugins compute hashes used by reiser4 to store and locate
67536 +   files within directories. Instances of hash plugin type are: r5,
67537 +   tea, rupasov.
67538 +
67539 +   Tail plugins (or, more precisely, tail policy plugins) determine
67540 +   when last part of the file should be stored in a formatted item.
67541 +
67542 +   Perm plugins control permissions granted for a process accessing a file.
67543 +
67544 +   Scope and lookup:
67545 +
67546 +   Each plugin type and plugin has a label, such that pair ( type_label, plugin_label ) is unique.  This
67547 +   pair is a globally persistent and user-visible plugin
67548 +   identifier. Internally kernel maintains plugins and plugin types in
67549 +   arrays using an index into those arrays as plugin and plugin type
67550 +   identifiers. File-system in turn, also maintains persistent
67551 +   "dictionary" which is mapping from plugin label to numerical
67552 +   identifier which is stored in file-system objects.  That is, we
67553 +   store the offset into the plugin array for that plugin type as the
67554 +   plugin id in the stat data of the filesystem object.
67555 +
67556 +   plugin_labels have meaning for the user interface that assigns
67557 +   plugins to files, and may someday have meaning for dynamic loading of
67558 +   plugins and for copying of plugins from one fs instance to
67559 +   another by utilities like cp and tar.
67560 +
67561 +   Internal kernel plugin type identifier (index in plugins[] array) is
67562 +   of type reiser4_plugin_type. Set of available plugin types is
67563 +   currently static, but dynamic loading doesn't seem to pose
67564 +   insurmountable problems.
67565 +
67566 +   Within each type plugins are addressed by the identifiers of type
67567 +   reiser4_plugin_id (indices in
67568 +   reiser4_plugin_type_data.builtin[]). Such identifiers are only
67569 +   required to be unique within one type, not globally.
67570 +
67571 +   Thus, plugin in memory is uniquely identified by the pair (type_id,
67572 +   id).
67573 +
67574 +   Usage:
67575 +
67576 +   There exists only one instance of each plugin, but this
67577 +   single instance can be associated with many entities (file-system
67578 +   objects, items, nodes, transactions, file-descriptors etc.). Entity
67579 +   to which plugin of given type is attached is termed (due to the lack of
67580 +   imagination) "subject" of this plugin type and, by abuse of
67581 +   terminology, subject of particular instance of this type to which
67582 +   it's attached currently. For example, inode is subject of object
67583 +   plugin type. Inode representing directory is subject of directory
67584 +   plugin, hash plugin type and some particular instance of hash plugin
67585 +   type. Inode, representing regular file is subject of "regular file"
67586 +   plugin, tail-policy plugin type etc.
67587 +
67588 +   With each subject the plugin possibly stores some state. For example,
67589 +   the state of a directory plugin (instance of object plugin type) is pointer
67590 +   to hash plugin (if directories always use hashing that is). State of
67591 +   audit plugin is file descriptor (struct file) of log file or some
67592 +   magic value to do logging through printk().
67593 +
67594 +   Interface:
67595 +
67596 +   In addition to a scalar identifier, each plugin type and plugin
67597 +   proper has a "label": short string and a "description"---longer
67598 +   descriptive string. Labels and descriptions of plugin types are
67599 +   hard-coded into plugins[] array, declared and defined in
67600 +   plugin.c. Label and description of plugin are stored in .label and
67601 +   .desc fields of reiser4_plugin_header respectively. It's possible to
67602 +   locate plugin by the pair of labels.
67603 +
67604 +   Features:
67605 +
67606 +    . user-level plugin manipulations:
67607 +      + reiser4("filename/..file_plugin<='audit'");
67608 +      + write(open("filename/..file_plugin"), "audit", 5);
67609 +
67610 +    . user level utilities lsplug and chplug to manipulate plugins.
67611 +      Utilities are not a primary priority. Possibly they will not be
67612 +      working in v4.0
67613 +
67614 +NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount option, do you agree?  I don't think that specifying it at mount time, and then changing it with each mount, is a good model for usage.
67615 +
67616 +    . mount option "plug" to set-up plugins of root-directory.
67617 +      "plug=foo:bar" will set "bar" as default plugin of type "foo".
67618 +
67619 +   Limitations:
67620 +
67621 +    . each plugin type has to provide at least one builtin
67622 +      plugin. This is technical limitation and it can be lifted in the
67623 +      future.
67624 +
67625 +   TODO:
67626 +
67627 +   New plugin types/plugins:
67628 +   Things we should be able to separately choose to inherit:
67629 +
67630 +   security plugins
67631 +
67632 +   stat data
67633 +
67634 +   file bodies
67635 +
67636 +   file plugins
67637 +
67638 +   dir plugins
67639 +
67640 +    . perm:acl
67641 +
67642 +    d audi---audit plugin intercepting and possibly logging all
67643 +      accesses to object. Requires putting stub functions in file_operations
67644 +      instead of generic_file_*.
67645 +
67646 +NIKITA-FIXME-HANS: why make overflows a plugin?
67647 +    . over---handle hash overflows
67648 +
67649 +    . sqnt---handle different access patterns and instruments read-ahead
67650 +
67651 +NIKITA-FIXME-HANS: describe the line below in more detail.
67652 +
67653 +    . hier---handle inheritance of plugins along file-system hierarchy
67654 +
67655 +   Different kinds of inheritance: on creation vs. on access.
67656 +   Compatible/incompatible plugins.
67657 +   Inheritance for multi-linked files.
67658 +   Layered plugins.
67659 +   Notion of plugin context is abandoned.
67660 +
67661 +Each file is associated
67662 +   with one plugin and dependent plugins (hash, etc.) are stored as
67663 +   main plugin state. Now, if we have plugins used for regular files
67664 +   but not for directories, how such plugins would be inherited?
67665 +    . always store them with directories also
67666 +
67667 +NIKITA-FIXME-HANS: Do the line above.  It is not exclusive of doing the line below which is also useful.
67668 +
67669 +    . use inheritance hierarchy, independent of file-system namespace
67670 +
67671 +*/
67672 +
67673 +#include "../debug.h"
67674 +#include "../dformat.h"
67675 +#include "plugin_header.h"
67676 +#include "item/static_stat.h"
67677 +#include "node/node.h"
67678 +#include "security/perm.h"
67679 +#include "space/space_allocator.h"
67680 +#include "disk_format/disk_format.h"
67681 +#include "plugin.h"
67682 +#include "../reiser4.h"
67683 +#include "../jnode.h"
67684 +#include "../inode.h"
67685 +
67686 +#include <linux/fs.h>          /* for struct super_block  */
67687 +
67688 +/* public interface */
67689 +
67690 +/* initialise plugin sub-system. Just call this once on reiser4 startup. */
67691 +int init_plugins(void);
67692 +int setup_plugins(struct super_block *super, reiser4_plugin ** area);
67693 +reiser4_plugin *lookup_plugin(const char *type_label, const char *plug_label);
67694 +reiser4_plugin *lookup_plugin_name(char *plug_label);
67695 +int locate_plugin(struct inode *inode, plugin_locator * loc);
67696 +
67697 +/* internal functions. */
67698 +
67699 +static reiser4_plugin_type find_type(const char *label);
67700 +static reiser4_plugin *find_plugin(reiser4_plugin_type_data * ptype, const char *label);
67701 +
67702 +/* initialise plugin sub-system: assign ids, run ->init() hooks, build per-type lists. Call once on reiser4 startup. */
67703 +reiser4_internal int
67704 +init_plugins(void)
67705 +{
67706 +       reiser4_plugin_type type_id;
67707 +
67708 +       ON_TRACE(TRACE_PLUGINS, "Builtin plugins:\n");
67709 +       for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
67710 +               reiser4_plugin_type_data *ptype;
67711 +               int i;
67712 +
67713 +               ptype = &plugins[type_id];
67714 +               assert("nikita-3508", ptype->label != NULL);
67715 +               assert("nikita-3509", ptype->type_id == type_id);
67716 +
67717 +               plugin_list_init(&ptype->plugins_list);
67718 +               ON_TRACE(TRACE_PLUGINS,
67719 +                        "Of type %s (%s):\n", ptype->label, ptype->desc);
67720 +/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term builtin. */
67721 +               for (i = 0; i < ptype->builtin_num; ++i) {
67722 +                       reiser4_plugin *plugin;
67723 +
67724 +                       plugin = plugin_at(ptype, i);
67725 +
67726 +                       if (plugin->h.label == NULL)
67727 +                               /* uninitialized slot (no label): skip it, it gets no id and is not listed */
67728 +                               continue;
67729 +                       assert("nikita-3445", plugin->h.type_id == type_id);
67730 +                       plugin->h.id = i;
67731 +                       IF_TRACE(TRACE_PLUGINS, print_plugin("\t", plugin));
67732 +                       if (plugin->h.pops != NULL &&
67733 +                           plugin->h.pops->init != NULL) {
67734 +                               int result;
67735 +                               /* a non-zero ->init() result aborts the whole initialisation and is returned */
67736 +                               result = plugin->h.pops->init(plugin);
67737 +                               if (result != 0)
67738 +                                       return result;
67739 +                       }
67740 +                       plugin_list_clean(plugin);
67741 +                       plugin_list_push_back(&ptype->plugins_list, plugin);
67742 +               }
67743 +       }
67744 +       return 0;
67745 +}
67746 +
67747 +/* look up plugin by label alone: scan every plugin type in order, return first match or NULL */
67748 +reiser4_internal reiser4_plugin *
67749 +lookup_plugin_name(char *plug_label /* label to search for */ )
67750 +{
67751 +       reiser4_plugin_type type_id;
67752 +       reiser4_plugin *plugin;
67753 +
67754 +/* DEMIDOV-FIXME-HANS: did you get Saveliev to agree that his name is not Vova?  If not, change to DEMIDOV-001 */
67755 +       assert("vova-001", plug_label != NULL);
67756 +
67757 +       plugin = NULL;
67758 +
67759 +       dinfo("lookup_plugin_name: %s\n", plug_label);
67760 +
67761 +       for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
67762 +               plugin = find_plugin(&plugins[type_id], plug_label);
67763 +               if (plugin != NULL)
67764 +                       break;
67765 +       }
67766 +       return plugin;
67767 +}
67768 +
67769 +/* true if plugin type id is valid, that is, below REISER4_PLUGIN_TYPES */
67770 +reiser4_internal int
67771 +is_type_id_valid(reiser4_plugin_type type_id /* plugin type id */)
67772 +{
67773 +       /* "type_id" is unsigned, so no comparison with 0 is
67774 +          necessary */
67775 +       return (type_id < REISER4_PLUGIN_TYPES);
67776 +}
67777 +
67778 +/* true if plugin id is non-negative and below builtin_num of its type. @type_id itself must be valid (asserted) */
67779 +reiser4_internal int
67780 +is_plugin_id_valid(reiser4_plugin_type type_id /* plugin type id */ ,
67781 +                  reiser4_plugin_id id /* plugin id */)
67782 +{
67783 +       assert("nikita-1653", is_type_id_valid(type_id));
67784 +       return ((id < plugins[type_id].builtin_num) && (id >= 0));
67785 +}
67786 +
67787 +/* look up plugin by the pair (type label, plugin label); returns NULL if either label is not found */
67788 +reiser4_internal reiser4_plugin *
67789 +lookup_plugin(const char *type_label /* plugin type label */ ,
67790 +             const char *plug_label /* plugin label */ )
67791 +{
67792 +       reiser4_plugin *result;
67793 +       reiser4_plugin_type type_id;
67794 +
67795 +       assert("nikita-546", type_label != NULL);
67796 +       assert("nikita-547", plug_label != NULL);
67797 +
67798 +       type_id = find_type(type_label);
67799 +       if (is_type_id_valid(type_id))
67800 +               result = find_plugin(&plugins[type_id], plug_label);
67801 +       else
67802 +               result = NULL;
67803 +       return result;
67804 +}
67805 +
67806 +/* return plugin by its @type_id and @id, or NULL (after a warning) if either is out of bounds.
67807 +
67808 +   Both arguments are checked for validity: this is supposed to be called
67809 +   from user-level.
67810 +
67811 +NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
67812 +user space, and passed to the filesystem by use of method files? Your
67813 +comment really confused me on the first reading....
67814 +
67815 +*/
67816 +reiser4_internal reiser4_plugin *
67817 +plugin_by_unsafe_id(reiser4_plugin_type type_id        /* plugin
67818 +                                                * type id,
67819 +                                                * unchecked */ ,
67820 +                   reiser4_plugin_id id        /* plugin id,
67821 +                                                * unchecked */ )
67822 +{
67823 +       if (is_type_id_valid(type_id)) {
67824 +               if (is_plugin_id_valid(type_id, id))
67825 +                       return plugin_at(&plugins[type_id], id);
67826 +               else
67827 +                       /* id out of bounds */
67828 +                       warning("nikita-2913",
67829 +                               "Invalid plugin id: [%i:%i]", type_id, id);
67830 +       } else
67831 +               /* type_id out of bounds */
67832 +               warning("nikita-2914", "Invalid type_id: %i", type_id);
67833 +       return NULL;
67834 +}
67835 +
67836 +/* convert plugin id to the disk format: id is cast to __u16 and stored at @area by cputod16(). Always returns 0 */
67837 +reiser4_internal int
67838 +save_plugin_id(reiser4_plugin * plugin /* plugin to convert */ ,
67839 +              d16 * area /* where to store result */ )
67840 +{
67841 +       assert("nikita-1261", plugin != NULL);
67842 +       assert("nikita-1262", area != NULL);
67843 +
67844 +       cputod16((__u16) plugin->h.id, area);
67845 +       return 0;
67846 +}
67847 +
67848 +/* list of all plugins of given type, as filled in by init_plugins(). @type_id must be valid (asserted) */
67849 +reiser4_internal plugin_list_head *
67850 +get_plugin_list(reiser4_plugin_type type_id    /* plugin type
67851 +                                                * id */ )
67852 +{
67853 +       assert("nikita-1056", is_type_id_valid(type_id));
67854 +       return &plugins[type_id].plugins_list;
67855 +}
67856 +
67857 +#if REISER4_DEBUG_OUTPUT
67858 +/* print "prefix: desc (label:id)" for @plugin through printk(), or "prefix: (nil)" when @plugin is NULL */
67859 +reiser4_internal void
67860 +print_plugin(const char *prefix /* prefix to print */ ,
67861 +            reiser4_plugin * plugin /* plugin to print */ )
67862 +{
67863 +       if (plugin != NULL) {
67864 +               printk("%s: %s (%s:%i)\n", prefix, plugin->h.desc, plugin->h.label, plugin->h.id);
67865 +       } else
67866 +               printk("%s: (nil)\n", prefix);
67867 +}
67868 +
67869 +#endif
67870 +
67871 +/* find plugin type by label. Returns REISER4_PLUGIN_TYPES (not a valid type id) when no type has this label */
67872 +static reiser4_plugin_type
67873 +find_type(const char *label    /* plugin type
67874 +                                * label */ )
67875 +{
67876 +       reiser4_plugin_type type_id;
67877 +
67878 +       assert("nikita-550", label != NULL);
67879 +
67880 +       for (type_id = 0; type_id < REISER4_PLUGIN_TYPES &&
67881 +                    strcmp(label, plugins[type_id].label); ++type_id) {
67882 +               ;
67883 +       }
67884 +       return type_id;
67885 +}
67886 +
67887 +/* given plugin label find it within given plugin type by scanning
67888 +    array, skipping slots without a label; NULL if not found. Used to
67889 +    map user-visible symbolic name to internal kernel id */
67890 +static reiser4_plugin *
67891 +find_plugin(reiser4_plugin_type_data * ptype   /* plugin
67892 +                                                * type to
67893 +                                                * find
67894 +                                                * plugin
67895 +                                                * within */ ,
67896 +           const char *label /* plugin label */ )
67897 +{
67898 +       int i;
67899 +       reiser4_plugin *result;
67900 +
67901 +       assert("nikita-551", ptype != NULL);
67902 +       assert("nikita-552", label != NULL);
67903 +
67904 +       for (i = 0; i < ptype->builtin_num; ++i) {
67905 +               result = plugin_at(ptype, i);
67906 +               if (result->h.label == NULL)
67907 +                       continue;
67908 +               if (!strcmp(result->h.label, label))
67909 +                       return result;
67910 +       }
67911 +       return NULL;
67912 +}
67913 +
67914 +int
67915 +grab_plugin(struct inode *self, struct inode *ancestor, pset_member memb)
67916 +{
67917 +       reiser4_plugin *plug;
67918 +       reiser4_inode *parent;
67919 +       /* take @memb plugin from @ancestor (its hset entry if non-NULL, else its pset entry) and offer it to @self */
67920 +       parent = reiser4_inode_data(ancestor);
67921 +       plug = pset_get(parent->hset, memb) ? : pset_get(parent->pset, memb);
67922 +       return grab_plugin_from(self, memb, plug);
67923 +}
67924 +
67925 +static void
67926 +update_plugin_mask(reiser4_inode *info, pset_member memb)
67927 +{
67928 +       struct dentry *rootdir;
67929 +       reiser4_inode *root;
67930 +       /* set bit @memb of @info->plugin_mask when warranted (see below); no-op if super block has no root dentry */
67931 +       rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
67932 +       if (rootdir != NULL) {
67933 +               root = reiser4_inode_data(rootdir->d_inode);
67934 +               /*
67935 +                * if inode's plugin differs from root directory's one (the default),
67936 +                * or inode is root directory itself, set bit @memb in plugin_mask
67937 +                */
67938 +               if (pset_get(info->pset, memb) != pset_get(root->pset, memb) ||
67939 +                   info == root)
67940 +                       info->plugin_mask |= (1 << memb);
67941 +       }
67942 +}
67943 +
67944 +int
67945 +grab_plugin_from(struct inode *self, pset_member memb, reiser4_plugin *plug)
67946 +{
67947 +       reiser4_inode *info;
67948 +       int            result = 0;
67949 +       /* install @plug as @memb only if @self has no plugin there yet; otherwise leave pset alone and return 0 */
67950 +       info = reiser4_inode_data(self);
67951 +       if (pset_get(info->pset, memb) == NULL) {
67952 +               result = pset_set(&info->pset, memb, plug);
67953 +               if (result == 0)
67954 +                       update_plugin_mask(info, memb);
67955 +       }
67956 +       return result;
67957 +}
67958 +
67959 +int
67960 +force_plugin(struct inode *self, pset_member memb, reiser4_plugin *plug)
67961 +{
67962 +       reiser4_inode *info;
67963 +       int            result = 0;
67964 +       /* install @plug unconditionally: via its ->change() method when it has one, else straight into pset */
67965 +       info = reiser4_inode_data(self);
67966 +       if (plug->h.pops != NULL && plug->h.pops->change != NULL)
67967 +               result = plug->h.pops->change(self, plug);
67968 +       else
67969 +               result = pset_set(&info->pset, memb, plug);
67970 +       if (result == 0)
67971 +               update_plugin_mask(info, memb);
67972 +       return result;
67973 +}
67974 +
67975 +/* defined in fs/reiser4/plugin/file.c */
67976 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
67977 +/* defined in fs/reiser4/plugin/dir.c */
67978 +extern dir_plugin dir_plugins[LAST_DIR_ID];
67979 +/* defined in fs/reiser4/plugin/item/static_stat.c */
67980 +extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
67981 +/* defined in fs/reiser4/plugin/hash.c */
67982 +extern hash_plugin hash_plugins[LAST_HASH_ID];
67983 +/* defined in fs/reiser4/plugin/fibration.c */
67984 +extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
67985 +/* defined in fs/reiser4/plugin/crypt.c */
67986 +extern crypto_plugin crypto_plugins[LAST_CRYPTO_ID];
67987 +/* defined in fs/reiser4/plugin/digest.c */
67988 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
67989 +/* defined in fs/reiser4/plugin/compress.c */
67990 +extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
67991 +/* defined in fs/reiser4/plugin/tail.c */
67992 +extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
67993 +/* defined in fs/reiser4/plugin/security/security.c */
67994 +extern perm_plugin perm_plugins[LAST_PERM_ID];
67995 +/* defined in fs/reiser4/plugin/item/item.c */
67996 +extern item_plugin item_plugins[LAST_ITEM_ID];
67997 +/* defined in fs/reiser4/plugin/node/node.c */
67998 +extern node_plugin node_plugins[LAST_NODE_ID];
67999 +/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
68000 +extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
68001 +/* defined in jnode.c */
68002 +extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
68003 +/* defined in plugin/pseudo.c */
68004 +extern pseudo_plugin pseudo_plugins[LAST_PSEUDO_ID];
68005 +
68006 +reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
68007 +       /* C99 designated initializers: each entry is placed at the index of its own type id */
68008 +       [REISER4_FILE_PLUGIN_TYPE] = {
68009 +               .type_id = REISER4_FILE_PLUGIN_TYPE,
68010 +               .label = "file",
68011 +               .desc = "Object plugins",
68012 +               .builtin_num = sizeof_array(file_plugins),
68013 +               .builtin = file_plugins,
68014 +               .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO,
68015 +               .size = sizeof (file_plugin)
68016 +       },
68017 +       [REISER4_DIR_PLUGIN_TYPE] = {
68018 +               .type_id = REISER4_DIR_PLUGIN_TYPE,
68019 +               .label = "dir",
68020 +               .desc = "Directory plugins",
68021 +               .builtin_num = sizeof_array(dir_plugins),
68022 +               .builtin = dir_plugins,
68023 +               .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO,
68024 +               .size = sizeof (dir_plugin)
68025 +       },
68026 +       [REISER4_HASH_PLUGIN_TYPE] = {
68027 +               .type_id = REISER4_HASH_PLUGIN_TYPE,
68028 +               .label = "hash",
68029 +               .desc = "Directory hashes",
68030 +               .builtin_num = sizeof_array(hash_plugins),
68031 +               .builtin = hash_plugins,
68032 +               .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO,
68033 +               .size = sizeof (hash_plugin)
68034 +       },
68035 +       [REISER4_FIBRATION_PLUGIN_TYPE] = {
68036 +               .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
68037 +               .label = "fibration",
68038 +               .desc = "Directory fibrations",
68039 +               .builtin_num = sizeof_array(fibration_plugins),
68040 +               .builtin = fibration_plugins,
68041 +               .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO,
68042 +               .size = sizeof (fibration_plugin)
68043 +       },
68044 +       [REISER4_CRYPTO_PLUGIN_TYPE] = {
68045 +               .type_id = REISER4_CRYPTO_PLUGIN_TYPE,
68046 +               .label = "crypto",
68047 +               .desc = "Crypto plugins",
68048 +               .builtin_num = sizeof_array(crypto_plugins),
68049 +               .builtin = crypto_plugins,
68050 +               .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO,
68051 +               .size = sizeof (crypto_plugin)
68052 +       },
68053 +       [REISER4_DIGEST_PLUGIN_TYPE] = {
68054 +               .type_id = REISER4_DIGEST_PLUGIN_TYPE,
68055 +               .label = "digest",
68056 +               .desc = "Digest plugins",
68057 +               .builtin_num = sizeof_array(digest_plugins),
68058 +               .builtin = digest_plugins,
68059 +               .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO,
68060 +               .size = sizeof (digest_plugin)
68061 +       },
68062 +       [REISER4_COMPRESSION_PLUGIN_TYPE] = {
68063 +               .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
68064 +               .label = "compression",
68065 +               .desc = "Compression plugins",
68066 +               .builtin_num = sizeof_array(compression_plugins),
68067 +               .builtin = compression_plugins,
68068 +               .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO,
68069 +               .size = sizeof (compression_plugin)
68070 +       },
68071 +
68072 +       [REISER4_FORMATTING_PLUGIN_TYPE] = {
68073 +               .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
68074 +               .label = "formatting",
68075 +               .desc = "Tail inlining policies",
68076 +               .builtin_num = sizeof_array(formatting_plugins),
68077 +               .builtin = formatting_plugins,
68078 +               .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO,
68079 +               .size = sizeof (formatting_plugin)
68080 +       },
68081 +       [REISER4_PERM_PLUGIN_TYPE] = {
68082 +               .type_id = REISER4_PERM_PLUGIN_TYPE,
68083 +               .label = "perm",
68084 +               .desc = "Permission checks",
68085 +               .builtin_num = sizeof_array(perm_plugins),
68086 +               .builtin = perm_plugins,
68087 +               .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO,
68088 +               .size = sizeof (perm_plugin)
68089 +       },
68090 +       [REISER4_ITEM_PLUGIN_TYPE] = {
68091 +               .type_id = REISER4_ITEM_PLUGIN_TYPE,
68092 +               .label = "item",
68093 +               .desc = "Item handlers",
68094 +               .builtin_num = sizeof_array(item_plugins),
68095 +               .builtin = item_plugins,
68096 +               .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO,
68097 +               .size = sizeof (item_plugin)
68098 +       },
68099 +       [REISER4_NODE_PLUGIN_TYPE] = {
68100 +               .type_id = REISER4_NODE_PLUGIN_TYPE,
68101 +               .label = "node",
68102 +               .desc = "node layout handlers",
68103 +               .builtin_num = sizeof_array(node_plugins),
68104 +               .builtin = node_plugins,
68105 +               .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO,
68106 +               .size = sizeof (node_plugin)
68107 +       },
68108 +       [REISER4_SD_EXT_PLUGIN_TYPE] = {
68109 +               .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
68110 +               .label = "sd_ext",
68111 +               .desc = "Parts of stat-data",
68112 +               .builtin_num = sizeof_array(sd_ext_plugins),
68113 +               .builtin = sd_ext_plugins,
68114 +               .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO,
68115 +               .size = sizeof (sd_ext_plugin)
68116 +       },
68117 +       [REISER4_FORMAT_PLUGIN_TYPE] = {
68118 +               .type_id = REISER4_FORMAT_PLUGIN_TYPE,
68119 +               .label = "disk_layout",
68120 +               .desc = "defines filesystem on disk layout",
68121 +               .builtin_num = sizeof_array(format_plugins),
68122 +               .builtin = format_plugins,
68123 +               .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO,
68124 +               .size = sizeof (disk_format_plugin)
68125 +       },
68126 +       [REISER4_JNODE_PLUGIN_TYPE] = {
68127 +               .type_id = REISER4_JNODE_PLUGIN_TYPE,
68128 +               .label = "jnode",
68129 +               .desc = "defines kind of jnode",
68130 +               .builtin_num = sizeof_array(jnode_plugins),
68131 +               .builtin = jnode_plugins,
68132 +               .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO,
68133 +               .size = sizeof (jnode_plugin)
68134 +       },
68135 +       [REISER4_PSEUDO_PLUGIN_TYPE] = {
68136 +               .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
68137 +               .label = "pseudo_file",
68138 +               .desc = "pseudo file",
68139 +               .builtin_num = sizeof_array(pseudo_plugins),
68140 +               .builtin = pseudo_plugins,
68141 +               .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO,
68142 +               .size = sizeof (pseudo_plugin)
68143 +       }
68144 +};
68145 +
68146 +/* Make Linus happy.
68147 +   Local variables:
68148 +   c-indentation-style: "K&R"
68149 +   mode-name: "LC"
68150 +   c-basic-offset: 8
68151 +   tab-width: 8
68152 +   fill-column: 120
68153 +   End:
68154 +*/
68155 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/plugin.h linux-2.6.8-rc3-a/fs/reiser4/plugin/plugin.h
68156 --- linux-2.6.8-rc3/fs/reiser4/plugin/plugin.h  1970-01-01 03:00:00.000000000 +0300
68157 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/plugin.h        2004-08-05 21:20:52.962685945 +0400
68158 @@ -0,0 +1,836 @@
68159 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
68160 +
68161 +/* Basic plugin data-types.
68162 +   see fs/reiser4/plugin/plugin.c for details */
68163 +
68164 +#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ )
68165 +#define __FS_REISER4_PLUGIN_TYPES_H__
68166 +
68167 +#include "../forward.h"
68168 +#include "../debug.h"
68169 +#include "../dformat.h"
68170 +#include "../key.h"
68171 +#include "../type_safe_list.h"
68172 +#include "compress/compress.h"
68173 +#include "plugin_header.h"
68174 +#include "item/static_stat.h"
68175 +#include "item/internal.h"
68176 +#include "item/sde.h"
68177 +#include "item/cde.h"
68178 +/*#include "file/file.h"*/
68179 +#include "pseudo/pseudo.h"
68180 +#include "symlink.h"
68181 +#include "dir/hashed_dir.h"
68182 +#include "dir/dir.h"
68183 +#include "item/item.h"
68184 +#include "node/node.h"
68185 +#include "node/node40.h"
68186 +#include "security/perm.h"
68187 +#include "fibration.h"
68188 +
68189 +#include "space/bitmap.h"
68190 +#include "space/space_allocator.h"
68191 +
68192 +#include "disk_format/disk_format40.h"
68193 +#include "disk_format/disk_format.h"
68194 +
68195 +#include <linux/fs.h>          /* for struct super_block, address_space  */
68196 +#include <linux/mm.h>          /* for struct page */
68197 +#include <linux/buffer_head.h> /* for struct buffer_head */
68198 +#include <linux/dcache.h>      /* for struct dentry */
68199 +#include <linux/types.h>
68200 +#include <linux/crypto.h>
68201 +
68202 +/* a flow is a sequence of bytes being written to or read from the tree.  The
68203 +   tree will slice the flow into items while storing it into nodes, but all of
68204 +   that is hidden from anything outside the tree.  */
68205 +
68206 +struct flow {
68207 +       reiser4_key key;        /* key of start of flow's sequence of bytes */
68208 +       loff_t length;          /* length of flow's sequence of bytes */
68209 +       char *data;             /* start of flow's sequence of bytes */
68210 +       int user;               /* if 1 data is user space, 0 - kernel space */
68211 +       rw_op op;               /* presumably direction of the io (read or write) -- TODO confirm. NIKITA-FIXME-HANS: comment is where?  */
68212 +};
68213 +
68214 +typedef ssize_t(*rw_f_type) (struct file * file, flow_t * a_flow, loff_t * off);
68215 +
68216 +typedef struct reiser4_object_on_wire reiser4_object_on_wire;
68217 +
68218 +/* File plugin.  Defines the set of methods that file plugins implement, some of which are optional.
68219 +
68220 + A file plugin offers to the caller an interface for IO ( writing to and/or reading from) to what the caller sees as one
68221 + sequence of bytes.  An IO to it may affect more than one physical sequence of bytes, or no physical sequence of bytes,
68222 + it may affect sequences of bytes offered by other file plugins to the semantic layer, and the file plugin may invoke
68223 + other plugins and delegate work to them, but its interface is structured for offering the caller the ability to read
68224 + and/or write what the caller sees as being a single sequence of bytes.
68225 +
68226 + The file plugin must present a sequence of bytes to the caller, but it does not necessarily have to store a sequence of
68227 + bytes, it does not necessarily have to support efficient tree traversal to any offset in the sequence of bytes (tail
68228 + and extent items, whose keys contain offsets, do however provide efficient non-sequential lookup of any offset in the
68229 + sequence of bytes).
68230 +
68231 + Directory plugins provide methods for selecting file plugins by resolving a name for them.
68232 +
68233 + The functionality other filesystems call an attribute, and rigidly tie together, we decompose into orthogonal
68234 + selectable features of files.  Using the terminology we will define next, an attribute is a perhaps constrained,
68235 + perhaps static length, file whose parent has a uni-count-intra-link to it, which might be grandparent-major-packed, and
68236 + whose parent has a deletion method that deletes it.
68237 +
68238 + File plugins can implement constraints.
68239 +
68240 + Files can be of variable length (e.g. regular unix files), or of static length (e.g. static sized attributes).
68241 +
68242 + An object may have many sequences of bytes, and many file plugins, but, it has exactly one objectid.  It is usually
68243 + desirable that an object has a deletion method which deletes every item with that objectid.  Items cannot in general be
68244 + found by just their objectids.  This means that an object must have either a method built into its deletion plugin
68245 + method for knowing what items need to be deleted, or links stored with the object that provide the plugin with a method
68246 + for finding those items.  Deleting a file within an object may or may not have the effect of deleting the entire
68247 + object, depending on the file plugin's deletion method.
68248 +
68249 + LINK TAXONOMY:
68250 +
68251 +   Many objects have a reference count, and when the reference count reaches 0 the object's deletion method is invoked.
68252 + Some links embody a reference count increase ("countlinks"), and others do not ("nocountlinks").
68253 +
68254 +   Some links are bi-directional links ("bilinks"), and some are uni-directional("unilinks").
68255 +
68256 +   Some links are between parts of the same object ("intralinks"), and some are between different objects ("interlinks").
68257 +
68258 + PACKING TAXONOMY:
68259 +
68260 +   Some items of an object are stored with a major packing locality based on their object's objectid (e.g. unix directory
68261 + items in plan A), and these are called "self-major-packed".
68262 +
68263 +   Some items of an object are stored with a major packing locality based on their semantic parent object's objectid
68264 + (e.g. unix file bodies in plan A), and these are called "parent-major-packed".
68265 +
68266 +   Some items of an object are stored with a major packing locality based on their semantic grandparent, and these are
68267 + called "grandparent-major-packed".  Now carefully notice that we run into trouble with key length if we have to store a
68268 + 8 byte major+minor grandparent based packing locality, an 8 byte parent objectid, an 8 byte attribute objectid, and an
68269 + 8 byte offset, all in a 24 byte key.  One of these fields must be sacrificed if an item is to be
68270 + grandparent-major-packed, and which to sacrifice is left to the item author choosing to make the item
68271 + grandparent-major-packed.  You cannot make tail items and extent items grandparent-major-packed, though you could make
68272 + them self-major-packed (usually they are parent-major-packed).
68273 +
68274 + In the case of ACLs (which are composed of fixed length ACEs which consist of {subject-type,
68275 + subject, and permission bitmask} triples), it makes sense to not have an offset field in the ACE item key, and to allow
68276 + duplicate keys for ACEs.  Thus, the set of ACEs for a given file is found by looking for a key consisting of the
68277 + objectid of the grandparent (thus grouping all ACLs in a directory together), the minor packing locality of ACE, the
68278 + objectid of the file, and 0.
68279 +
68280 + IO involves moving data from one location to another, which means that two locations must be specified, source and
68281 + destination.
68282 +
68283 + This source and destination can be in the filesystem, or they can be a pointer in the user process address space plus a byte count.
68284 +
68285 + If both source and destination are in the filesystem, then at least one of them must be representable as a pure stream
68286 + of bytes (which we call a flow, and define as a struct containing a key, a data pointer, and a length).  This may mean
68287 + converting one of them into a flow.  We provide a generic cast_into_flow() method, which will work for any plugin
68288 + supporting read_flow(), though it is inefficiently implemented in that it temporarily stores the flow in a buffer
68289 + (Question: what to do with huge flows that cannot fit into memory?  Answer: we must not convert them all at once. )
68290 +
68291 + Performing a write requires resolving the write request into a flow defining the source, and a method that performs the write, and
68292 + a key that defines where in the tree the write is to go.
68293 +
68294 + Performing a read requires resolving the read request into a flow defining the target, and a method that performs the
68295 + read, and a key that defines where in the tree the read is to come from.
68296 +
68297 + There will exist file plugins which have no pluginid stored on the disk for them, and which are only invoked by other
68298 + plugins.
68299 +
68300 +*/
68301 +typedef struct file_plugin {
68302 +       /* Method table of a file (object) plugin: VFS entry points and reiser4-internal hooks for an object are dispatched through these pointers. */
68303 +       /* generic fields */
68304 +       plugin_header h;
68305 +
68306 +       /* file_operations->open is dispatched here */
68307 +       int (*open) (struct inode * inode, struct file * file);
68308 +                               /* NIKITA-FIXME-HANS: comment all fields, even the ones every non-beginner FS developer knows.... */
68309 +       int (*truncate) (struct inode * inode, loff_t size);
68310 +
68311 +       /* save inode cached stat-data onto disk. It was called
68312 +           reiserfs_update_sd() in 3.x */
68313 +       int (*write_sd_by_inode) (struct inode * inode);
68314 +       int (*readpage) (void *, struct page *); /* NOTE(review): undocumented; first argument is an untyped (void *) - confirm what callers pass */
68315 +       int (*prepare_write) (struct file *, struct page *, unsigned, unsigned);
68316 +
68317 +       /* captures the passed page into the current atom and takes care of extent handling.
68318 +          This is needed for loop-back device support and is used from ->commit_write()
68319 +
68320 +*/                             /* ZAM-FIXME-HANS: are you writing to yourself or the reader?  Bigger comment please. */
68321 +       int (*capturepage) (struct page *);
68322 +       /*
68323 +        * add pages created through mmap into object.
68324 +        */
68325 +       int (*capture) (struct inode *inode, const struct writeback_control *wbc, long *);
68326 +       /* these should be implemented using body_read_flow and body_write_flow
68327 +          builtins */
68328 +        ssize_t(*read) (struct file * file, char *buf, size_t size, loff_t * off);
68329 +        ssize_t(*write) (struct file * file, const char *buf, size_t size, loff_t * off);
68330 +
68331 +       int (*release) (struct inode *inode, struct file * file);
68332 +       int (*ioctl) (struct inode *, struct file *, unsigned int cmd, unsigned long arg);
68333 +       int (*mmap) (struct file * file, struct vm_area_struct * vma);
68334 +       int (*get_block) (struct inode * inode, sector_t block, struct buffer_head * bh_result, int create);
68335 +/* private methods: These are optional.  If used they will allow you to
68336 +   minimize the amount of code needed to implement a deviation from some other
68337 +   method that also uses them. */
68338 +
68339 +       /* Construct flow into @flow according to user-supplied data.
68340 +
68341 +          This is used by read/write methods to construct a flow to
68342 +          write/read. ->flow_by_inode() is plugin method, rather than single
68343 +          global implementation, because key in a flow used by plugin may
68344 +          depend on data in a @buf.
68345 +          NOTE(review): @user presumably tells whether @buf is a user-space pointer - confirm.
68346 +NIKITA-FIXME-HANS: please create statistics on what functions are
68347 +dereferenced how often for the mongo benchmark.  You can supervise
68348 +Elena doing this for you if that helps.  Email me the list of the top 10, with their counts, and an estimate of the total number of CPU cycles spent dereferencing as a percentage of CPU cycles spent processing (non-idle processing).  If the total percent is, say, less than 1%, it will make our coding discussions much easier, and keep me from questioning whether functions like the below are too frequently called to be dereferenced.  If the total percent is more than 1%, perhaps private methods should be listed in a "required" comment at the top of each plugin (with stern language about how if the comment is missing it will not be accepted by the maintainer), and implemented using macros not dereferenced functions.  How about replacing this whole private methods part of the struct with a thorough documentation of what the standard helper functions are for use in constructing plugins?  I think users have been asking for that, though not in so many words.
68349 +       */
68350 +       int (*flow_by_inode) (struct inode *, char *buf, int user, loff_t size, loff_t off, rw_op op, flow_t *);
68351 +
68352 +       /* Return the key used to retrieve an offset of a file. It is used by
68353 +          default implementation of ->flow_by_inode() method
68354 +          (common_build_flow()) and, among other things, to get to the extent
68355 +          from jnode of unformatted node.
68356 +       */
68357 +       int (*key_by_inode) (struct inode * inode, loff_t off, reiser4_key * key);
68358 +
68359 +/* NIKITA-FIXME-HANS: this comment is not as clear to others as you think.... */
68360 +       /* set the plugin for a file.  Called during file creation in creat()
68361 +          but not reiser4() unless an inode already exists for the file. */
68362 +       int (*set_plug_in_inode) (struct inode * inode, struct inode * parent, reiser4_object_create_data * data);
68363 +
68364 +/* NIKITA-FIXME-HANS: comment and name seem to say different things, are you setting up the object itself also or just adjusting the parent?.... */
68365 +       /* set up plugins for new @object created in @parent. @root is root
68366 +          directory. */
68367 +       int (*adjust_to_parent) (struct inode * object, struct inode * parent, struct inode * root);
68368 +       /* this does whatever is necessary to do when object is created. For
68369 +          instance, for unix files stat data is inserted */
68370 +       int (*create) (struct inode * object, struct inode * parent,
68371 +                      reiser4_object_create_data * data);
68372 +       /* delete empty object. This method should check REISER4_NO_SD
68373 +          and set REISER4_NO_SD on success. Deletion of empty object
68374 +          at least includes removal of stat-data if any. For directories this
68375 +          also includes removal of dot and dot-dot.
68376 +       */
68377 +       int (*delete) (struct inode * object);
68378 +
68379 +       /* method implementing f_op->fsync() */
68380 +       int (*sync)(struct inode *, int datasync);
68381 +
68382 +       /* add link from @parent to @object */
68383 +       int (*add_link) (struct inode * object, struct inode * parent);
68384 +
68385 +       /* remove link from @parent to @object */
68386 +       int (*rem_link) (struct inode * object, struct inode * parent);
68387 +
68388 +       /* return true if item addressed by @coord belongs to @inode.
68389 +           This is used by read/write to properly slice flow into items
68390 +           in presence of multiple key assignment policies, because
68391 +           items of a file are not necessarily contiguous in a key space,
68392 +           for example, in a plan-b. */
68393 +       int (*owns_item) (const struct inode * inode, const coord_t * coord);
68394 +
68395 +       /* checks whether yet another hard link to this object can be
68396 +          added  */
68397 +       int (*can_add_link) (const struct inode * inode);
68398 +       /* checks whether hard links to this object can be removed */
68399 +       int (*can_rem_link) (const struct inode * inode);
68400 +       /* true if there is only one link (aka name) for this file */
68401 +       int (*not_linked) (const struct inode * inode);
68402 +
68403 +       /* change inode attributes. */
68404 +       int (*setattr) (struct inode * inode, struct iattr * attr);
68405 +
68406 +       /* obtain inode attributes */
68407 +       int (*getattr) (struct vfsmount * mnt UNUSED_ARG, struct dentry * dentry, struct kstat * stat);
68408 +
68409 +       /* seek: reposition file @f; @origin presumably selects how @offset is interpreted, as in lseek(2) - confirm */
68410 +        loff_t(*seek) (struct file * f, loff_t offset, int origin);
68411 +
68412 +       int (*detach)(struct inode *child, struct inode *parent); /* NOTE(review): undocumented; presumably the counterpart of ->bind() below - confirm */
68413 +
68414 +       /* called when @child was just looked up in the @parent */
68415 +       int (*bind) (struct inode * child, struct inode * parent);
68416 +
68417 +       /* process safe-link during mount */
68418 +       int (*safelink)(struct inode *object, reiser4_safe_link_t link,
68419 +                       __u64 value);
68420 +
68421 +       /* estimate methods for file operations; each returns a reiser4_block_nr (presumably a number of blocks to reserve - confirm) */
68422 +       struct {
68423 +               reiser4_block_nr (*create) (struct inode *);
68424 +               reiser4_block_nr (*update) (const struct inode *);
68425 +               reiser4_block_nr (*unlink) (struct inode *, struct inode *);
68426 +       } estimate;
68427 +       void (*readpages)(struct file *file, struct address_space *mapping,
68428 +                         struct list_head *pages);
68429 +       /* reiser4 specific part of inode has a union of structures which are specific to a plugin. This method is
68430 +          called when inode is read (read_inode) and when file is created (common_create_child) so that file plugin
68431 +          could initialize its inode data */
68432 +       void (*init_inode_data)(struct inode *, reiser4_object_create_data *, int);
68433 +
68434 +       /* truncate file to zero size. called by reiser4_drop_inode before truncate_inode_pages */
68435 +       int (*pre_delete)(struct inode *);
68436 +
68437 +       /* called from reiser4_drop_inode() */
68438 +       void (*drop)(struct inode *);
68439 +
68440 +       /* called from ->drop() when there are no links, and object should be
68441 +        * garbage collected. */
68442 +       void (*delete_inode)(struct inode *);
68443 +
68444 +       /* called from ->destroy_inode() */
68445 +       void (*destroy_inode)(struct inode *);
68446 +       void (*forget_inode)(struct inode *); /* NOTE(review): undocumented - confirm when this is called relative to ->drop()/->destroy_inode() */
68447 +       ssize_t (*sendfile)(struct file *, loff_t *, size_t, read_actor_t, void __user *);
68448 +       /*
68449 +        * methods to serialize object identity. This is used, for example, by
68450 +        * reiser4_{en,de}code_fh().
68451 +        */
68452 +       struct {
68453 +               /* store object's identity at @area */
68454 +               char *(*write)(struct inode *inode, char *area);
68455 +               /* parse object from wire to the @obj */
68456 +               char *(*read)(char *area, reiser4_object_on_wire *obj);
68457 +               /* given object identity in @obj, find or create its dentry */
68458 +               struct dentry *(*get)(struct super_block *s,
68459 +                                     reiser4_object_on_wire *obj);
68460 +               /* how many bytes ->wire.write() consumes */
68461 +               int (*size)(struct inode *inode);
68462 +               /* finish with object identity */
68463 +               void (*done)(reiser4_object_on_wire *obj);
68464 +       } wire;
68465 +} file_plugin;
68466 +
68467 +struct reiser4_object_on_wire { /* object identity as handled by the file_plugin ->wire.read/get/done methods */
68468 +       file_plugin *plugin; /* NOTE(review): presumably the file plugin whose ->wire methods interpret @u - confirm */
68469 +       union {
68470 +               struct {
68471 +                       obj_key_id key_id;
68472 +               } std; /* NOTE(review): presumably the standard, key-based identity - confirm */
68473 +               void *generic; /* untyped alternative to @std */
68474 +       } u;
68475 +};
68476 +
68477 +typedef struct dir_plugin { /* method table of a directory plugin */
68478 +       /* generic fields */
68479 +       plugin_header h;
68480 +       /* for use by open call, based on name supplied will install
68481 +          appropriate plugin and state information, into the inode such that
68482 +          subsequent VFS operations that supply a pointer to that inode
68483 +          operate in a manner appropriate.  Note that this may require storing
68484 +          some state for the plugin, and that this state might even include
68485 +          the name used by open.  */
68486 +       int (*lookup) (struct inode * parent_inode, struct dentry **dentry);
68487 +       /* VFS required/defined operations below this line */
68488 +       int (*unlink) (struct inode * parent, struct dentry * victim);
68489 +       int (*link) (struct inode * parent, struct dentry * existing, struct dentry * where);
68490 +       /* rename object named by @old entry in @old_dir to be named by @new
68491 +          entry in @new_dir */
68492 +       int (*rename) (struct inode * old_dir, struct dentry * old, struct inode * new_dir, struct dentry * new);
68493 +
68494 +       /* create new object described by @data and add it to the parent directory under the given name.
68495 +          Parent and dentry are not arguments here: presumably they are @data->parent and @data->dentry - confirm */
68496 +       int (*create_child) (reiser4_object_create_data * data,
68497 +                            struct inode ** retobj);
68498 +
68499 +       /* readdir implementation */
68500 +       int (*readdir) (struct file * f, void *cookie, filldir_t filldir);
68501 +
68502 +       /* private methods: These are optional.  If used they will allow you to
68503 +          minimize the amount of code needed to implement a deviation from
68504 +          some other method that uses them.  You could logically argue that
68505 +          they should be a separate type of plugin. */
68506 +
68507 +       /* check whether @name (of length @len) is acceptable name to be inserted into
68508 +           this object. Optionally implemented by directory-like objects.
68509 +           Can check for maximal length, reserved symbols etc */
68510 +       int (*is_name_acceptable) (const struct inode * inode, const char *name, int len);
68511 +
68512 +       void (*build_entry_key) (const struct inode * dir /* directory where
68513 +                                                        * entry is (or will
68514 +                                                        * be) in.*/ ,
68515 +                         const struct qstr * name      /* name of file referenced
68516 +                                                        * by this entry */ ,
68517 +                         reiser4_key * result  /* resulting key of directory
68518 +                                                * entry */ );
68519 +       int (*build_readdir_key) (struct file * dir, reiser4_key * result);
68520 +       int (*add_entry) (struct inode * object, struct dentry * where,
68521 +                         reiser4_object_create_data * data, reiser4_dir_entry_desc * entry);
68522 +
68523 +       int (*rem_entry) (struct inode * object, struct dentry * where, reiser4_dir_entry_desc * entry);
68524 +
68525 +       /* initialize directory structure for newly created object. For normal
68526 +          unix directories, insert dot and dotdot. */
68527 +       int (*init) (struct inode * object, struct inode * parent, reiser4_object_create_data * data);
68528 +       /* destroy directory */
68529 +       int (*done) (struct inode * child);
68530 +
68531 +       /* called when @subdir was just looked up in the @dir */
68532 +       int (*attach) (struct inode * subdir, struct inode * dir);
68533 +       int (*detach)(struct inode * subdir, struct inode * dir); /* NOTE(review): undocumented; presumably the counterpart of ->attach() - confirm */
68534 +
68535 +       struct dentry *(*get_parent)(struct inode *childdir);
68536 +       /* estimate methods; each returns a reiser4_block_nr (presumably a number of blocks to reserve - confirm) */
68537 +       struct {
68538 +               reiser4_block_nr (*add_entry) (struct inode *node);
68539 +               reiser4_block_nr (*rem_entry) (struct inode *node);
68540 +               reiser4_block_nr (*unlink) (struct inode *, struct inode *);
68541 +       } estimate;
68542 +} dir_plugin;
68543 +
68544 +typedef struct formatting_plugin { /* listed in union reiser4_plugin as "tail plugin, used by file plugin" */
68545 +       /* generic fields */
68546 +       plugin_header h;
68547 +       /* returns non-zero iff file's tail has to be stored
68548 +           in a direct item. NOTE(review): @size is presumably the file size the decision is made for - confirm */
68549 +       int (*have_tail) (const struct inode * inode, loff_t size);
68550 +} formatting_plugin;
68551 +
68552 +typedef struct hash_plugin { /* listed in union reiser4_plugin as "hash plugin, used by directory plugin" */
68553 +       /* generic fields */
68554 +       plugin_header h;
68555 +       /* computes hash of the given @name; @len is presumably its length in bytes (confirm). Result is a __u64. */
68556 +        __u64(*hash) (const unsigned char *name, int len);
68557 +} hash_plugin;
68558 +
68559 +typedef struct crypto_plugin { /* listed in union reiser4_plugin as "crypto plugin, used by file plugin" */
68560 +       /* generic fields */
68561 +       plugin_header h;
68562 +       int (*alloc) (struct inode * inode); /* NOTE(review): undocumented; presumably sets up per-inode crypto state - confirm */
68563 +       void (*free) (struct inode * inode); /* NOTE(review): undocumented; presumably undoes ->alloc() - confirm */
68564 +       /* number of cpu expkey words (NOTE(review): "expkey" presumably means expanded key, cf. @expkey of ->encrypt/->decrypt - confirm) */
68565 +       unsigned nr_keywords;
68566 +       /* Offset translator. For each offset this returns (k * offset), where
68567 +          k (k >= 1) is a coefficient of expansion of the crypto algorithm.
68568 +          For all symmetric algorithms k == 1. For asymmetric algorithms (which
68569 +          inflate data) offset translation guarantees that all units of a disk
68570 +          cluster will have keys smaller than those of the next cluster.
68571 +       */
68572 +       loff_t (*scale)(struct inode * inode, size_t blocksize, loff_t src);
68573 +       /* Crypto algorithms can accept data only by chunks of crypto block
68574 +          size. This method is to align any flow up to crypto block size when
68575 +          we pass it to crypto algorithm. To align means to append padding of
68576 +          special format specific to the crypto algorithm */
68577 +       int (*align_cluster)(__u8 *tail, int clust_size, int blocksize);
68578 +       /* low-level key manager (check, install, etc..) */
68579 +       int (*setkey) (struct crypto_tfm *tfm, const __u8 *key, unsigned int keylen);
68580 +       /* main text processing procedures: read from @src, write to @dst (only @src is const-qualified) */
68581 +       void (*encrypt) (__u32 *expkey, __u8 *dst, const __u8 *src);
68582 +       void (*decrypt) (__u32 *expkey, __u8 *dst, const __u8 *src);
68583 +} crypto_plugin;
68584 +
68585 +typedef struct digest_plugin { /* listed in union reiser4_plugin as "digest plugin, used by file plugin" */
68586 +       /* generic fields */
68587 +       plugin_header h;
68588 +       /* digest size (NOTE(review): units are not stated here - presumably bytes; confirm) */
68589 +       int dsize;
68590 +       int (*alloc) (struct inode * inode); /* NOTE(review): undocumented; presumably sets up per-inode digest state - confirm */
68591 +       void (*free) (struct inode * inode); /* NOTE(review): undocumented; presumably undoes ->alloc() - confirm */
68592 +} digest_plugin;
68593 +
68594 +
68595 +typedef void * tfm_info_t; /* untyped transform context; passed as @ctx to compression_plugin methods */
68596 +#define SQUEEZE_TFM_INFO_SIZE (LAST_COMPRESSION_ID * sizeof(tfm_info_t)) /* bytes needed for one tfm_info_t per compression id */
68597 +
68598 +typedef struct compression_plugin { /* listed in union reiser4_plugin as "compression plugin, used by file plugin" */
68599 +       /* generic fields */
68600 +       plugin_header h;
68601 +       /* the maximum number of bytes by which the size of the "compressed" data
68602 +        * can exceed the size of the uncompressed data. */
68603 +       int (*overrun) (unsigned src_len);
68604 +       int (*alloc) (tfm_info_t * ctx, tfm_action act); /* NOTE(review): undocumented; presumably allocates *@ctx for action @act - confirm */
68605 +       void (*free) (tfm_info_t * ctx, tfm_action act); /* NOTE(review): undocumented; presumably undoes ->alloc() - confirm */
68606 +       /* main text processing procedures; @dst_len is a pointer - presumably receives the produced length (confirm) */
68607 +       void (*compress)   (tfm_info_t ctx, __u8 *src_first, unsigned src_len,
68608 +                           __u8 *dst_first, unsigned *dst_len);
68609 +       void (*decompress) (tfm_info_t ctx, __u8 *src_first, unsigned src_len,
68610 +                           __u8 *dst_first, unsigned *dst_len);
68611 +}compression_plugin;
68612 +
68613 +typedef struct sd_ext_plugin { /* listed in union reiser4_plugin as "stat-data extension plugin" */
68614 +       /* generic fields */
68615 +       plugin_header h;
68616 +       int (*present) (struct inode * inode, char **area, int *len); /* NOTE(review): undocumented; presumably called when the extension is found in stat-data - confirm */
68617 +       int (*absent) (struct inode * inode); /* NOTE(review): undocumented; presumably called when the extension is missing - confirm */
68618 +       int (*save_len) (struct inode * inode); /* NOTE(review): presumably space needed by ->save(), cf. reiser4_plugin_ops.save_len - confirm */
68619 +       int (*save) (struct inode * inode, char **area); /* NOTE(review): presumably writes this extension at *@area - confirm */
68620 +#if REISER4_DEBUG_OUTPUT
68621 +       void (*print) (const char *prefix, char **area, int *len);
68622 +#endif
68623 +       /* alignment requirement for this stat-data part */
68624 +       int alignment;
68625 +} sd_ext_plugin;
68626 +
68627 +/* this plugin contains methods to allocate objectid for newly created files,
68628 +   to deallocate objectid when file gets removed, to report number of used and
68629 +   free objectids */
68630 +typedef struct oid_allocator_plugin {
68631 +       /* generic fields */
68632 +       plugin_header h;
68633 +       int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files, __u64 oids); /* NOTE(review): undocumented - confirm meaning of @nr_files and @oids */
68634 +       /* used to report statfs->f_files */
68635 +        __u64(*oids_used) (reiser4_oid_allocator * map);
68636 +       /* get next oid to use */
68637 +        __u64(*next_oid) (reiser4_oid_allocator * map);
68638 +       /* used to report statfs->f_ffree */
68639 +        __u64(*oids_free) (reiser4_oid_allocator * map);
68640 +       /* allocate new objectid (NOTE(review): presumably returned through the oid_t pointer - confirm) */
68641 +       int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
68642 +       /* release objectid (passed by value) */
68643 +       int (*release_oid) (reiser4_oid_allocator * map, oid_t);
68644 +       /* how many pages to reserve in transaction for allocation of new
68645 +          objectid */
68646 +       int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
68647 +       /* how many pages to reserve in transaction for freeing of an
68648 +          objectid */
68649 +       int (*oid_reserve_release) (reiser4_oid_allocator * map);
68650 +       void (*print_info) (const char *, reiser4_oid_allocator *);
68651 +} oid_allocator_plugin;
68652 +
68653 +/* disk layout plugin: this specifies super block, journal, bitmap (if there
68654 +   are any) locations, etc */
68655 +typedef struct disk_format_plugin {
68656 +       /* generic fields */
68657 +       plugin_header h;
68658 +       /* replay journal, initialize super_info_data, etc */
68659 +       int (*get_ready) (struct super_block *, void *data);
68660 +
68661 +       /* key of root directory stat data */
68662 +       const reiser4_key *(*root_dir_key) (const struct super_block *);
68663 +       /* NOTE(review): the four methods below are undocumented in this header - confirm their contracts in the format plugin sources */
68664 +       int (*release) (struct super_block *);
68665 +       jnode *(*log_super) (struct super_block *);
68666 +       void (*print_info) (const struct super_block *);
68667 +       int (*check_open) (const struct inode *object);
68668 +} disk_format_plugin;
68669 +
68670 +struct jnode_plugin { /* listed in union reiser4_plugin as "plugin for different jnode types" */
68671 +       /* generic fields */
68672 +       plugin_header h;
68673 +       int (*init) (jnode * node);
68674 +       int (*parse) (jnode * node);
68675 +       struct address_space *(*mapping) (const jnode * node); /* returns an address_space for @node */
68676 +       unsigned long (*index) (const jnode * node); /* NOTE(review): presumably the page index of @node within ->mapping() - confirm */
68677 +       jnode *(*clone) (jnode * node);
68678 +};
68679 +
68680 +/* plugin instance.                                                         */
68681 +/*                                                                          */
68682 +/* This is "wrapper" union for all types of plugins. Most of the code uses  */
68683 +/* plugins of particular type (file_plugin, dir_plugin, etc.)  rather than  */
68684 +/* operates with pointers to reiser4_plugin. This union is only used in     */
68685 +/* some generic code in plugin/plugin.c that operates on all                */
68686 +/* plugins. Technically speaking purpose of this union is to add type       */
68687 +/* safety to said generic code: each plugin type (file_plugin, for          */
68688 +/* example), contains plugin_header as its first member.  This first member */
68689 +/* is located at the same place in memory as .h member of                   */
68690 +/* reiser4_plugin. Generic code obtains a pointer to reiser4_plugin and     */
68691 +/* looks in the .h which is header of plugin type located in union. This    */
68692 +/* allows to avoid type-casts.                                              */
68693 +union reiser4_plugin {
68694 +       /* generic fields; aliases the leading plugin_header of whichever member is in use */
68695 +       plugin_header h;
68696 +       /* file plugin */
68697 +       file_plugin file;
68698 +       /* directory plugin */
68699 +       dir_plugin dir;
68700 +       /* hash plugin, used by directory plugin */
68701 +       hash_plugin hash;
68702 +       /* fibration plugin used by directory plugin */
68703 +       fibration_plugin fibration;
68704 +       /* crypto plugin, used by file plugin */
68705 +       crypto_plugin crypto;
68706 +       /* digest plugin, used by file plugin */
68707 +       digest_plugin digest;
68708 +       /* compression plugin, used by file plugin */
68709 +       compression_plugin compression;
68710 +       /* tail (formatting) plugin, used by file plugin */
68711 +       formatting_plugin formatting;
68712 +       /* permission plugin */
68713 +       perm_plugin perm;
68714 +       /* node plugin */
68715 +       node_plugin node;
68716 +       /* item plugin */
68717 +       item_plugin item;
68718 +       /* stat-data extension plugin */
68719 +       sd_ext_plugin sd_ext;
68720 +       /* disk layout plugin */
68721 +       disk_format_plugin format;
68722 +       /* object id allocator plugin */
68723 +       oid_allocator_plugin oid_allocator;
68724 +       /* plugin for different jnode types */
68725 +       jnode_plugin jnode;
68726 +       /* plugin for pseudo files */
68727 +       pseudo_plugin pseudo;
68728 +       /* place-holder for new plugin types that can be registered
68729 +          dynamically, and used by other dynamically loaded plugins.  */
68730 +       void *generic;
68731 +};
68732 +
68733 +struct reiser4_plugin_ops { /* operations applicable to any plugin; every method takes the generic reiser4_plugin */
68734 +       /* called when plugin is initialized */
68735 +       int (*init) (reiser4_plugin * plugin);
68736 +       /* called when plugin is unloaded */
68737 +       int (*done) (reiser4_plugin * plugin);
68738 +       /* load given plugin from disk */
68739 +       int (*load) (struct inode * inode,
68740 +                    reiser4_plugin * plugin, char **area, int *len);
68741 +       /* how much space is required to store this plugin's state
68742 +           in stat-data */
68743 +       int (*save_len) (struct inode * inode, reiser4_plugin * plugin);
68744 +       /* save persistent plugin-data to disk */
68745 +       int (*save) (struct inode * inode, reiser4_plugin * plugin, char **area);
68746 +       /* alignment requirement for on-disk state of this plugin
68747 +           in number of bytes */
68748 +       int alignment;
68749 +       /* install itself into given inode. This can return error
68750 +           (e.g., you cannot change hash of non-empty directory). */
68751 +       int (*change) (struct inode * inode, reiser4_plugin * plugin);
68752 +       /* NOTE(review): documented only by a verbatim copy of ->change's comment.
68753 +           Presumably installs @plugin into @inode as inherited from @parent - confirm. */
68754 +       int (*inherit) (struct inode * inode, struct inode * parent,
68755 +                       reiser4_plugin * plugin);
68756 +};
68757 +
68758 +/* functions implemented in fs/reiser4/plugin/plugin.c */
68759 +
68760 +/* stores plugin reference in reiser4-specific part of inode */
68761 +extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
68762 +extern int setup_plugins(struct super_block *super, reiser4_plugin ** area);
68763 +extern reiser4_plugin *lookup_plugin(const char *type_label, const char *plug_label); /* NOTE(review): presumably finds a plugin by its type label and plugin label - confirm */
68764 +extern int init_plugins(void);
68765 +
68766 +/* functions implemented in fs/reiser4/plugin/object.c */
68767 +void move_flow_forward(flow_t * f, unsigned count); /* NOTE(review): presumably advances flow @f by @count bytes - confirm */
68768 +
68769 +/* builtin plugins */
68770 +
68771 +/* builtin file-plugins; ids are assigned consecutively from 0 in declaration order. NOTE(review): ids appear to be stored on disk (see *_by_disk_id helpers), so reordering is presumably a format change - confirm */
68772 +typedef enum {
68773 +       /* regular file */
68774 +       UNIX_FILE_PLUGIN_ID,
68775 +       /* directory */
68776 +       DIRECTORY_FILE_PLUGIN_ID,
68777 +       /* symlink */
68778 +       SYMLINK_FILE_PLUGIN_ID,
68779 +       /* for objects completely handled by the VFS: fifos, devices,
68780 +          sockets  */
68781 +       SPECIAL_FILE_PLUGIN_ID,
68782 +       /* Plugin id for crypto-compression objects */
68783 +       CRC_FILE_PLUGIN_ID,
68784 +       /* pseudo file */
68785 +       PSEUDO_FILE_PLUGIN_ID,
68786 +        /* number of file plugins. Used as size of arrays to hold
68787 +          file plugins. Must stay the last enumerator. */
68788 +       LAST_FILE_PLUGIN_ID
68789 +} reiser4_file_id;
68790 +
68791 +/* builtin dir-plugins; ids are assigned consecutively from 0 in declaration order */
68792 +typedef enum {
68793 +       HASHED_DIR_PLUGIN_ID,
68794 +       SEEKABLE_HASHED_DIR_PLUGIN_ID,
68795 +       PSEUDO_DIR_PLUGIN_ID,
68796 +       LAST_DIR_ID /* equals the number of dir plugin ids listed above */
68797 +} reiser4_dir_id;
68798 +
68799 +/* builtin hash-plugins; ids are assigned consecutively from 0 in declaration order */
68800 +
68801 +typedef enum {
68802 +       RUPASOV_HASH_ID,
68803 +       R5_HASH_ID,
68804 +       TEA_HASH_ID,
68805 +       FNV1_HASH_ID,
68806 +       DEGENERATE_HASH_ID,
68807 +       LAST_HASH_ID /* equals the number of hash ids listed above */
68808 +} reiser4_hash_id;
68809 +
68810 +/* builtin crypto-plugins; only NONE_CRYPTO_ID is declared here */
68811 +
68812 +typedef enum {
68813 +       NONE_CRYPTO_ID,
68814 +       LAST_CRYPTO_ID /* equals the number of crypto ids listed above */
68815 +} reiser4_crypto_id;
68816 +
68817 +/* builtin digest plugins; only NONE_DIGEST_ID is declared here */
68818 +
68819 +typedef enum {
68820 +       NONE_DIGEST_ID,
68821 +       LAST_DIGEST_ID /* equals the number of digest ids listed above */
68822 +} reiser4_digest_id;
68823 +
68824 +/* builtin compression plugins; ids are assigned consecutively from 0 in declaration order */
68825 +
68826 +typedef enum {
68827 +       NONE_COMPRESSION_ID,
68828 +       NULL_COMPRESSION_ID,
68829 +       LZRW1_COMPRESSION_ID,
68830 +       LZO1_COMPRESSION_ID,
68831 +       GZIP1_COMPRESSION_ID,
68832 +       LAST_COMPRESSION_ID /* number of compression ids above; also sizes SQUEEZE_TFM_INFO_SIZE */
68833 +} reiser4_compression_id;
68834 +
68835 +/* builtin tail-plugins (ids of formatting_plugin instances); assigned consecutively from 0 */
68836 +
68837 +typedef enum {
68838 +       NEVER_TAILS_FORMATTING_ID,
68839 +       ALWAYS_TAILS_FORMATTING_ID,
68840 +       SMALL_FILE_FORMATTING_ID,
68841 +       LAST_TAIL_FORMATTING_ID /* equals the number of formatting ids listed above */
68842 +} reiser4_formatting_id;
68843 +
68844 +/* Encapsulation of crypto specific data; pointed to by reiser4_object_create_data.crypto */
68845 +typedef struct crypto_data {
68846 +        reiser4_crypto_id      cra; /* id of the crypto algorithm */
68847 +       reiser4_digest_id      dia; /* id of the digest algorithm */
68848 +       __u8 * key;                 /* secret key */
68849 +       __u16 keysize;              /* key size, in BITS (keyid_size below is in bytes) */
68850 +       __u8 * keyid;               /* keyid */
68851 +       __u16 keyid_size;           /* keyid size, in BYTES (keysize above is in bits) */
68852 +} crypto_data_t;
68853 +
68854 +/* compression/clustering specific data; pointed to by reiser4_object_create_data.compression and .cluster */
68855 +typedef struct compression_data {
68856 +       reiser4_compression_id coa; /* id of the compression algorithm */
68857 +} compression_data_t;
68858 +
68859 +typedef __u8 cluster_data_t;        /* cluster info (a single byte; encoding not described here) */
68860 +
68861 +/* data type used to pack parameters that we pass to vfs
68862 +    object creation function create_object() */
68863 +struct reiser4_object_create_data {
68864 +       /* plugin to control created object */
68865 +       reiser4_file_id id;
68866 +       /* mode of regular file, directory or special file */
68867 +/* what happens if some other sort of perm plugin is in use? */
68868 +       int mode;
68869 +       /* rdev of special file */
68870 +       dev_t rdev;
68871 +       /* symlink target */
68872 +       const char *name;
68873 +       /* add here something for non-standard objects you invent, like
68874 +          query for interpolation file etc. */
68875 +       crypto_data_t * crypto;
68876 +       compression_data_t * compression;
68877 +       cluster_data_t * cluster;
68878 +       /* NOTE(review): presumably the directory to create the object in and the dentry naming it (cf. dir_plugin ->create_child) - confirm */
68879 +       struct inode  *parent;
68880 +       struct dentry *dentry;
68881 +};
68882 +
68883 +#define MAX_PLUGIN_TYPE_LABEL_LEN  32 /* size of plugin_locator.type_label[] */
68884 +#define MAX_PLUGIN_PLUG_LABEL_LEN  32 /* size of plugin_locator.plug_label[] */
68885 +
68886 +/* used for interface with user-land: table-driven parsing in
68887 +    reiser4(). */
68888 +typedef struct plugin_locator {
68889 +       reiser4_plugin_type type_id; /* plugin type */
68890 +       reiser4_plugin_id id; /* plugin id (same type as the @id argument of plugin_by_id()) */
68891 +       char type_label[MAX_PLUGIN_TYPE_LABEL_LEN]; /* NOTE(review): confirm whether the size includes a terminating NUL */
68892 +       char plug_label[MAX_PLUGIN_PLUG_LABEL_LEN]; /* NOTE(review): confirm whether the size includes a terminating NUL */
68893 +} plugin_locator;
68894 +
68895 +extern int locate_plugin(struct inode *inode, plugin_locator * loc);
68896 +/* forward declarations; both are defined as static inline functions in plugin_header.h */
68897 +static inline reiser4_plugin *
68898 +plugin_by_id(reiser4_plugin_type type_id, reiser4_plugin_id id);
68899 +
68900 +static inline reiser4_plugin *
68901 +plugin_by_disk_id(reiser4_tree * tree, reiser4_plugin_type type_id, d16 * did);
68902 +/* Generate typed helpers for plugin type ID: TYPE_by_id() and friends return &plugin->FIELD, or NULL if no plugin. */
68903 +#define PLUGIN_BY_ID(TYPE,ID,FIELD)                                    \
68904 +static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id )             \
68905 +{                                                                      \
68906 +       reiser4_plugin *plugin = plugin_by_id ( ID, id );               \
68907 +       return plugin ? & plugin -> FIELD : NULL;                       \
68908 +}                                                                      \
68909 +static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \
68910 +{                                                                      \
68911 +       reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id );    \
68912 +       return plugin ? & plugin -> FIELD : NULL;                       \
68913 +}                                                                      \
68914 +static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id )      \
68915 +{                                                                      \
68916 +       reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id );        \
68917 +       return plugin ? & plugin -> FIELD : NULL;                       \
68918 +}                                                                      \
68919 +static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin )       \
68920 +{                                                                      \
68921 +       return ( reiser4_plugin * ) plugin;                             \
68922 +}                                                                      \
68923 +static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin )            \
68924 +{                                                                      \
68925 +       return TYPE ## _to_plugin (plugin) -> h.id;                     \
68926 +}                                                                      \
68927 +typedef struct { int foo; } TYPE ## _plugin_dummy
68928 +/* the dummy typedef that ends the macro is what lets each use below be followed by ';' */
68929 +PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
68930 +PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
68931 +PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
68932 +PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
68933 +PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
68934 +PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
68935 +PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
68936 +PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
68937 +PLUGIN_BY_ID(crypto_plugin, REISER4_CRYPTO_PLUGIN_TYPE, crypto);
68938 +PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
68939 +PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
68940 +PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
68941 +PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
68942 +PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
68943 +PLUGIN_BY_ID(pseudo_plugin, REISER4_PSEUDO_PLUGIN_TYPE, pseudo);
68944 +
68945 +extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
68946 +
68947 +#if REISER4_DEBUG_OUTPUT
68948 +extern void print_plugin(const char *prefix, reiser4_plugin * plugin);
68949 +#else
68950 +#define print_plugin( pr, pl ) noop
68951 +#endif
68952 +
68953 +TYPE_SAFE_LIST_DEFINE(plugin, reiser4_plugin, h.linkage);
68954 +
68955 +extern plugin_list_head *get_plugin_list(reiser4_plugin_type type_id);
68956 +/* loop header: walk @plugin over get_plugin_list(@ptype); get_plugin_list() is re-evaluated on each iteration */
68957 +#define for_all_plugins( ptype, plugin )                       \
68958 +for( plugin = plugin_list_front( get_plugin_list( ptype ) ) ;  \
68959 +     ! plugin_list_end( get_plugin_list( ptype ), plugin ) ;   \
68960 +     plugin = plugin_list_next( plugin ) )
68961 +
68962 +/* enumeration of fields within plugin_set; each value indexes pset_descr[] in plugin_set.c */
68963 +typedef enum {
68964 +       PSET_FILE,
68965 +       PSET_DIR, /* PSET_FILE and PSET_DIR should be first elements:
68966 +                  * inode.c:read_inode() depends on this. */
68967 +       PSET_PERM,
68968 +       PSET_FORMATTING,
68969 +       PSET_HASH,
68970 +       PSET_FIBRATION,
68971 +       PSET_SD,
68972 +       PSET_DIR_ITEM,
68973 +       PSET_CRYPTO,
68974 +       PSET_DIGEST,
68975 +       PSET_COMPRESSION,
68976 +       PSET_LAST /* number of members, not a member itself */
68977 +} pset_member;
68978 +
68979 +int grab_plugin(struct inode *self, struct inode *ancestor, pset_member memb);
68980 +int grab_plugin_from(struct inode *self, pset_member memb, reiser4_plugin *plug);
68981 +int force_plugin(struct inode *self, pset_member memb, reiser4_plugin *plug);
68982 +
68983 +/* __FS_REISER4_PLUGIN_TYPES_H__ */
68984 +#endif
68985 +
68986 +/* Make Linus happy.
68987 +   Local variables:
68988 +   c-indentation-style: "K&R"
68989 +   mode-name: "LC"
68990 +   c-basic-offset: 8
68991 +   tab-width: 8
68992 +   fill-column: 120
68993 +   End:
68994 +*/
68995 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/plugin_header.h linux-2.6.8-rc3-a/fs/reiser4/plugin/plugin_header.h
68996 --- linux-2.6.8-rc3/fs/reiser4/plugin/plugin_header.h   1970-01-01 03:00:00.000000000 +0300
68997 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/plugin_header.h 2004-08-05 21:20:53.088659374 +0400
68998 @@ -0,0 +1,136 @@
68999 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
69000 +
69001 +/* plugin header. Data structures required by all plugin types. */
69002 +
69003 +#if !defined( __PLUGIN_HEADER_H__ )
69004 +#define __PLUGIN_HEADER_H__
69005 +
69006 +/* plugin data-types and constants */
69007 +
69008 +#include "../type_safe_list.h"
69009 +#include "../dformat.h"
69010 +/* plugin types; a value of this enum is used as an index into plugins[] (see plugin_by_id() below) */
69011 +typedef enum {
69012 +       REISER4_FILE_PLUGIN_TYPE,
69013 +       REISER4_DIR_PLUGIN_TYPE,
69014 +       REISER4_ITEM_PLUGIN_TYPE,
69015 +       REISER4_NODE_PLUGIN_TYPE,
69016 +       REISER4_HASH_PLUGIN_TYPE,
69017 +       REISER4_FIBRATION_PLUGIN_TYPE,
69018 +       REISER4_FORMATTING_PLUGIN_TYPE,
69019 +       REISER4_PERM_PLUGIN_TYPE,
69020 +       REISER4_SD_EXT_PLUGIN_TYPE,
69021 +       REISER4_FORMAT_PLUGIN_TYPE,
69022 +       REISER4_JNODE_PLUGIN_TYPE,
69023 +       REISER4_CRYPTO_PLUGIN_TYPE,
69024 +       REISER4_DIGEST_PLUGIN_TYPE,
69025 +       REISER4_COMPRESSION_PLUGIN_TYPE,
69026 +       REISER4_PSEUDO_PLUGIN_TYPE,
69027 +       REISER4_PLUGIN_TYPES /* number of plugin types; dimension of plugins[] */
69028 +} reiser4_plugin_type;
69029 +
69030 +struct reiser4_plugin_ops;
69031 +/* generic plugin operations, supported by each
69032 +    plugin type. */
69033 +typedef struct reiser4_plugin_ops reiser4_plugin_ops;
69034 +
69035 +TYPE_SAFE_LIST_DECLARE(plugin);
69036 +
69037 +/* the common part of all plugin instances. */
69038 +typedef struct plugin_header {
69039 +       /* plugin type */
69040 +       reiser4_plugin_type type_id;
69041 +       /* id of this plugin */
69042 +       reiser4_plugin_id id;
69043 +       /* plugin operations */
69044 +       reiser4_plugin_ops *pops;
69045 +/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and defined. */
69046 +       /* short label of this plugin */
69047 +       const char *label;
69048 +       /* descriptive string.. */
69049 +       const char *desc;
69050 +       /* list linkage; plugin.h names it as h.linkage in TYPE_SAFE_LIST_DEFINE(plugin, ...) */
69051 +       plugin_list_link linkage;
69052 +} plugin_header;
69053 +
69054 +
69055 +/* PRIVATE INTERFACES */
69056 +/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in plugin_header? */
69057 +/* plugin type representation. */
69058 +typedef struct reiser4_plugin_type_data {
69059 +       /* internal plugin type identifier. Should coincide with
69060 +           index of this item in plugins[] array. */
69061 +       reiser4_plugin_type type_id;
69062 +       /* short symbolic label of this plugin type. Should be no longer
69063 +           than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
69064 +       const char *label;
69065 +       /* plugin type description longer than .label */
69066 +       const char *desc;
69067 +
69068 +/* NIKITA-FIXME-HANS: define built-in */
69069 +       /* number of built-in plugin instances of this type */
69070 +       int builtin_num;
69071 +       /* array of built-in plugins; plugin_at() steps through it ->size bytes at a time */
69072 +       void *builtin;
69073 +       plugin_list_head plugins_list;  /* NOTE(review): presumably what get_plugin_list() returns -- confirm */
69074 +       size_t size;                    /* byte size of one element of ->builtin */
69075 +} reiser4_plugin_type_data;
69076 +
69077 +extern reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
69078 +
69079 +int is_type_id_valid(reiser4_plugin_type type_id);
69080 +int is_plugin_id_valid(reiser4_plugin_type type_id, reiser4_plugin_id id);
69081 +
69082 +static inline reiser4_plugin *
69083 +plugin_at(reiser4_plugin_type_data * ptype, int i)
69084 +{
69085 +       /* ->builtin is untyped: locate element @i by byte arithmetic,
69086 +          stepping ->size bytes per element */
69087 +       char *base = ptype->builtin;
69088 +       return (reiser4_plugin *) (base + i * ptype->size);
69089 +}
69090 +
69091 +/* return plugin by its @type_id and @id: element @id of plugins[@type_id].builtin. */
69092 +/* Both ids are checked by assertions only; plugin_by_disk_id() goes through plugin_by_unsafe_id() instead. */
69093 +static inline reiser4_plugin *
69094 +plugin_by_id(reiser4_plugin_type type_id /* plugin type id */ ,
69095 +            reiser4_plugin_id id /* plugin id */ )
69096 +{
69097 +       assert("nikita-1651", is_type_id_valid(type_id));
69098 +       assert("nikita-1652", is_plugin_id_valid(type_id, id));
69099 +       return plugin_at(&plugins[type_id], id);
69100 +}
69101 +
69102 +extern reiser4_plugin *
69103 +plugin_by_unsafe_id(reiser4_plugin_type type_id, reiser4_plugin_id id);
69104 +
69105 +/* get plugin whose id is stored in disk format: @did is converted by d16tocpu(); @tree is not used yet */
69106 +static inline reiser4_plugin *
69107 +plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG       /* tree,
69108 +                                                        * plugin
69109 +                                                        * belongs
69110 +                                                        * to */ ,
69111 +                 reiser4_plugin_type type_id   /* plugin type
69112 +                                                * id */ ,
69113 +                 d16 * did /* plugin id in disk format */ )
69114 +{
69115 +       /* what we should do properly is to maintain within each
69116 +          file-system a dictionary that maps on-disk plugin ids to
69117 +          "universal" ids. This dictionary will be resolved on mount
69118 +          time, so that this function will perform just one additional
69119 +          array lookup. */
69120 +       return plugin_by_unsafe_id(type_id, d16tocpu(did));
69121 +}
69122 +
69123 +/* __PLUGIN_HEADER_H__ */
69124 +#endif
69125 +
69126 +/* Make Linus happy.
69127 +   Local variables:
69128 +   c-indentation-style: "K&R"
69129 +   mode-name: "LC"
69130 +   c-basic-offset: 8
69131 +   tab-width: 8
69132 +   fill-column: 120
69133 +   End:
69134 +*/
69135 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/plugin_set.c linux-2.6.8-rc3-a/fs/reiser4/plugin/plugin_set.c
69136 --- linux-2.6.8-rc3/fs/reiser4/plugin/plugin_set.c      1970-01-01 03:00:00.000000000 +0300
69137 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/plugin_set.c    2004-08-05 21:20:53.275619940 +0400
69138 @@ -0,0 +1,345 @@
69139 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69140 + * reiser4/README */
69141 +/* NIKITA-FIXME-HANS: you didn't discuss this with me before coding it did you?  Remove plugin-sets from code by March 15th, 2004 */
69142 +/* plugin-sets */
69143 +
69144 +/*
69145 + * Each inode comes with a whole set of plugins: file plugin, directory
69146 + * plugin, hash plugin, tail policy plugin, security plugin, etc.
69147 + *
69148 + * Storing them (pointers to them, that is) in inode is a waste of
69149 + * space. Especially, given that on average file system plugins of vast
69150 + * majority of files will belong to few sets (e.g., one set for regular files,
69151 + * another set for standard directory, etc.)
69152 + *
69153 + * Plugin set (pset) is an object containing pointers to all plugins required
69154 + * by inode. Inode only stores a pointer to pset. psets are "interned", that
69155 + * is, different inodes with the same set of plugins point to the same
69156 + * pset. This is achieved by storing psets in global hash table. Races are
69157 + * avoided by simple (and efficient so far) solution of never recycling psets,
69158 + * even when last inode pointing to it is destroyed.
69159 + *
69160 + */
69161 +
69162 +#include "../debug.h"
69163 +
69164 +#include "plugin_set.h"
69165 +
69166 +#include <linux/slab.h>
69167 +#include <linux/stddef.h>
69168 +
69169 +/* slab for plugin sets */
69170 +static kmem_cache_t *plugin_set_slab;
69171 +/* locks taken around insertion into ps_table; plugin_set_field() picks one by (hashval & 7) */
69172 +static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
69173 +       [0 ... 7] = SPIN_LOCK_UNLOCKED
69174 +};
69175 +
69176 +/* hash table support */
69177 +/* number of buckets; calculate_hash() masks with (PS_TABLE_SIZE - 1), so keep it a power of two */
69178 +#define PS_TABLE_SIZE (32)
69179 +/* map a pointer to the ->hashval key back to the plugin_set that embeds it */
69180 +static inline plugin_set *
69181 +cast_to(const unsigned long * a)
69182 +{
69183 +       return container_of(a, plugin_set, hashval);
69184 +}
69185 +/* equality callback for the ps hash table: non-zero iff ->hashval and every plugin pointer match (->link ignored) */
69186 +static inline int
69187 +pseq(const unsigned long * a1, const unsigned long * a2)
69188 +{
69189 +       plugin_set *set1;
69190 +       plugin_set *set2;
69191 +
69192 +       /* make sure fields are not missed in the code below: the build
69193 +          breaks when plugin_set's size stops matching this sum */
69193 +       cassert(sizeof *set1 ==
69194 +
69195 +               sizeof set1->hashval +
69196 +               sizeof set1->link +
69197 +
69198 +               sizeof set1->file +
69199 +               sizeof set1->dir +
69200 +               sizeof set1->perm +
69201 +               sizeof set1->formatting +
69202 +               sizeof set1->hash +
69203 +               sizeof set1->fibration +
69204 +               sizeof set1->sd +
69205 +               sizeof set1->dir_item +
69206 +               sizeof set1->crypto +
69207 +               sizeof set1->digest +
69208 +               sizeof set1->compression);
69209 +
69210 +       set1 = cast_to(a1);
69211 +       set2 = cast_to(a2);
69212 +       return
69213 +               set1->hashval == set2->hashval &&
69214 +
69215 +               set1->file == set2->file &&
69216 +               set1->dir == set2->dir &&
69217 +               set1->perm == set2->perm &&
69218 +               set1->formatting == set2->formatting &&
69219 +               set1->hash == set2->hash &&
69220 +               set1->fibration == set2->fibration &&
69221 +               set1->sd == set2->sd &&
69222 +               set1->dir_item == set2->dir_item &&
69223 +               set1->crypto == set2->crypto &&
69224 +               set1->digest == set2->digest &&
69225 +               set1->compression == set2->compression;
69226 +}
69227 +/* add the pointer stored in @set->@field, shifted right by 2, to the running sum @hash */
69228 +#define HASH_FIELD(hash, set, field)           \
69229 +({                                             \
69230 +        (hash) += (unsigned long)(set)->field >> 2;    \
69231 +})
69232 +/* bucket index of @set in [0, PS_TABLE_SIZE): sum over all plugin pointers; ->hashval and ->link take no part */
69233 +static inline unsigned long calculate_hash(const plugin_set *set)
69234 +{
69235 +       unsigned long result;
69236 +
69237 +       result = 0;
69238 +       HASH_FIELD(result, set, file);
69239 +       HASH_FIELD(result, set, dir);
69240 +       HASH_FIELD(result, set, perm);
69241 +       HASH_FIELD(result, set, formatting);
69242 +       HASH_FIELD(result, set, hash);
69243 +       HASH_FIELD(result, set, fibration);
69244 +       HASH_FIELD(result, set, sd);
69245 +       HASH_FIELD(result, set, dir_item);
69246 +       HASH_FIELD(result, set, crypto);
69247 +       HASH_FIELD(result, set, digest);
69248 +       HASH_FIELD(result, set, compression);
69249 +       return result & (PS_TABLE_SIZE - 1);
69250 +}
69251 +/* hash callback: the key is ->hashval, which already holds the value from calculate_hash(); @table is unused */
69252 +static inline unsigned long
69253 +pshash(ps_hash_table *table, const unsigned long * a)
69254 +{
69255 +       return *a;
69256 +}
69257 +
69258 +/* The hash table definition. KMALLOC/KFREE exist only around it (presumably its allocator hooks -- confirm) */
69259 +#define KMALLOC(size) kmalloc((size), GFP_KERNEL)
69260 +#define KFREE(ptr, size) kfree(ptr)
69261 +TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash, pseq);
69262 +#undef KFREE
69263 +#undef KMALLOC
69264 +/* ps_table holds the interned psets; empty_set is the static all-NULL set handed out by plugin_set_get_empty() */
69265 +static ps_hash_table ps_table;
69266 +static plugin_set empty_set = {
69267 +       .hashval            = 0,
69268 +       .file               = NULL,
69269 +       .dir                = NULL,
69270 +       .perm               = NULL,
69271 +       .formatting         = NULL,
69272 +       .hash               = NULL,
69273 +       .fibration          = NULL,
69274 +       .sd                 = NULL,
69275 +       .dir_item           = NULL,
69276 +       .crypto             = NULL,
69277 +       .digest             = NULL,
69278 +       .compression        = NULL,
69279 +       .link               = { NULL }
69280 +};
69281 +/* return the statically allocated set in which every plugin pointer is NULL */
69282 +reiser4_internal plugin_set *plugin_set_get_empty(void)
69283 +{
69284 +       return &empty_set;
69285 +}
69286 +/* intentionally empty: psets are never recycled (see the comment at the top of this file) */
69287 +reiser4_internal void plugin_set_put(plugin_set *set)
69288 +{
69289 +}
69290 +/* "clone" simply hands back @set itself; nothing is copied or counted */
69291 +reiser4_internal plugin_set *plugin_set_clone(plugin_set *set)
69292 +{
69293 +       return set;
69294 +}
69295 +/* address of the word located @offset bytes into @set, typed as unsigned long so any member can be read or written */
69296 +static inline unsigned long *
69297 +pset_field(plugin_set *set, int offset)
69298 +{
69299 +       return (unsigned long *)(((char *)set) + offset);
69300 +}
69301 +/* Point *@set at the interned pset that differs from the old one only in the word at @offset, which becomes @val. */
69302 +static int plugin_set_field(plugin_set **set, const unsigned long val, const int offset)
69303 +{
69304 +       unsigned long      *spot;
69305 +       spinlock_t *lock;
69306 +       plugin_set  replica;
69307 +       plugin_set *twin;
69308 +       plugin_set *psal;
69309 +       plugin_set *orig;       /* NOTE(review): assigned below but never read */
69310 +
69311 +       assert("nikita-2902", set != NULL);
69312 +       assert("nikita-2904", *set != NULL);
69313 +
69314 +       spot = pset_field(*set, offset);
69315 +       if (unlikely(*spot == val))
69316 +               return 0;       /* member already has the requested value */
69317 +       /* build the wanted set on the stack, then look for an already interned twin */
69318 +       replica = *(orig = *set);
69319 +       *pset_field(&replica, offset) = val;
69320 +       replica.hashval = calculate_hash(&replica);
69321 +       rcu_read_lock();
69322 +       twin = ps_hash_find(&ps_table, &replica.hashval);
69323 +       if (unlikely(twin == NULL)) {
69324 +               rcu_read_unlock();
69325 +               psal = kmem_cache_alloc(plugin_set_slab, GFP_KERNEL);   /* done before the spinlock is taken */
69326 +               if (psal == NULL)
69327 +                       return RETERR(-ENOMEM); /* *set is left unchanged */
69328 +               *psal = replica;
69329 +               lock = &plugin_set_lock[replica.hashval & 7];
69330 +               spin_lock(lock);
69331 +               twin = ps_hash_find(&ps_table, &replica.hashval);       /* look again, now under the lock */
69332 +               if (likely(twin == NULL)) {
69333 +                       *set = psal;
69334 +                       ps_hash_insert_rcu(&ps_table, psal);
69335 +               } else {
69336 +                       *set = twin;    /* an equal set was inserted meanwhile: use it, drop ours */
69337 +                       kmem_cache_free(plugin_set_slab, psal);
69338 +               }
69339 +               spin_unlock(lock);
69340 +       } else {
69341 +               rcu_read_unlock();
69342 +               *set = twin;
69343 +       }
69344 +       return 0;
69345 +}
69346 +/* per-member descriptor, indexed by pset_member: byte offset of the member in plugin_set and its plugin type */
69347 +static struct {
69348 +       int                 offset;
69349 +       reiser4_plugin_type type;
69350 +} pset_descr[PSET_LAST] = {
69351 +       [PSET_FILE] = {
69352 +               .offset = offsetof(plugin_set, file),
69353 +               .type   = REISER4_FILE_PLUGIN_TYPE
69354 +       },
69355 +       [PSET_DIR] = {
69356 +               .offset = offsetof(plugin_set, dir),
69357 +               .type   = REISER4_DIR_PLUGIN_TYPE
69358 +       },
69359 +       [PSET_PERM] = {
69360 +               .offset = offsetof(plugin_set, perm),
69361 +               .type   = REISER4_PERM_PLUGIN_TYPE
69362 +       },
69363 +       [PSET_FORMATTING] = {
69364 +               .offset = offsetof(plugin_set, formatting),
69365 +               .type   = REISER4_FORMATTING_PLUGIN_TYPE
69366 +       },
69367 +       [PSET_HASH] = {
69368 +               .offset = offsetof(plugin_set, hash),
69369 +               .type   = REISER4_HASH_PLUGIN_TYPE
69370 +       },
69371 +       [PSET_FIBRATION] = {
69372 +               .offset = offsetof(plugin_set, fibration),
69373 +               .type   = REISER4_FIBRATION_PLUGIN_TYPE
69374 +       },
69375 +       [PSET_SD] = {
69376 +               .offset = offsetof(plugin_set, sd),
69377 +               .type   = REISER4_ITEM_PLUGIN_TYPE      /* stat-data is an item plugin */
69378 +       },
69379 +       [PSET_DIR_ITEM] = {
69380 +               .offset = offsetof(plugin_set, dir_item),
69381 +               .type   = REISER4_ITEM_PLUGIN_TYPE      /* same plugin type as PSET_SD */
69382 +       },
69383 +       [PSET_CRYPTO] = {
69384 +               .offset = offsetof(plugin_set, crypto),
69385 +               .type   = REISER4_CRYPTO_PLUGIN_TYPE
69386 +       },
69387 +       [PSET_DIGEST] = {
69388 +               .offset = offsetof(plugin_set, digest),
69389 +               .type   = REISER4_DIGEST_PLUGIN_TYPE
69390 +       },
69391 +       [PSET_COMPRESSION] = {
69392 +               .offset = offsetof(plugin_set, compression),
69393 +               .type   = REISER4_COMPRESSION_PLUGIN_TYPE
69394 +       }
69395 +};
69396 +/* generic setter: install @plugin as member @memb of *@set via plugin_set_field(); returns its result */
69397 +int pset_set(plugin_set **set, pset_member memb, reiser4_plugin *plugin)
69398 +{
69399 +       assert("nikita-3492", set != NULL);
69400 +       assert("nikita-3493", *set != NULL);
69401 +       assert("nikita-3494", plugin != NULL);
69402 +       assert("nikita-3495", 0 <= memb && memb < PSET_LAST);
69403 +       assert("nikita-3496", plugin->h.type_id == pset_member_to_type(memb));
69404 +
69405 +       return plugin_set_field(set,
69406 +                               (unsigned long)plugin, pset_descr[memb].offset);
69407 +}
69408 +/* generic getter: return the plugin pointer stored as member @memb of @set (located through pset_descr[]) */
69409 +reiser4_plugin *pset_get(plugin_set *set, pset_member memb)
69410 +{
69411 +       assert("nikita-3497", set != NULL);
69412 +       assert("nikita-3498", 0 <= memb && memb < PSET_LAST);
69413 +
69414 +       return *(reiser4_plugin **)(((char *)set) + pset_descr[memb].offset);
69415 +}
69416 +/* plugin type expected in member @memb; the range of @memb is checked by assertion only */
69417 +reiser4_plugin_type pset_member_to_type(pset_member memb)
69418 +{
69419 +       assert("nikita-3501", 0 <= memb && memb < PSET_LAST);
69420 +       return pset_descr[memb].type;
69421 +}
69422 +
69423 +reiser4_plugin_type pset_member_to_type_unsafe(pset_member memb)
69424 +{
69425 +       /* anything outside [0, PSET_LAST) yields the REISER4_PLUGIN_TYPES marker */
69426 +       if (memb < 0 || memb >= PSET_LAST)
69427 +               return REISER4_PLUGIN_TYPES;
69428 +       return pset_descr[memb].type;
69429 +}
69430 +/* define plugin_set_<field>(): typed wrapper over plugin_set_field(); cassert checks pointer fits unsigned long */
69431 +#define DEFINE_PLUGIN_SET(type, field)                                 \
69432 +reiser4_internal int plugin_set_ ## field(plugin_set **set, type *val) \
69433 +{                                                                      \
69434 +       cassert(sizeof val == sizeof(unsigned long));                   \
69435 +       return plugin_set_field(set, (unsigned long)val,                \
69436 +                               offsetof(plugin_set, field));           \
69437 +}
69438 +
69439 +DEFINE_PLUGIN_SET(file_plugin, file)
69440 +DEFINE_PLUGIN_SET(dir_plugin, dir)
69441 +DEFINE_PLUGIN_SET(perm_plugin, perm)
69442 +DEFINE_PLUGIN_SET(formatting_plugin, formatting)
69443 +DEFINE_PLUGIN_SET(hash_plugin, hash)
69444 +DEFINE_PLUGIN_SET(fibration_plugin, fibration)
69445 +DEFINE_PLUGIN_SET(item_plugin, sd)
69446 +DEFINE_PLUGIN_SET(item_plugin, dir_item)
69447 +DEFINE_PLUGIN_SET(crypto_plugin, crypto)
69448 +DEFINE_PLUGIN_SET(digest_plugin, digest)
69449 +DEFINE_PLUGIN_SET(compression_plugin, compression)
69450 +
69451 +reiser4_internal int plugin_set_init(void)
69452 +{
69453 +       int result;
69454 +
69455 +       result = ps_hash_init(&ps_table, PS_TABLE_SIZE, NULL);
69456 +       if (result == 0) {
69457 +               plugin_set_slab = kmem_cache_create("plugin_set",
69458 +                                                   sizeof (plugin_set), 0,
69459 +                                                   SLAB_HWCACHE_ALIGN,
69460 +                                                   NULL, NULL);
69461 +               if (plugin_set_slab == NULL)
69462 +                       result = RETERR(-ENOMEM);
69463 +       }
69464 +       return result;
69465 +}
69466 +/* tear down what plugin_set_init() created: the slab first, then the hash table */
69467 +reiser4_internal void plugin_set_done(void)
69468 +{
69469 +       /* NOTE: psets still in the table are not released here. TODO: scan hash table and recycle all objects. */
69470 +       kmem_cache_destroy(plugin_set_slab);
69471 +       ps_hash_done(&ps_table);
69472 +}
69473 +
69474 +
69475 +/* Make Linus happy.
69476 +   Local variables:
69477 +   c-indentation-style: "K&R"
69478 +   mode-name: "LC"
69479 +   c-basic-offset: 8
69480 +   tab-width: 8
69481 +   fill-column: 120
69482 +   End:
69483 +*/
69484 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/plugin_set.h linux-2.6.8-rc3-a/fs/reiser4/plugin/plugin_set.h
69485 --- linux-2.6.8-rc3/fs/reiser4/plugin/plugin_set.h      1970-01-01 03:00:00.000000000 +0300
69486 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/plugin_set.h    2004-08-05 21:20:52.867705979 +0400
69487 @@ -0,0 +1,81 @@
69488 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
69489 +
69490 +/* plugin-sets. see fs/reiser4/plugin/plugin_set.c for details */
69491 +
69492 +#if !defined( __PLUGIN_SET_H__ )
69493 +#define __PLUGIN_SET_H__
69494 +
69495 +#include "../type_safe_hash.h"
69496 +#include "plugin.h"
69497 +
69498 +#include <linux/rcupdate.h>
69499 +
69500 +struct plugin_set;
69501 +typedef struct plugin_set plugin_set;
69502 +
69503 +TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
69504 +/* set of plugins; plugin_set.c reads/writes each plugin pointer as an unsigned long at its byte offset */
69505 +struct plugin_set {
69506 +       unsigned long               hashval;    /* hash key; set from calculate_hash() in plugin_set.c */
69507 +       /* plugin of file */
69508 +       file_plugin        *file;
69509 +       /* plugin of dir */
69510 +       dir_plugin         *dir;
69511 +       /* perm plugin for this file */
69512 +       perm_plugin        *perm;
69513 +       /* tail policy plugin. Only meaningful for regular files */
69514 +       formatting_plugin  *formatting;
69515 +       /* hash plugin. Only meaningful for directories. */
69516 +       hash_plugin        *hash;
69517 +       /* fibration plugin. Only meaningful for directories. */
69518 +       fibration_plugin   *fibration;
69519 +       /* plugin of stat-data */
69520 +       item_plugin        *sd;
69521 +       /* plugin of items a directory is built of */
69522 +       item_plugin        *dir_item;
69523 +       /* crypto plugin */
69524 +       crypto_plugin      *crypto;
69525 +       /* digest plugin */
69526 +       digest_plugin      *digest;
69527 +       /* compression plugin */
69528 +       compression_plugin *compression;
69529 +       ps_hash_link        link;       /* linkage into the ps hash table */
69530 +};
69531 +
69532 +extern plugin_set *plugin_set_get_empty(void);
69533 +extern plugin_set *plugin_set_clone(plugin_set *set);
69534 +extern void        plugin_set_put(plugin_set *set);
69535 +
69536 +extern int plugin_set_file       (plugin_set **set, file_plugin *file);
69537 +extern int plugin_set_dir        (plugin_set **set, dir_plugin *file);
69538 +extern int plugin_set_perm       (plugin_set **set, perm_plugin *file);
69539 +extern int plugin_set_formatting (plugin_set **set, formatting_plugin *file);
69540 +extern int plugin_set_hash       (plugin_set **set, hash_plugin *file);
69541 +extern int plugin_set_fibration  (plugin_set **set, fibration_plugin *file);
69542 +extern int plugin_set_sd         (plugin_set **set, item_plugin *file);
69543 +extern int plugin_set_dir_item   (plugin_set **set, item_plugin *file);
69544 +extern int plugin_set_crypto     (plugin_set **set, crypto_plugin *file);
69545 +extern int plugin_set_digest     (plugin_set **set, digest_plugin *file);
69546 +extern int plugin_set_compression(plugin_set **set, compression_plugin *file);
69547 +
69548 +extern int  plugin_set_init(void);
69549 +extern void plugin_set_done(void);
69550 +
69551 +extern int pset_set(plugin_set **set, pset_member memb, reiser4_plugin *plugin);
69552 +extern reiser4_plugin *pset_get(plugin_set *set, pset_member memb);
69553 +
69554 +extern reiser4_plugin_type pset_member_to_type(pset_member memb);
69555 +extern reiser4_plugin_type pset_member_to_type_unsafe(pset_member memb);
69556 +
69557 +/* __PLUGIN_SET_H__ */
69558 +#endif
69559 +
69560 +/* Make Linus happy.
69561 +   Local variables:
69562 +   c-indentation-style: "K&R"
69563 +   mode-name: "LC"
69564 +   c-basic-offset: 8
69565 +   tab-width: 8
69566 +   fill-column: 120
69567 +   End:
69568 +*/
69569 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/pseudo/pseudo.c linux-2.6.8-rc3-a/fs/reiser4/plugin/pseudo/pseudo.c
69570 --- linux-2.6.8-rc3/fs/reiser4/plugin/pseudo/pseudo.c   1970-01-01 03:00:00.000000000 +0300
69571 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/pseudo/pseudo.c 2004-08-05 21:20:53.068663591 +0400
69572 @@ -0,0 +1,1804 @@
69573 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69574 + * reiser4/README */
69575 +
69576 +/* Handling of "pseudo" files representing unified access to meta data in
69577 +   reiser4. */
69578 +
69579 +/*
69580 + * See http://namesys.com/v4/v4.html, and especially
69581 + * http://namesys.com/v4/pseudo.html for basic information about reiser4
69582 + * pseudo files, access to meta-data, reiser4() system call, etc.
69583 + *
69584 + * Pseudo files should be accessible from both reiser4() system call and
69585 + * normal POSIX calls.
69586 + *
69587 + * OVERVIEW
69588 + *
69589 + *     Pseudo files provide access to various functionality through file
69590 + *     system name space. As such they are similar to pseudo file systems
69591 + *     already existing in UNIX and Linux: procfs, sysfs, etc. But pseudo
69592 + *     files are embedded into name space of Reiser4---real block device based
69593 + *     file system, and are more tightly integrated with it. In particular,
69594 + *     some pseudo files are "attached" to other files (either "real" or also
69595 + *     pseudo), by being accessible through path names of the form
69596 + *
69597 + *         "a/b/c/something"
69598 + *
69599 + *     Here object accessible through "a/b/c/something" is attached to the
69600 + *     object accessible through "a/b/c" , and the latter is said to be the
69601 + *     "host" object of the former.
69602 + *
69603 + *     Object can have multiple pseudo files attached to it, distinguished by
69604 + *     the last component of their names "something", "somethingelse",
69605 + *     etc.
69606 + *
69607 + *     (Note however, that currently "real" files have only one single pseudo
69608 + *     file attached to them, viz. pseudo directory "metas". This directory in
69609 + *     turn contains all other pseudo files pertaining to the real file that
69610 + *     "metas" is attached to. To avoid referencing "metas" all the time
69611 + *     "a/b/c" is called a host of "/a/b/c/metas/something". This violates
69612 + *     definition above, but is convenient.)
69613 + *
69614 + *     Moreover, in addition to the purely pseudo files (that is, file system
69615 + *     objects whose content (as available through read(2) system call) is not
69616 + *     backed by any kind of persistent storage), extended file attributes
69617 + *     (see attr(5) on Linux, and http://acl.bestbits.at/) including security
69618 + *     attributes such as ACLs are also available through file system name
69619 + *     space.
69620 + *
69621 + *     As a result each file system object has a sub-name-space rooted at it,
69622 + *     which is in striking contrast with traditional UNIX file system, where
69623 + *     only directories have sub-objects and all other types of files (regular,
69624 + *     FIFO-s, devices, and symlinks) are leaves.
69625 + *
69626 + *     For the sake of objectivity it should be mentioned that this is not
69627 + *     _completely_ new development in file system design, see
69628 + *     http://docs.sun.com/db/doc/816-0220/6m6nkorp9?a=view
69629 + *
69630 + *     In particular, as each object has sub-objects, name space tree is
69631 + *     infinite in both extent (number of reachable objects) and depth.
69632 + *
69633 + *     Some pseudo files are "built-in". They are present as sub-objects in
69634 + *     each file system object, unless specifically disabled.
69635 + *
69636 + *     Built-in pseudo files are implemented in this file and described at
69637 + *     http://namesys.com/v4/pseudo.html
69638 + *
69639 + * IMPLEMENTATION
69640 + *
69641 + *     Pseudo files are implemented as normal inodes, living in the same super
69642 + *     block as other inodes for reiser4 file system. Their inode numbers are
69643 + *     generated by fs/inode.c:new_inode() function and are not persistent (in
69644 + *     the sense that they are not guaranteed to be the same after
69645 + *     remount). To avoid clashes with "normal" inodes, all pseudo inodes are
69646 + *     placed into otherwise unused locality (for example, 0), hence allowing
69647 + *     reiser4_inode_find_actor() to tell them from normal inodes.
69648 + *
69649 + *     All pseudo inodes share the same object plugin
69650 + *     PSEUDO_FILE_PLUGIN_ID. In pseudo-inode specific part of reiser4_inode
69651 + *     (pseudo_info), two things are stored:
69652 + *
69653 + *         1. pointer to the inode of the "host object" (for /a/b/c/metas/acl,
69654 + *         /a/b/c is the host object)
69655 + *
69656 + *         2. pointer to pseudo plugin, used by PSEUDO_FILE_PLUGIN_ID to
69657 + *         implement VFS operations.
69658 + *
69659 + *     This design has following advantages:
69660 + *
69661 + *         1. provides for easy addition of new pseudo files without going
69662 + *         through writing whole new object plugin.
69663 + *
69664 + *         2. allows sys_reiser4() to be implemented by directly invoking
69665 + *         pseudo plugin methods.
69666 + *
69667 + */
69668 +
69669 +#include "../../inode.h"
69670 +#include "../../debug.h"
69671 +#include "../plugin.h"
69672 +
69673 +#include "pseudo.h"
69674 +
69675 +static int init_pseudo(struct inode *parent, struct inode *pseudo,
69676 +                      pseudo_plugin *pplug, const char *name);
69677 +
69678 +static struct inode *add_pseudo(struct inode *parent,
69679 +                               pseudo_plugin *pplug, struct dentry **d);
69680 +
69681 +/*
69682 + * helper method: set ->datum field in the pseudo file specific portion of
69683 + * reiser4 inode.
69684 + */
69685 +static void pseudo_set_datum(struct inode *pseudo, unsigned long datum)
69686 +{
69687 +       reiser4_inode_data(pseudo)->file_plugin_data.pseudo_info.datum = datum;
69688 +}
69689 +
69690 +/*
69691 + * return id of pseudo file plugin for this inode @p
69692 + */
69693 +static int pseudo_id(struct inode *p)
69694 +{
69695 +       return reiser4_inode_data(p)->file_plugin_data.pseudo_info.plugin->h.id;
69696 +}
69697 +
69698 +/*
69699 + * helper method used to implement ->lookup() method of pseudo files.
69700 + *
69701 + * Try to find a pseudo plugin that matches given name (stored in @dentry) and
69702 + * has ->parent field equal to @id.
69703 + *
69704 + * Convention is that ->parent field is set to the id of the pseudo plugin of
69705 + * the parent pseudo file in the hierarchy (that is, plugin for
69706 + * "a/metas/foo/bar" has ->parent set to the plugin id of "a/metas/foo"), with
69707 + * the exception of "a/metas" that uses special reserved value TOP_LEVEL for
69708 + * ->parent.
69709 + */
69710 +static int
69711 +lookup_of_plugin(struct inode *parent, int id, struct dentry **dentry)
69712 +{
69713 +       const char     *name;
69714 +       struct inode   *pseudo;
69715 +       reiser4_plugin *plugin;
69716 +       int             result;
69717 +
69718 +       name = (*dentry)->d_name.name;
69719 +       pseudo = ERR_PTR(-ENOENT);
69720 +
69721 +       /* scan all pseudo file plugins and check each */
69722 +       for_all_plugins(REISER4_PSEUDO_PLUGIN_TYPE, plugin) {
69723 +               pseudo_plugin *pplug;
69724 +
69725 +               pplug = &plugin->pseudo;
69726 +               if (pplug->parent == id &&
69727 +                   pplug->try != NULL && pplug->try(pplug, parent, name)) {
69728 +                       pseudo = add_pseudo(parent, pplug, dentry);
69729 +                       break;
69730 +               }
69731 +       }
69732 +       if (!IS_ERR(pseudo))
69733 +               result = 0;
69734 +       else
69735 +               result = PTR_ERR(pseudo);
69736 +       return result;
69737 +}
69738 +
69739 +/*
69740 + * implement ->lookup() method using convention described in the comment for
69741 + * lookup_of_plugin() function.
69742 + */
69743 +static int lookup_table(struct inode *parent, struct dentry ** dentry)
69744 +{
69745 +       assert("nikita-3511", parent != NULL);
69746 +       assert("nikita-3512", dentry != NULL);
69747 +       assert("nikita-3513",
69748 +              inode_file_plugin(parent)->h.id == PSEUDO_FILE_PLUGIN_ID);
69749 +
69750 +       /*
69751 +        * call lookup_of_plugin() passing id of pseudo plugin for @parent as
69752 +        * "id" parameter.
69753 +        */
69754 +       return lookup_of_plugin(parent, pseudo_id(parent), dentry);
69755 +}
69756 +
69757 +/*
69758 + * helper to implement ->readdir() for pseudo files: emits ".", "..", then the
69759 + * readdirable plugins matching by ->parent, as in lookup_of_plugin().
69760 + */
69761 +static int
69762 +readdir_table(struct file *f, void *dirent, filldir_t filld)
69763 +{
69764 +       loff_t off;
69765 +       ino_t  ino;
69766 +       int    skip;
69767 +       int    id;
69768 +
69769 +       struct inode *inode;
69770 +       reiser4_plugin *plugin;
69771 +
69772 +       off = f->f_pos;
69773 +       if (off < 0)
69774 +               return 0;
69775 +
69776 +       inode = f->f_dentry->d_inode;
69777 +       switch ((int)off) {
69778 +               /*
69779 +                * first, return dot and dotdot (positions 0 and 1)
69780 +                */
69781 +       case 0:
69782 +               ino = inode->i_ino;
69783 +               if (filld(dirent, ".", 1, off, ino, DT_DIR) < 0)
69784 +                       break;
69785 +               ++ off;
69786 +               /* fallthrough */
69787 +       case 1:
69788 +               ino = parent_ino(f->f_dentry);
69789 +               if (filld(dirent, "..", 2, off, ino, DT_DIR) < 0)
69790 +                       break;
69791 +               ++ off;
69792 +               /* fallthrough */
69793 +       default:
69794 +               skip = off - 2;
69795 +               id = pseudo_id(inode);
69796 +               /* then, scan all pseudo plugins with matching ->parent,
69797 +                * skipping the first (off - 2) matches */
69798 +               for_all_plugins(REISER4_PSEUDO_PLUGIN_TYPE, plugin) {
69799 +                       pseudo_plugin *pplug;
69800 +                       const char *name;
69801 +
69802 +                       pplug = &plugin->pseudo;
69803 +                       if (pplug->parent == id && pplug->readdirable) {
69804 +                               if (skip == 0) {
69805 +                                       name = pplug->h.label;
69806 +                                       /*
69807 +                                        * match found: feed it to @filld, with
69808 +                                        * off + (long)f as its inode number
69809 +                                        */
69810 +                                       if (filld(dirent, name, strlen(name),
69811 +                                                 off,
69812 +                                                 off + (long)f, DT_REG) < 0)
69813 +                                               break;
69814 +                                       ++ off;
69815 +                               } else
69816 +                                       -- skip;
69817 +                       }
69818 +               }
69819 +       }
69820 +       f->f_pos = off;
69821 +       return 0;
69822 +}
69823 +
69824 +/*
69825 + * special value of ->parent field in pseudo file plugin used by "metas" top
69826 + * level pseudo directory.
69827 + */
69828 +#define TOP_LEVEL (-1)
69829 +
69830 +/*
69831 + * try to look up built-in pseudo file by its name.
69832 + */
69833 +reiser4_internal int
69834 +lookup_pseudo_file(struct inode *parent, struct dentry **dentry)
69835 +{
69836 +       assert("nikita-2999", parent != NULL);
69837 +       assert("nikita-3000", dentry != NULL);
69838 +
69839 +       /* if pseudo files are disabled for this file system bail out */
69840 +       if (reiser4_is_set(parent->i_sb, REISER4_NO_PSEUDO))
69841 +               return RETERR(-ENOENT);
69842 +       else
69843 +               return lookup_of_plugin(parent, TOP_LEVEL, dentry);
69844 +}
69845 +
69846 +/* create inode for pseudo file with plugin @pplug, and add it to the @parent
69847 + * under name @d */
69848 +static struct inode *add_pseudo(struct inode *parent,
69849 +                               pseudo_plugin *pplug, struct dentry **d)
69850 +{
69851 +       struct inode *pseudo;
69852 +
69853 +       pseudo = new_inode(parent->i_sb);
69854 +       if (pseudo != NULL) {
69855 +               int result;
69856 +
69857 +               result = init_pseudo(parent, pseudo, pplug, (*d)->d_name.name);
69858 +               if (result != 0)
69859 +                       pseudo = ERR_PTR(result);
69860 +               else
69861 +                       *d = d_splice_alias(pseudo, *d);
69862 +       } else
69863 +               pseudo = ERR_PTR(RETERR(-ENOMEM));
69864 +       return pseudo;
69865 +}
69866 +
69867 +
69868 +/* helper function: return host object of @inode pseudo file */
69869 +static struct inode *get_inode_host(struct inode *inode)
69870 +{
69871 +       assert("nikita-3510",
69872 +              inode_file_plugin(inode)->h.id == PSEUDO_FILE_PLUGIN_ID);
69873 +       return reiser4_inode_data(inode)->file_plugin_data.pseudo_info.host;
69874 +}
69875 +
69876 +/* helper function: return parent object of @inode pseudo file */
69877 +static struct inode *get_inode_parent(struct inode *inode)
69878 +{
69879 +       assert("nikita-3510",
69880 +              inode_file_plugin(inode)->h.id == PSEUDO_FILE_PLUGIN_ID);
69881 +       return reiser4_inode_data(inode)->file_plugin_data.pseudo_info.parent;
69882 +}
69883 +
69884 +/*
69885 + * initialize pseudo file @pseudo to be child of @parent, with plugin @pplug
69886 + * and name @name.
69887 + */
69888 +static int
69889 +init_pseudo(struct inode *parent, struct inode *pseudo,
69890 +           pseudo_plugin *pplug, const char *name)
69891 +{
69892 +       int result;
69893 +       struct inode  *host;
69894 +       reiser4_inode *idata;
69895 +       reiser4_object_create_data data;
69896 +       static const oid_t pseudo_locality = 0x0ull;
69897 +
69898 +       idata = reiser4_inode_data(pseudo);
69899 +       /* all pseudo files live in special reserved locality */
69900 +       idata->locality_id = pseudo_locality;
69901 +
69902 +       /*
69903 +        * setup ->parent and ->host fields
69904 +        */
69905 +       if (pplug->parent != TOP_LEVEL)
69906 +               /* host of "a/metas/b/c" is "a" */
69907 +               host = get_inode_host(parent);
69908 +       else
69909 +               /* host of "a/metas" is "a" */
69910 +               host = parent;
69911 +
69912 +       idata->file_plugin_data.pseudo_info.host   = host;
69913 +       idata->file_plugin_data.pseudo_info.parent = parent;
69914 +       idata->file_plugin_data.pseudo_info.plugin = pplug;
69915 +
69916 +       data.id   = PSEUDO_FILE_PLUGIN_ID;
69917 +       data.mode = pplug->lookup_mode;
69918 +
69919 +       plugin_set_file(&idata->pset, file_plugin_by_id(PSEUDO_FILE_PLUGIN_ID));
69920 +       /* if plugin has a ->lookup method, it means that @pseudo should
69921 +        * behave like directory. */
69922 +       if (pplug->lookup != NULL)
69923 +               plugin_set_dir(&idata->pset,
69924 +                              dir_plugin_by_id(PSEUDO_DIR_PLUGIN_ID));
69925 +
69926 +       /* perform standard plugin initialization */
69927 +       result = inode_file_plugin(pseudo)->set_plug_in_inode(pseudo,
69928 +                                                             parent, &data);
69929 +       if (result != 0) {
69930 +               warning("nikita-3203", "Cannot install pseudo plugin");
69931 +               print_plugin("plugin", pseudo_plugin_to_plugin(pplug));
69932 +               return result;
69933 +       }
69934 +
69935 +       /* inherit permission plugin from parent, */
69936 +       grab_plugin(pseudo, parent, PSET_PERM);
69937 +       /* and credentials... */
69938 +       pseudo->i_uid = parent->i_uid;
69939 +       pseudo->i_gid = parent->i_gid;
69940 +
69941 +       pseudo->i_nlink = 1;
69942 +       /* insert inode into VFS hash table */
69943 +       insert_inode_hash(pseudo);
69944 +       return 0;
69945 +}
69946 +
69947 +/* helper function: return host object by file descriptor */
69948 +static struct inode *get_pseudo_host(struct file *file)
69949 +{
69950 +       struct inode *inode;
69951 +
69952 +       inode = file->f_dentry->d_inode;
69953 +       return get_inode_host(inode);
69954 +}
69955 +
69956 +/* helper function: return host object by seq_file */
69957 +static struct inode *get_seq_pseudo_host(struct seq_file *seq)
69958 +{
69959 +       struct file *file;
69960 +
69961 +       file = seq->private;
69962 +       return get_pseudo_host(file);
69963 +}
69964 +
69965 +/*
69966 + * implementation of ->try method for pseudo files with fixed names.
69967 + */
69968 +static int try_by_label(pseudo_plugin *pplug,
69969 +                       const struct inode *parent, const char *name)
69970 +{
69971 +       return !strcmp(name, pplug->h.label);
69972 +}
69973 +
69974 +/*
69975 + * read method for the "metas/uid" pseudo file.
69976 + */
69977 +static int show_uid(struct seq_file *seq, void *cookie)
69978 +{
69979 +       seq_printf(seq, "%lu", (long unsigned)get_seq_pseudo_host(seq)->i_uid);
69980 +       return 0;
69981 +}
69982 +
69983 +/* helper: check permissions required to modify metas/[ug]id */
69984 +static int check_perm(struct inode *inode)
69985 +{
69986 +       if (IS_RDONLY(inode))
69987 +               return RETERR(-EROFS);
69988 +       if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
69989 +               return RETERR(-EPERM);
69990 +       return 0;
69991 +}
69992 +
69993 +/*
69994 + * update [ug]id of @inode through notify_change() on @dentry; an id of -1
69995 + * means "leave unchanged". Called by "metas/[ug]id" write methods.
69996 + */
69997 +static int update_ugid(struct dentry *dentry, struct inode *inode,
69998 +                      uid_t uid, gid_t gid)
69999 +{
70000 +       int result;
70001 +
70002 +       /* logic COPIED from fs/open.c:chown_common() */
70003 +       result = check_perm(inode);
70004 +       if (result == 0) {
70005 +               struct iattr newattrs;
70006 +
70007 +               newattrs.ia_valid =  ATTR_CTIME;
70008 +               if (uid != (uid_t) -1) {
70009 +                       newattrs.ia_valid |= ATTR_UID;
70010 +                       newattrs.ia_uid = uid;
70011 +               }
70012 +               if (gid != (gid_t) -1) {
70013 +                       newattrs.ia_valid |= ATTR_GID;
70014 +                       newattrs.ia_gid = gid;
70015 +               }
70016 +               if (!S_ISDIR(inode->i_mode))
70017 +                       newattrs.ia_valid |= ATTR_KILL_SUID|ATTR_KILL_SGID;
70018 +               down(&inode->i_sem);
70019 +               result = notify_change(dentry, &newattrs);
70020 +               up(&inode->i_sem);
70021 +       }
70022 +       return result;
70023 +}
70024 +
70025 +/*
70026 + * write method for the "metas/uid": extract uid from user-supplied buffer,
70027 + * and update uid
70028 + */
70029 +static int store_uid(struct file *file, const char *buf)
70030 +{
70031 +       uid_t uid;
70032 +       int result;
70033 +
70034 +       if (sscanf(buf, "%i", &uid) == 1) {
70035 +               struct inode *host;
70036 +
70037 +               host = get_pseudo_host(file);
70038 +               result = update_ugid(file->f_dentry->d_parent->d_parent,
70039 +                                    host, uid, -1);
70040 +       } else
70041 +               result = RETERR(-EINVAL);
70042 +       return result;
70043 +}
70044 +
70045 +/*
70046 + * read method for the "metas/gid" pseudo file.
70047 + */
70048 +static int show_gid(struct seq_file *seq, void *cookie)
70049 +{
70050 +       seq_printf(seq, "%lu", (long unsigned)get_seq_pseudo_host(seq)->i_gid);
70051 +       return 0;
70052 +}
70053 +
70054 +/*
70055 + * write method for the "metas/gid": extract gid from user-supplied buffer,
70056 + * and update gid
70057 + */
70058 +static int get_gid(struct file *file, const char *buf)
70059 +{
70060 +       gid_t gid;
70061 +       int result;
70062 +
70063 +       if (sscanf(buf, "%i", &gid) == 1) {
70064 +               struct inode *host;
70065 +
70066 +               host = get_pseudo_host(file);
70067 +               result = update_ugid(file->f_dentry->d_parent->d_parent,
70068 +                                    host, -1, gid);
70069 +       } else
70070 +               result = RETERR(-EINVAL);
70071 +       return result;
70072 +}
70073 +
70074 +/*
70075 + * read method for the "metas/oid" pseudo file
70076 + */
70077 +static int show_oid(struct seq_file *seq, void *cookie)
70078 +{
70079 +       seq_printf(seq, "%llu", get_inode_oid(get_seq_pseudo_host(seq)));
70080 +       return 0;
70081 +}
70082 +
70083 +/*
70084 + * read method for the "metas/key" pseudo file
70085 + */
70086 +static int show_key(struct seq_file *seq, void *cookie)
70087 +{
70088 +       char buf[KEY_BUF_LEN];
70089 +       reiser4_key key;
70090 +
70091 +       sprintf_key(buf, build_sd_key(get_seq_pseudo_host(seq), &key));
70092 +       seq_printf(seq, "%s", buf);
70093 +       return 0;
70094 +}
70095 +
70096 +/*
70097 + * read method for the "metas/size" pseudo file
70098 + */
70099 +static int show_size(struct seq_file *seq, void *cookie)
70100 +{
70101 +       seq_printf(seq, "%lli", get_seq_pseudo_host(seq)->i_size);
70102 +       return 0;
70103 +}
70104 +
70105 +/*
70106 + * read method for the "metas/nlink" pseudo file
70107 + */
70108 +static int show_nlink(struct seq_file *seq, void *cookie)
70109 +{
70110 +       seq_printf(seq, "%u", get_seq_pseudo_host(seq)->i_nlink);
70111 +       return 0;
70112 +}
70113 +
70114 +/*
70115 + * read method for the "metas/locality" pseudo file
70116 + */
70117 +static int show_locality(struct seq_file *seq, void *cookie)
70118 +{
70119 +       seq_printf(seq, "%llu",
70120 +                  reiser4_inode_data(get_seq_pseudo_host(seq))->locality_id);
70121 +       return 0;
70122 +}
70123 +
70124 +/*
70125 + * read method for the "metas/rwx" pseudo file
70126 + */
70127 +static int show_rwx(struct seq_file *seq, void *cookie)
70128 +{
70129 +       umode_t      m;
70130 +
70131 +       m = get_seq_pseudo_host(seq)->i_mode;
70132 +       seq_printf(seq, "%#ho %c%c%c%c%c%c%c%c%c%c",
70133 +                  m,
70134 +
70135 +                  S_ISREG(m) ? '-' :
70136 +                  S_ISDIR(m) ? 'd' :
70137 +                  S_ISCHR(m) ? 'c' :
70138 +                  S_ISBLK(m) ? 'b' :
70139 +                  S_ISFIFO(m) ? 'p' :
70140 +                  S_ISLNK(m) ? 'l' :
70141 +                  S_ISSOCK(m) ? 's' : '?',
70142 +
70143 +                  m & S_IRUSR ? 'r' : '-',
70144 +                  m & S_IWUSR ? 'w' : '-',
70145 +                  m & S_IXUSR ? 'x' : '-',
70146 +
70147 +                  m & S_IRGRP ? 'r' : '-',
70148 +                  m & S_IWGRP ? 'w' : '-',
70149 +                  m & S_IXGRP ? 'x' : '-',
70150 +
70151 +                  m & S_IROTH ? 'r' : '-',
70152 +                  m & S_IWOTH ? 'w' : '-',
70153 +                  m & S_IXOTH ? 'x' : '-');
70154 +       return 0;
70155 +}
70156 +
70157 +/*
70158 + * write method for the "metas/rwx" file. Parse permission bits from @buf and
70159 + * apply them to the host's ->i_mode; (umode_t)-1 leaves the mode unchanged.
70160 + */
70161 +static int get_rwx(struct file *file, const char *buf)
70162 +{
70163 +       umode_t rwx;
70164 +       int result;
70165 +
70166 +       if (sscanf(buf, "%hi", &rwx) == 1) {
70167 +               struct inode *host;
70168 +
70169 +               host = get_pseudo_host(file);
70170 +               result = check_perm(host);
70171 +               if (result == 0) {
70172 +                       struct iattr newattrs;
70173 +
70174 +                       down(&host->i_sem);
70175 +                       if (rwx == (umode_t) -1)
70176 +                               rwx = host->i_mode;
70177 +                       newattrs.ia_mode =
70178 +                               (rwx & S_IALLUGO) | (host->i_mode & ~S_IALLUGO);
70179 +                       newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
70180 +                       result = notify_change(file->f_dentry->d_parent->d_parent,
70181 +                                              &newattrs);
70182 +                       up(&host->i_sem);
70183 +               }
70184 +       } else
70185 +               result = RETERR(-EINVAL);
70186 +       return result;
70187 +}
70188 +
70189 +/*
70190 + * seq-methods for "metas/pseudo"
70191 + */
70192 +
70193 +/*
70194 + * start iteration over all pseudo files
70195 + */
70196 +static void * pseudos_start(struct seq_file *m, loff_t *pos)
70197 +{
70198 +       if (*pos >= LAST_PSEUDO_ID)
70199 +               return NULL;
70200 +       return pseudo_plugin_by_id(*pos);
70201 +}
70202 +
70203 +/*
70204 + * stop iteration over all pseudo files
70205 + */
70206 +static void pseudos_stop(struct seq_file *m, void *v)
70207 +{
70208 +}
70209 +
70210 +/*
70211 + * go to next pseudo file in the sequence
70212 + */
70213 +static void * pseudos_next(struct seq_file *m, void *v, loff_t *pos)
70214 +{
70215 +       ++ (*pos);
70216 +       return pseudos_start(m, pos);
70217 +}
70218 +
70219 +/*
70220 + * output information about particular pseudo file.
70221 + */
70222 +static int pseudos_show(struct seq_file *m, void *v)
70223 +{
70224 +       pseudo_plugin *pplug;
70225 +
70226 +       pplug = v;
70227 +       if (pplug->try != NULL)
70228 +               seq_printf(m, "%s\n", pplug->h.label);
70229 +       return 0;
70230 +}
70231 +
70232 +/*
70233 + * seq-methods for "metas/bmap"
70234 + */
70235 +
70236 +/*
70237 + * start iteration over all blocks allocated for the host file
70238 + */
70239 +static void * bmap_start(struct seq_file *m, loff_t *pos)
70240 +{
70241 +       struct inode *host;
70242 +
70243 +       host = get_seq_pseudo_host(m);
70244 +       if (*pos << host->i_blkbits >= host->i_size)
70245 +               return NULL;
70246 +       else
70247 +               return (void *)((unsigned long)*pos + 1);
70248 +}
70249 +
70250 +/*
70251 + * stop iteration over all blocks allocated for the host file
70252 + */
70253 +static void bmap_stop(struct seq_file *m, void *v)
70254 +{
70255 +}
70256 +
70257 +/*
70258 + * go to the next block in the sequence of blocks allocated for the host
70259 + * file.
70260 + */
70261 +static void * bmap_next(struct seq_file *m, void *v, loff_t *pos)
70262 +{
70263 +       ++ (*pos);
70264 +       return bmap_start(m, pos);
70265 +}
70266 +
70267 +extern int reiser4_lblock_to_blocknr(struct address_space *mapping,
70268 +                                    sector_t lblock, reiser4_block_nr *blocknr);
70269 +
70270 +/*
70271 + * output information about single block number allocated for the host file
70272 + * into user supplied buffer
70273 + */
70274 +static int bmap_show(struct seq_file *m, void *v)
70275 +{
70276 +       sector_t lblock;
70277 +       int result;
70278 +       reiser4_block_nr blocknr;
70279 +
70280 +       lblock = ((sector_t)(unsigned long)v) - 1;
70281 +       result = reiser4_lblock_to_blocknr(get_seq_pseudo_host(m)->i_mapping,
70282 +                                          lblock, &blocknr);
70283 +       if (result == 0) {
70284 +               if (blocknr_is_fake(&blocknr))
70285 +                       seq_printf(m, "%#llx\n", blocknr);
70286 +               else
70287 +                       seq_printf(m, "%llu\n", blocknr);
70288 +       }
70289 +       return result;
70290 +}
70291 +
70292 +/*
70293 + * seq-methods for the "metas/readdir"
70294 + */
70295 +
70296 +/* "cursor" used to iterate over all directory entries for the host file */
70297 +typedef struct readdir_cookie {
70298 +       /* position within the tree */
70299 +       tap_t       tap;
70300 +       /* coord used by ->tap */
70301 +       coord_t     coord;
70302 +       /* lock handle used by ->tap */
70303 +       lock_handle lh;
70304 +} readdir_cookie;
70305 +
70306 +/* true if @coord stores directory entries for @host */
70307 +static int is_host_item(struct inode *host, coord_t *coord)
70308 +{
70309 +       if (item_type_by_coord(coord) != DIR_ENTRY_ITEM_TYPE)
70310 +               return 0;
70311 +       if (!inode_file_plugin(host)->owns_item(host, coord))
70312 +               return 0;
70313 +       return 1;
70314 +}
70315 +
70316 +/* helper function to release resources allocated to iterate over directory
70317 + * entries for the host file */
70318 +static void finish(readdir_cookie *c)
70319 +{
70320 +       if (c != NULL && !IS_ERR(c)) {
70321 +               /* release c->tap->lh long term lock... */
70322 +               tap_done(&c->tap);
70323 +               /* ... and free cursor itself */
70324 +               kfree(c);
70325 +       }
70326 +}
70327 +
70328 +/*
70329 + * start iterating over host directory entries; takes host->i_sem
70330 + */
70331 +static void * readdir_start(struct seq_file *m, loff_t *pos)
70332 +{
70333 +       struct inode   *host;
70334 +       readdir_cookie *c;
70335 +       dir_plugin     *dplug;
70336 +       reiser4_key     dotkey;
70337 +       struct qstr     dotname;
70338 +       int             result;
70339 +       loff_t          entryno;
70340 +
70341 +       /*
70342 +        * first, lookup item containing dot of the host
70343 +        */
70344 +
70345 +       host = get_seq_pseudo_host(m);
70346 +       dplug = inode_dir_plugin(host);
70347 +
70348 +       dotname.name = ".";
70349 +       dotname.len  = 1;
70350 +
70351 +       down(&host->i_sem);
70352 +       if (dplug == NULL || dplug->build_entry_key == NULL) {
70353 +               finish(NULL);
70354 +               return NULL;
70355 +       }
70356 +
70357 +       /* build key of dot */
70358 +       dplug->build_entry_key(host, &dotname, &dotkey);
70359 +
70360 +       /* allocate cursor */
70361 +       c = kmalloc(sizeof *c, GFP_KERNEL);
70362 +       if (c == NULL) {
70363 +               finish(NULL);
70364 +               return ERR_PTR(RETERR(-ENOMEM));
70365 +       }
70366 +
70367 +       /* tree lookup */
70368 +       result = object_lookup(host,
70369 +                              &dotkey,
70370 +                              &c->coord,
70371 +                              &c->lh,
70372 +                              ZNODE_READ_LOCK,
70373 +                              FIND_EXACT,
70374 +                              LEAF_LEVEL,
70375 +                              LEAF_LEVEL,
70376 +                              CBK_READDIR_RA,
70377 +                              NULL);
70378 +
70379 +       tap_init(&c->tap, &c->coord, &c->lh, ZNODE_READ_LOCK);
70380 +       if (result == 0) {
70381 +               /*
70382 +                * ok, now c->tap is positioned at the dot. We are requested
70383 +                * to start readdir from the offset *pos. Skip that number of
70384 +                * entries. That's not very efficient for the large
70385 +                * directories.
70386 +                */
70387 +               result = tap_load(&c->tap);
70388 +               if (result == 0) {
70389 +                       for (entryno = 0; entryno != *pos; ++ entryno) {
70390 +                               result = go_next_unit(&c->tap);
70391 +                               if (result == -E_NO_NEIGHBOR) {
70392 +                                       finish(c);
70393 +                                       return NULL;
70394 +                               }
70395 +                               if (result != 0)
70396 +                                       break;
70397 +                               if (!is_host_item(host, c->tap.coord)) {
70398 +                                       finish(c);
70399 +                                       return NULL;
70400 +                               }
70401 +                       }
70402 +               }
70403 +       }
70404 +       if (result != 0) {
70405 +               finish(c);
70406 +               return ERR_PTR(result);
70407 +       } else
70408 +               return c;
70409 +}
70410 +
70411 +/*
70412 + * stop iterating over directory entries for the host file
70413 + */
70414 +static void readdir_stop(struct seq_file *m, void *v)
70415 +{
70416 +       up(&get_seq_pseudo_host(m)->i_sem);
70417 +       finish(v);
70418 +}
70419 +
70420 +/*
70421 + * go to the next entry in the host directory
70422 + */
70423 +static void * readdir_next(struct seq_file *m, void *v, loff_t *pos)
70424 +{
70425 +       readdir_cookie *c;
70426 +       struct inode   *host;
70427 +       int result;
70428 +
70429 +       c = v;
70430 +       ++ (*pos);
70431 +       host = get_seq_pseudo_host(m);
70432 +       /* next entry is in the next unit */
70433 +       result = go_next_unit(&c->tap);
70434 +       if (result == 0) {
70435 +               /* check whether end of the directory was reached. */
70436 +               if (!is_host_item(host, c->tap.coord)) {
70437 +                       finish(c);
70438 +                       return NULL;
70439 +               } else
70440 +                       return v;
70441 +       } else {
70442 +               finish(c);
70443 +               return ERR_PTR(result);
70444 +       }
70445 +}
70446 +
70447 +/*
70448 + * output information about single directory entry in the host directory
70449 + */
70450 +static int readdir_show(struct seq_file *m, void *v)
70451 +{
70452 +       readdir_cookie *c;
70453 +       item_plugin *iplug;
70454 +       char *name;
70455 +       char buf[DE_NAME_BUF_LEN];
70456 +
70457 +       c = v;
70458 +       iplug = item_plugin_by_coord(&c->coord);
70459 +
70460 +       name = iplug->s.dir.extract_name(&c->coord, buf);
70461 +       assert("nikita-3221", name != NULL);
70462 +       /* entries are separated by the "/" in the user buffer, because this
70463 +        * is the only symbol (besides NUL) that is not allowed in file
70464 +        * names. */
70465 +       seq_printf(m, "%s/", name);
70466 +       return 0;
70467 +}
70468 +
70469 +/*
70470 + * methods for "metas/plugin"
70471 + */
70472 +
70473 +/*
70474 + * entry in the table mapping plugin pseudo file name to the corresponding
70475 + * pset member.
70476 + */
70477 +typedef struct plugin_entry {
70478 +       const char *name;
70479 +       pset_member memb;
70480 +} plugin_entry;
70481 +
70482 +/* initializer for plugin_entry */
70483 +#define PLUGIN_ENTRY(field, ind)               \
70484 +{                                              \
70485 +       .name = #field,                         \
70486 +       .memb = ind                             \
70487 +}
70488 +
70489 +#define PSEUDO_ARRAY_ENTRY(idx, aname)         \
70490 +[idx] = {                                      \
70491 +       .name = aname                           \
70492 +}
70493 +
70494 +/* array defining files available under "metas/plugin". show_plugin() and
70495 + * set_plugin() select an entry by the datum stored in the pseudo inode; the
70496 + * entry with NULL ->name marks the end for array_lookup_pseudo(). */
70497 +static plugin_entry pentry[] = {
70498 +       /* "a/metas/plugin/file" corresponds to the PSET_FILE plugin of its
70499 +        * host file (that is, "a"), etc. */
70500 +       PLUGIN_ENTRY(file, PSET_FILE),
70501 +       PLUGIN_ENTRY(dir, PSET_DIR),
70502 +       PLUGIN_ENTRY(perm, PSET_PERM),
70503 +       PLUGIN_ENTRY(formatting, PSET_FORMATTING),
70504 +       PLUGIN_ENTRY(hash, PSET_HASH),
70505 +       PLUGIN_ENTRY(fibration, PSET_FIBRATION),
70506 +       PLUGIN_ENTRY(sd, PSET_SD),
70507 +       PLUGIN_ENTRY(dir_item, PSET_DIR_ITEM),
70508 +       PLUGIN_ENTRY(crypto, PSET_CRYPTO),
70509 +       PLUGIN_ENTRY(digest, PSET_DIGEST),
70510 +       PLUGIN_ENTRY(compression, PSET_COMPRESSION),
70511 +       {
70512 +               .name = NULL, /* end-of-array marker */
70513 +       }
70514 +};
70515 +
70516 +/* enumeration of files available under "a/metas/plugin/foo". The values
70517 + * are used as array indices by the initializers of fentry[], and are matched
70518 + * against the inode datum in show_plugin_field(). */
70519 +typedef enum {
70520 +       PFIELD_TYPEID, /* "a/metas/plugin/foo/type_id" contains type id of the
70521 +                       * plugin foo */
70522 +       PFIELD_ID,     /* "a/metas/plugin/foo/id" contains id of the plugin
70523 +                       * foo */
70524 +       PFIELD_LABEL,  /* "a/metas/plugin/foo/label" contains label of the
70525 +                       * plugin foo */
70526 +       PFIELD_DESC    /* "a/metas/plugin/foo/desc" contains description of
70527 +                       * the plugin foo */
70528 +} plugin_field;
70529 +
70530 +/* names of pseudo files under "a/metas/plugin/foo", indexed by plugin_field */
70531 +static plugin_entry fentry[] = {
70532 +       PSEUDO_ARRAY_ENTRY(PFIELD_TYPEID, "type_id"),
70533 +       PSEUDO_ARRAY_ENTRY(PFIELD_ID, "id"),
70534 +       PSEUDO_ARRAY_ENTRY(PFIELD_LABEL, "label"),
70535 +       PSEUDO_ARRAY_ENTRY(PFIELD_DESC, "desc"),
70536 +       {
70537 +               .name   = NULL /* end of array for array_lookup_pseudo() */
70538 +       },
70539 +};
70540 +
70541 +/* read method for "a/metas/plugin/foo": print "id label desc" of the plugin
70542 + * that the host file has in the pset slot selected by this pseudo file.
70543 + * @cookie is not used. */
70544 +static int show_plugin(struct seq_file *seq, void *cookie)
70545 +{
70546 +       struct inode   *self;    /* pseudo file being read */
70547 +       struct inode   *owner;   /* host file it belongs to */
70548 +       reiser4_plugin *found;
70549 +       int             slot;
70550 +
70551 +       self  = ((struct file *)seq->private)->f_dentry->d_inode;
70552 +       owner = get_inode_host(self);
70553 +
70554 +       /* datum of the pseudo inode is an index into pentry[] */
70555 +       slot  = reiser4_inode_data(self)->file_plugin_data.pseudo_info.datum;
70556 +       found = pset_get(reiser4_inode_data(owner)->pset, pentry[slot].memb);
70557 +
70558 +       /* nothing is printed when pset_get() returns NULL */
70559 +       if (found == NULL)
70560 +               return 0;
70561 +
70562 +       seq_printf(seq, "%i %s %s",
70563 +                  found->h.id, found->h.label, found->h.desc);
70564 +       return 0;
70565 +}
70566 +
70567 +/*
70568 + * write method for "a/metas/plugin/foo": @buf holds a plugin label supplied
70569 + * by the user. If lookup_plugin() finds a plugin for it, the plugin is
70570 + * installed into the host file with force_plugin() and stat-data is updated.
70571 + * Returns 0 on success, -ENOENT when lookup_plugin() finds nothing, or the
70572 + * error returned by one of the update steps.
70573 + */
70574 +static int set_plugin(struct file *file, const char *buf)
70575 +{
70576 +       struct inode   *host;
70577 +       struct inode   *inode;
70578 +       reiser4_plugin *plug;
70579 +       plugin_entry   *entry;
70580 +       int             idx;
70581 +       int             result;
70582 +       reiser4_context ctx;
70583 +
70584 +       inode = file->f_dentry->d_inode;
70585 +       init_context(&ctx, inode->i_sb);
70586 +
70587 +       host  = get_inode_host(inode);
70588 +       /* datum of the pseudo inode selects the entry in pentry[] */
70589 +       idx   = reiser4_inode_data(inode)->file_plugin_data.pseudo_info.datum;
70590 +       entry = &pentry[idx];
70591 +
70592 +       plug = lookup_plugin(entry->name, buf);
70593 +       if (plug == NULL)
70594 +               result = RETERR(-ENOENT);
70595 +       else
70596 +               result = force_plugin(host, entry->memb, plug);
70597 +       if (plug != NULL && result == 0) {
70598 +               __u64 tograb;
70599 +
70600 +               /* plugin was updated successfully: grab the space that
70601 +                * the stat-data update needs and mark host dirty */
70602 +               tograb = inode_file_plugin(host)->estimate.update(host);
70603 +               result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
70604 +               if (result == 0)
70605 +                       result = reiser4_mark_inode_dirty(host);
70606 +       }
70607 +       context_set_commit_async(&ctx);
70608 +       reiser4_exit_context(&ctx);
70609 +       return result;
70610 +}
70611 +
70612 +/*
70613 + * helper function to implement ->lookup() method of pseudo directory plugin
70614 + * for the file that contains multiple similar children pseudo files.
70615 + *
70616 + * For example, "a/metas/plugin/" directory contains files for each plugin
70617 + * associated with the host file "a". Handling of read/write for these files is
70618 + * exactly the same, the only difference being the pset member id for the
70619 + * corresponding plugin. Similarly, "a/metas/plugin/foo/" itself contains
70620 + * files that are used to provide user access to the corresponding fields of
70621 + * the "foo" plugin, and all such fields can be handled similarly (see
70622 + * show_plugin_field())
70623 + *
70624 + * To avoid code duplication in such situation, an array is constructed that
70625 + * is used as a map from the name of "child" object to the corresponding
70626 + * "datum". All child objects are handled by the same pseudo plugin, and are
70627 + * differentiated by the datum installed into pseudo file inode.
70628 + */
70629 +static int array_lookup_pseudo(struct inode *parent, struct dentry ** dentry,
70630 +                              plugin_entry *array, pseudo_plugin *pplug)
70631 +{
70632 +       struct inode *child;
70633 +       int           pos;
70634 +
70635 +       /* search the NULL-terminated @array for the name in @dentry */
70636 +       pos = 0;
70637 +       while (array[pos].name != NULL &&
70638 +              strcmp((*dentry)->d_name.name, array[pos].name) != 0)
70639 +               ++ pos;
70640 +
70641 +       /* end of @array reached: there is no child with such name */
70642 +       if (array[pos].name == NULL)
70643 +               return PTR_ERR(ERR_PTR(-ENOENT));
70644 +
70645 +       child = add_pseudo(parent, pplug, dentry);
70646 +       if (IS_ERR(child))
70647 +               return PTR_ERR(child);
70648 +
70649 +       /* name was found: its position in @array becomes datum of inode */
70650 +       pseudo_set_datum(child, pos);
70651 +       return 0;
70652 +}
70653 +
70654 +/*
70655 + * helper implementing ->readdir() for a pseudo directory whose children are
70656 + * listed in @array, see array_lookup_pseudo(). Position 0 is ".", 1 is "..",
70657 + * N >= 2 is array[N - 2]. Always returns 0; f->f_pos records progress.
70658 + */
70659 +static int array_readdir_pseudo(struct file *f, void *dirent, filldir_t filld,
70660 +                               plugin_entry *array, int size)
70661 +{
70662 +       loff_t off;
70663 +       ino_t  ino;
70664 +
70665 +       off = f->f_pos;
70666 +       /* nothing to emit outside of [0, @size]. The upper bound also keeps
70667 +        * a huge f_pos from being truncated to 0 or 1 by the cast below. */
70668 +       if (off < 0 || off > size)
70669 +               return 0;
70670 +       /* for god's sake, why switch(loff_t) requires __cmpdi2? */
70671 +       switch ((int)off) {
70672 +       case 0:
70673 +               ino = f->f_dentry->d_inode->i_ino;
70674 +               if (filld(dirent, ".", 1, off, ino, DT_DIR) < 0)
70675 +                       break;
70676 +               ++ off;
70677 +               /* fallthrough */
70678 +       case 1:
70679 +               ino = parent_ino(f->f_dentry);
70680 +               if (filld(dirent, "..", 2, off, ino, DT_DIR) < 0)
70681 +                       break;
70682 +               ++ off;
70683 +               /* fallthrough */
70684 +       default:
70685 +               /* scan array for the names */
70686 +               for (; off < size + 1; ++ off) {
70687 +                       const char *name = array[off - 2].name;
70688 +
70689 +                       if (filld(dirent, name, strlen(name),
70690 +                                 off, off + (long)f, DT_REG) < 0)
70691 +                               break;
70692 +               }
70693 +       }
70694 +       f->f_pos = off;
70695 +       return 0;
70696 +}
70697 +
70698 +
70699 +/*
70700 + * ->lookup() method for the "a/metas/plugin/foo/" directory: children are
70701 + * looked up in fentry[], see the comment for array_lookup_pseudo().
70702 + */
70703 +static int lookup_plugin_field(struct inode *parent, struct dentry ** dentry)
70704 +{
70705 +       pseudo_plugin *field_plug = pseudo_plugin_by_id(PSEUDO_PLUGIN_FIELD_ID);
70706 +
70707 +       return array_lookup_pseudo(parent, dentry, fentry, field_plug);
70708 +}
70709 +
70710 +/*
70711 + * read method for "a/metas/plugin/foo/field": print one field of the header
70712 + * of the plugin that the host file has in the pset slot "foo". Nothing is
70713 + * printed if pset_get() returns NULL, or if datum of the inode is not one
70714 + * of plugin_field values. @cookie is not used.
70715 + */
70716 +static int show_plugin_field(struct seq_file *seq, void *cookie)
70717 +{
70718 +       struct file    *file;
70719 +       struct inode   *self;    /* "a/metas/plugin/foo/field" */
70720 +       struct inode   *dir;     /* "a/metas/plugin/foo" */
70721 +       struct inode   *owner;   /* "a" */
70722 +       reiser4_plugin *found;
70723 +       int             slot;    /* index into pentry[] */
70724 +       int             field;   /* compared against plugin_field values */
70725 +
70726 +       file = seq->private;
70727 +       self = file->f_dentry->d_inode;
70728 +
70729 +       dir   = get_inode_parent(self);
70730 +       owner = get_inode_host(self);
70731 +
70732 +       /* datum of the parent tells which plugin, datum of this inode tells
70733 +        * which of its fields */
70734 +       slot  = reiser4_inode_data(dir)->file_plugin_data.pseudo_info.datum;
70735 +       field = reiser4_inode_data(self)->file_plugin_data.pseudo_info.datum;
70736 +       found = pset_get(reiser4_inode_data(owner)->pset, pentry[slot].memb);
70737 +
70738 +       /* pset_get() found no plugin for that member: print nothing */
70739 +       if (found == NULL)
70740 +               return 0;
70741 +
70742 +       /* all fields are taken from the ->h part of the plugin */
70743 +       if (field == PFIELD_TYPEID)
70744 +               seq_printf(seq, "%i", found->h.type_id);
70745 +       else if (field == PFIELD_ID)
70746 +               seq_printf(seq, "%i", found->h.id);
70747 +       else if (field == PFIELD_LABEL)
70748 +               seq_printf(seq, "%s", found->h.label);
70749 +       else if (field == PFIELD_DESC)
70750 +               seq_printf(seq, "%s", found->h.desc);
70751 +
70752 +       /* any other value of datum prints nothing */
70753 +       return 0;
70754 +}
70755 +
70756 +/*
70757 + * ->readdir() method for "a/metas/plugin/foo/": lists names kept in fentry[]
70758 + */
70759 +static int readdir_plugin_field(struct file *f, void *dirent, filldir_t filld)
70760 +{
70761 +       int nr_slots = sizeof_array(fentry);
70762 +
70763 +       return array_readdir_pseudo(f, dirent, filld, fentry, nr_slots);
70764 +}
70765 +
70766 +/*
70767 + * ->lookup() method for the "a/metas/plugin/" directory: children are looked
70768 + * up in pentry[], see the comment for array_lookup_pseudo().
70769 + */
70770 +static int lookup_plugins(struct inode *parent, struct dentry ** dentry)
70771 +{
70772 +       pseudo_plugin *child_plug = pseudo_plugin_by_id(PSEUDO_PLUGIN_ID);
70773 +
70774 +       return array_lookup_pseudo(parent, dentry, pentry, child_plug);
70775 +}
70776 +
70777 +/*
70778 + * ->readdir() method for "a/metas/plugin/": lists the names kept in pentry[]
70779 + */
70780 +static int readdir_plugins(struct file *f, void *dirent, filldir_t filld)
70781 +{
70782 +       int nr_slots = sizeof_array(pentry);
70783 +
70784 +       return array_readdir_pseudo(f, dirent, filld, pentry, nr_slots);
70785 +}
70786 +
70787 +/*
70788 + * seq-methods for the "a/metas/items"
70789 + */
70790 +
70791 +/*
70792 + * start iteration over the items of the host file: find the first item,
70793 + * then scan forward to the *pos-th one. Uses the same cursor (readdir_cookie)
70794 + * as the readdir iterator above.
70795 + *
70796 + * Returns the cursor, NULL when there is nothing to show at *pos, or
70797 + * ERR_PTR() on failure. i_sem of the host is left held in all cases.
70798 + */
70799 +static void * items_start(struct seq_file *m, loff_t *pos)
70800 +{
70801 +       struct inode   *host;
70802 +       readdir_cookie *cursor;
70803 +       file_plugin    *fplug;
70804 +       reiser4_key     first;
70805 +       int             ret;
70806 +       loff_t          skipped;
70807 +
70808 +       host  = get_seq_pseudo_host(m);
70809 +       fplug = inode_file_plugin(host);
70810 +
70811 +       /* not released in this function; items_stop() does up() on it */
70812 +       down(&host->i_sem);
70813 +       if (fplug->key_by_inode == NULL) {
70814 +               /* file plugin has no method to build a key */
70815 +               finish(NULL);
70816 +               return NULL;
70817 +       }
70818 +
70819 +       /* construct a key of the first item */
70820 +       fplug->key_by_inode(host, 0, &first);
70821 +
70822 +       /* allocate the cursor */
70823 +       cursor = kmalloc(sizeof *cursor, GFP_KERNEL);
70824 +       if (cursor == NULL) {
70825 +               finish(NULL);
70826 +               return ERR_PTR(RETERR(-ENOMEM));
70827 +       }
70828 +
70829 +       /* find first item */
70830 +       ret = object_lookup(host,
70831 +                           &first,
70832 +                           &cursor->coord, &cursor->lh,
70833 +                           ZNODE_READ_LOCK, FIND_MAX_NOT_MORE_THAN,
70834 +                           TWIG_LEVEL, LEAF_LEVEL,
70835 +                           0, NULL);
70836 +
70837 +       /* the tap is initialized whether or not the lookup succeeded, but
70838 +        * it is loaded only after a successful lookup */
70839 +       tap_init(&cursor->tap, &cursor->coord, &cursor->lh, ZNODE_READ_LOCK);
70840 +       if (ret == 0)
70841 +               ret = tap_load(&cursor->tap);
70842 +
70843 +       /*
70844 +        * skip *pos items. The sequence is over (NULL is returned) as soon
70845 +        * as there is no next unit, or the next unit is not owned by the
70846 +        * host. Any other failure of go_next_unit() ends the loop and is
70847 +        * reported below.
70848 +        */
70849 +       for (skipped = 0; ret == 0 && skipped != *pos; ++ skipped) {
70850 +               /* one step forward */
70851 +               ret = go_next_unit(&cursor->tap);
70852 +               if (ret == -E_NO_NEIGHBOR ||
70853 +                   (ret == 0 && !fplug->owns_item(host, cursor->tap.coord))) {
70854 +                       finish(cursor);
70855 +                       return NULL;
70856 +               }
70857 +       }
70858 +
70859 +       /* lookup, tap_load() or go_next_unit() failed */
70860 +       if (ret != 0) {
70861 +               finish(cursor);
70862 +               return ERR_PTR(ret);
70863 +       }
70864 +       return cursor;
70865 +}
70866 +
70867 +/* stop iteration over items of the host file: up() i_sem, finish() cursor */
70868 +static void items_stop(struct seq_file *m, void *v)
70869 +{
70870 +       struct inode *host = get_seq_pseudo_host(m);
70871 +
70872 +       up(&host->i_sem);
70873 +       finish(v);
70874 +}
70875 +
70876 +/* go to the next item in the host file and advance *pos. Returns @v, NULL at
70877 + * the end of the sequence, or ERR_PTR(); the cursor is passed to finish()
70878 + * unless @v is returned. */
70879 +static void * items_next(struct seq_file *m, void *v, loff_t *pos)
70880 +{
70881 +       readdir_cookie *c = v;
70882 +       struct inode   *host = get_seq_pseudo_host(m);
70883 +       int             result;
70884 +
70885 +       ++ (*pos);
70886 +       result = go_next_unit(&c->tap);
70887 +       if (result == 0 &&
70888 +           inode_file_plugin(host)->owns_item(host, c->tap.coord))
70889 +               return v;
70890 +
70891 +       /* sequence is over: the next item belongs to another file, there is
70892 +        * no next item at all (as in items_start()), or an error occurred */
70893 +       finish(c);
70894 +       if (result == 0 || result == -E_NO_NEIGHBOR)
70895 +               return NULL;
70896 +       return ERR_PTR(result);
70897 +}
70898 +
70899 +/* output one line about the item at cursor @v: key of the unit, label of the
70900 + * item plugin, then whatever ->b.show() of that plugin prints, if defined */
70901 +static int items_show(struct seq_file *m, void *v)
70902 +{
70903 +       readdir_cookie *cursor = v;
70904 +       item_plugin    *iplug;
70905 +       reiser4_key     key;
70906 +       char            keytext[KEY_BUF_LEN];
70907 +
70908 +       iplug = item_plugin_by_coord(&cursor->coord);
70909 +
70910 +       /* textual form of the unit key goes first, followed by the label */
70911 +       sprintf_key(keytext, unit_key_by_coord(&cursor->coord, &key));
70912 +       seq_printf(m, "%s %s ", keytext, iplug->h.label);
70913 +
70914 +       /* the rest of the line is up to the item plugin; ->b.show() is
70915 +        * optional */
70916 +       if (iplug->b.show != NULL)
70917 +               iplug->b.show(m, &cursor->coord);
70918 +
70919 +       seq_printf(m, "\n");
70920 +       return 0;
70921 +}
70922 +
70923 +extern int /* not defined in this part of the file; called by get_new() */
70924 +invoke_create_method(struct inode *, struct dentry *,
70925 +                    reiser4_object_create_data *);
70926 +
70927 +/*
70928 + * write method for the "a/metas/new" file. @buf is a file name supplied by
70929 + * the user; a regular file with that name is created within the host file
70930 + * (that is better to be a directory).
70931 + *
70932 + * Returns result of invoke_create_method(), -EINVAL if the name is empty or
70933 + * contains a slash, -ENOMEM if dentry cannot be allocated.
70934 + */
70935 +static int get_new(struct file *file, const char *buf)
70936 +{
70937 +       struct dentry *d;
70938 +       struct qstr    name;
70939 +       unsigned long  hash;
70940 +       const char    *scan;
70941 +       int            result;
70942 +       reiser4_object_create_data data;
70943 +
70944 +       /* check that @buf contains no slashes. The empty name is refused
70945 +        * as well: no file can be created under it, and there is not a
70946 +        * single byte in it to be hashed. */
70947 +       if (*buf == '\0' || strchr(buf, '/') != NULL)
70948 +               return RETERR(-EINVAL);
70949 +
70950 +       /* every field of @data other than mode and id is left zero */
70951 +       xmemset(&data, 0, sizeof data);
70952 +       data.mode = S_IFREG | 0 /* mode */;
70953 +       data.id = UNIX_FILE_PLUGIN_ID;
70954 +
70955 +       /* build hash of the name, finding its length on the way */
70956 +       hash = init_name_hash();
70957 +       for (scan = buf; *scan != '\0'; ++ scan)
70958 +               hash = partial_name_hash(*(const unsigned char *)scan, hash);
70959 +       name.name = buf;
70960 +       name.len  = scan - buf;
70961 +       name.hash = end_name_hash(hash);
70962 +
70963 +       /* allocate dentry in the grandparent of the pseudo file */
70964 +       d = d_alloc(file->f_dentry->d_parent->d_parent, &name);
70965 +       if (d == NULL)
70966 +               return RETERR(-ENOMEM);
70967 +
70968 +       /* call ->create() method of the host directory */
70969 +       result = invoke_create_method(get_pseudo_host(file), d, &data);
70970 +       reiser4_free_dentry_fsdata(d);
70971 +
70972 +       /* NOTE(review): @d is not dput() here -- confirm that the reference
70973 +        * returned by d_alloc() is dropped elsewhere */
70974 +       return result;
70975 +}
70976 +
70977 +/*
70978 + * initialize pseudo plugins.
70979 + */
70980 +pseudo_plugin pseudo_plugins[LAST_PSEUDO_ID] = {
70981 +       [PSEUDO_METAS_ID] = {
70982 +               .h = {
70983 +                       .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
70984 +                       .id = PSEUDO_METAS_ID,
70985 +                       .pops = NULL,
70986 +                       .label = "metas",
70987 +                       .desc = "meta-files",
70988 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
70989 +               },
70990 +               .parent      = TOP_LEVEL,
70991 +               .try         = try_by_label,
70992 +               .readdirable = 0,
70993 +               .lookup      = lookup_table,
70994 +               .lookup_mode = S_IFDIR | S_IRUGO | S_IXUGO,
70995 +               .read_type   = PSEUDO_READ_NONE,
70996 +               .write_type  = PSEUDO_WRITE_NONE,
70997 +               .readdir     = readdir_table
70998 +       },
70999 +       [PSEUDO_UID_ID] = {
71000 +               .h = {
71001 +                       .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
71002 +                       .id = PSEUDO_UID_ID,
71003 +                       .pops = NULL,
71004 +                       .label = "uid",
71005 +                       .desc = "returns owner",
71006 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
71007 +               },
71008 +               .parent      = PSEUDO_METAS_ID,
71009 +               .try         = try_by_label,
71010 +               .readdirable = 1,
71011 +               .lookup      = NULL,
71012 +               .lookup_mode = S_IFREG | S_IRUGO | S_IWUSR,
71013 +               .read_type   = PSEUDO_READ_SINGLE,
71014 +               .read        = {
71015 +                        .single_show = show_uid
71016 +                },
71017 +               .write_type  = PSEUDO_WRITE_STRING,
71018 +               .write       = {
71019 +                        .gets        = store_uid
71020 +                }
71021 +       },
71022 +       [PSEUDO_GID_ID] = {
71023 +               .h = {
71024 +                       .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
71025 +                       .id = PSEUDO_GID_ID,
71026 +                       .pops = NULL,
71027 +                       .label = "gid",
71028 +                       .desc = "returns group",
71029 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
71030 +               },
71031 +               .parent      = PSEUDO_METAS_ID,
71032 +               .try         = try_by_label,
71033 +               .readdirable = 1,
71034 +               .lookup      = NULL,
71035 +               .lookup_mode = S_IFREG | S_IRUGO | S_IWUSR,
71036 +               .read_type   = PSEUDO_READ_SINGLE,
71037 +               .read        = {
71038 +                        .single_show = show_gid
71039 +                },
71040 +               .write_type  = PSEUDO_WRITE_STRING,
71041 +               .write       = {
71042 +                        .gets        = get_gid
71043 +                }
71044 +       },
71045 +       [PSEUDO_RWX_ID] = {
71046 +               .h = {
71047 +                       .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
71048 +                       .id = PSEUDO_RWX_ID,
71049 +                       .pops = NULL,
71050 +                       .label = "rwx",
71051 +                       .desc = "returns rwx permissions",
71052 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
71053 +               },
71054 +               .parent      = PSEUDO_METAS_ID,
71055 +               .try         = try_by_label,
71056 +               .readdirable = 1,
71057 +               .lookup      = NULL,
71058 +               .lookup_mode = S_IFREG | S_IRUGO | S_IWUSR,
71059 +               .read_type   = PSEUDO_READ_SINGLE,
71060 +               .read        = {
71061 +                        .single_show = show_rwx
71062 +                },
71063 +               .write_type  = PSEUDO_WRITE_STRING,
71064 +               .write       = {
71065 +                        .gets        = get_rwx
71066 +                }
71067 +       },
71068 +       [PSEUDO_OID_ID] = {
71069 +               .h = {
71070 +                       .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
71071 +                       .id = PSEUDO_OID_ID,
71072 +                       .pops = NULL,
71073 +                       .label = "oid",
71074 +                       .desc = "returns object id",
71075 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
71076 +               },
71077 +               .parent      = PSEUDO_METAS_ID,
71078 +               .try         = try_by_label,
71079 +               .readdirable = 1,
71080 +               .lookup      = NULL,
71081 +               .lookup_mode = S_IFREG | S_IRUGO,
71082 +               .read_type   = PSEUDO_READ_SINGLE,
71083 +               .read        = {
71084 +                        .single_show = show_oid
71085 +                },
71086 +               .write_type  = PSEUDO_WRITE_NONE
71087 +       },
71088 +       [PSEUDO_KEY_ID] = {
71089 +               .h = {
71090 +                       .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
71091 +                       .id = PSEUDO_KEY_ID,
71092 +                       .pops = NULL,
71093 +                       .label = "key",
71094 +                       .desc = "returns object's key",
71095 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
71096 +               },
71097 +               .parent      = PSEUDO_METAS_ID,
71098 +               .try         = try_by_label,
71099 +               .readdirable = 1,
71100 +               .lookup      = NULL,
71101 +               .lookup_mode = S_IFREG | S_IRUGO,
71102 +               .read_type   = PSEUDO_READ_SINGLE,
71103 +               .read        = {
71104 +                        .single_show = show_key
71105 +                },
71106 +               .write_type  = PSEUDO_WRITE_NONE
71107 +       },
71108 +       [PSEUDO_SIZE_ID] = {
71109 +               .h = {
71110 +                       .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
71111 +                       .id = PSEUDO_SIZE_ID,
71112 +                       .pops = NULL,
71113 +                       .label = "size",
71114 +                       .desc = "returns object's size",
71115 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
71116 +               },
71117 +               .parent      = PSEUDO_METAS_ID,
71118 +               .try         = try_by_label,
71119 +               .readdirable = 1,
71120 +               .lookup      = NULL,
71121 +               .lookup_mode = S_IFREG | S_IRUGO,
71122 +               .read_type   = PSEUDO_READ_SINGLE,
71123 +               .read        = {
71124 +                        .single_show = show_size
71125 +                },
71126 +               .write_type  = PSEUDO_WRITE_NONE
71127 +       },
71128 +       [PSEUDO_NLINK_ID] = {
71129 +               .h = {
71130 +                       .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
71131 +                       .id = PSEUDO_NLINK_ID,
71132 +                       .pops = NULL,
71133 +                       .label = "nlink",
71134 +                       .desc = "returns nlink count",
71135 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
71136 +               },
71137 +               .parent      = PSEUDO_METAS_ID,
71138 +               .try         = try_by_label,
71139 +               .readdirable = 1,
71140 +               .lookup      = NULL,
71141 +               .lookup_mode = S_IFREG | S_IRUGO,
71142 +               .read_type   = PSEUDO_READ_SINGLE,
71143 +               .read        = {
71144 +                        .single_show = show_nlink
71145 +                },
71146 +               .write_type  = PSEUDO_WRITE_NONE
71147 +       },
71148 +       [PSEUDO_LOCALITY_ID] = {
71149 +               .h = {
71150 +                       .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
71151 +                       .id = PSEUDO_LOCALITY_ID,
71152 +                       .pops = NULL,
71153 +                       .label = "locality",
71154 +                       .desc = "returns object's locality",
71155 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
71156 +               },
71157 +               .parent      = PSEUDO_METAS_ID,
71158 +               .try         = try_by_label,
71159 +               .readdirable = 1,
71160 +               .lookup      = NULL,
71161 +               .lookup_mode = S_IFREG | S_IRUGO,
71162 +               .read_type   = PSEUDO_READ_SINGLE,
71163 +               .read        = {
71164 +                        .single_show = show_locality
71165 +                },
71166 +               .write_type  = PSEUDO_WRITE_NONE
71167 +       },
71168 +       [PSEUDO_PSEUDOS_ID] = {
71169 +               .h = {
71170 +                       .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
71171 +                       .id = PSEUDO_PSEUDOS_ID,
71172 +                       .pops = NULL,
71173 +                       .label = "pseudo",
71174 +                       .desc = "returns a list of pseudo files",
71175 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
71176 +               },
71177 +               .parent      = PSEUDO_METAS_ID,
71178 +               .try         = try_by_label,
71179 +               .readdirable = 1,
71180 +               .lookup      = NULL,
71181 +               .lookup_mode = S_IFREG | S_IRUGO,
71182 +               .read_type   = PSEUDO_READ_SEQ,
71183 +               .read        = {
71184 +                        .ops = {
71185 +                                .start = pseudos_start,
71186 +                                .stop  = pseudos_stop,
71187 +                                .next  = pseudos_next,
71188 +                                .show  = pseudos_show
71189 +                        }
71190 +                },
71191 +               .write_type  = PSEUDO_WRITE_NONE
71192 +       },
71193 +       [PSEUDO_BMAP_ID] = {
71194 +               .h = {
71195 +                       .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
71196 +                       .id = PSEUDO_BMAP_ID,
71197 +                       .pops = NULL,
71198 +                       .label = "bmap",
71199 +                       .desc = "returns a list blocks for this file",
71200 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
71201 +               },
71202 +               .parent      = PSEUDO_METAS_ID,
71203 +               .try         = try_by_label,
71204 +               .readdirable = 1,
71205 +               .lookup      = NULL,
71206 +               .lookup_mode = S_IFREG | S_IRUGO,
71207 +               .read_type   = PSEUDO_READ_SEQ,
71208 +               .read        = {
71209 +                        .ops = {
71210 +                                .start = bmap_start,
71211 +                                .stop  = bmap_stop,
71212 +                                .next  = bmap_next,
71213 +                                .show  = bmap_show
71214 +                        }
71215 +                },
71216 +               .write_type  = PSEUDO_WRITE_NONE
71217 +       },
71218 +       [PSEUDO_READDIR_ID] = {
71219 +               .h = {
71220 +                       .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
71221 +                       .id = PSEUDO_READDIR_ID,
71222 +                       .pops = NULL,
71223 +                       .label = "readdir",
71224 +                       .desc = "returns a list of names in the dir",
71225 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
71226 +               },
71227 +               .parent      = PSEUDO_METAS_ID,
71228 +               .try         = try_by_label,
71229 +               .readdirable = 1,
71230 +               .lookup      = NULL,
71231 +               .lookup_mode = S_IFREG | S_IRUGO,
71232 +               .read_type   = PSEUDO_READ_SEQ,
71233 +               .read        = {
71234 +                        .ops = {
71235 +                                .start = readdir_start,
71236 +                                .stop  = readdir_stop,
71237 +                                .next  = readdir_next,
71238 +                                .show  = readdir_show
71239 +                        }
71240 +                },
71241 +               .write_type  = PSEUDO_WRITE_NONE
71242 +       },
71243 +       [PSEUDO_PLUGIN_ID] = {
71244 +               .h = {
71245 +                       .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
71246 +                       .id = PSEUDO_PLUGIN_ID,
71247 +                       .pops = NULL,
71248 +                       .label = "plugin",
71249 +                       .desc = "plugin",
71250 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
71251 +               },
71252 +               .parent      = PSEUDO_PLUGINS_ID,
71253 +               .try         = NULL,
71254 +               .readdirable = 0,
71255 +               .lookup      = lookup_plugin_field,
71256 +               /*
71257 +                * foo/metas/plugin/bar is much like a directory. So, why
71258 +                * there is no S_IFDIR term in the .lookup_mode, you ask?
71259 +                *
71260 +                * fs/namei.c:may_open():
71261 +                *
71262 +                *     if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE))
71263 +                *         return -EISDIR;
71264 +                *
71265 +                * Directory cannot be opened for write. How smart.
71266 +                */
71267 +               .lookup_mode = S_IFREG | S_IRUGO | S_IWUSR | S_IXUGO,
71268 +               .read_type   = PSEUDO_READ_SINGLE,
71269 +               .read        = {
71270 +                        .single_show = show_plugin
71271 +                },
71272 +               .write_type  = PSEUDO_WRITE_STRING,
71273 +               .write       = {
71274 +                        .gets        = set_plugin
71275 +                },
71276 +               .readdir     = readdir_plugin_field
71277 +       },
71278 +       [PSEUDO_PLUGINS_ID] = {
71279 +               .h = {
71280 +                       .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
71281 +                       .id = PSEUDO_PLUGINS_ID,
71282 +                       .pops = NULL,
71283 +                       .label = "plugin",
71284 +                       .desc = "list of plugins",
71285 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
71286 +               },
71287 +               .parent      = PSEUDO_METAS_ID,
71288 +               .try         = try_by_label,
71289 +               .readdirable = 1,
71290 +               .lookup      = lookup_plugins,
71291 +               .lookup_mode = S_IFDIR | S_IRUGO | S_IXUGO,
71292 +               .read_type   = PSEUDO_READ_NONE,
71293 +               .write_type  = PSEUDO_WRITE_NONE,
71294 +               .readdir     = readdir_plugins
71295 +       },
71296 +       [PSEUDO_PLUGIN_FIELD_ID] = {
71297 +               .h = {
71298 +                       .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
71299 +                       .id = PSEUDO_PLUGIN_ID,
71300 +                       .pops = NULL,
71301 +                       .label = "plugin-field",
71302 +                       .desc = "plugin field",
71303 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
71304 +               },
71305 +               .parent      = PSEUDO_PLUGIN_ID,
71306 +               .try         = NULL,
71307 +               .readdirable = 0,
71308 +               .lookup      = NULL,
71309 +               .lookup_mode = S_IFREG | S_IRUGO,
71310 +               .read_type   = PSEUDO_READ_SINGLE,
71311 +               .read        = {
71312 +                        .single_show = show_plugin_field
71313 +                },
71314 +               .write_type  = PSEUDO_WRITE_NONE,
71315 +               .readdir     = NULL
71316 +       },
71317 +       [PSEUDO_ITEMS_ID] = {
71318 +               .h = {
71319 +                       .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
71320 +                       .id = PSEUDO_ITEMS_ID,
71321 +                       .pops = NULL,
71322 +                       .label = "items",
71323 +                       .desc = "returns a list of items for this file",
71324 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
71325 +               },
71326 +               .parent      = PSEUDO_METAS_ID,
71327 +               .try         = try_by_label,
71328 +               .readdirable = 1,
71329 +               .lookup      = NULL,
71330 +               .lookup_mode = S_IFREG | S_IRUGO,
71331 +               .read_type   = PSEUDO_READ_SEQ,
71332 +               .read        = {
71333 +                        .ops = {
71334 +                                .start = items_start,
71335 +                                .stop  = items_stop,
71336 +                                .next  = items_next,
71337 +                                .show  = items_show
71338 +                        }
71339 +                },
71340 +               .write_type  = PSEUDO_WRITE_NONE
71341 +       },
71342 +       [PSEUDO_NEW_ID] = {
71343 +               .h = {
71344 +                       .type_id = REISER4_PSEUDO_PLUGIN_TYPE,
71345 +                       .id = PSEUDO_NEW_ID,
71346 +                       .pops = NULL,
71347 +                       .label = "new",
71348 +                       .desc = "creates new file in the host",
71349 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
71350 +               },
71351 +               .parent      = PSEUDO_METAS_ID,
71352 +               .try         = try_by_label,
71353 +               .readdirable = 1,
71354 +               .lookup      = NULL,
71355 +               .lookup_mode = S_IFREG | S_IWUSR,
71356 +               .read_type   = PSEUDO_READ_NONE,
71357 +               .read        = {
71358 +                        .single_show = show_rwx
71359 +                },
71360 +               .write_type  = PSEUDO_WRITE_STRING,
71361 +               .write       = {
71362 +                        .gets        = get_new
71363 +                }
71364 +       },
71365 +};
71366 +
71367 +/* Make Linus happy.
71368 +   Local variables:
71369 +   c-indentation-style: "K&R"
71370 +   mode-name: "LC"
71371 +   c-basic-offset: 8
71372 +   tab-width: 8
71373 +   fill-column: 120
71374 +   scroll-step: 1
71375 +   End:
71376 +*/
71377 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/pseudo/pseudo.h linux-2.6.8-rc3-a/fs/reiser4/plugin/pseudo/pseudo.h
71378 --- linux-2.6.8-rc3/fs/reiser4/plugin/pseudo/pseudo.h   1970-01-01 03:00:00.000000000 +0300
71379 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/pseudo/pseudo.h 2004-08-05 21:20:53.271620783 +0400
71380 @@ -0,0 +1,176 @@
71381 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71382 + * reiser4/README */
71383 +
71384 +/* Handling of "pseudo" files representing unified access to meta data in
71385 +   reiser4. See pseudo.c for more comments. */
71386 +
71387 +#if !defined( __REISER4_PSEUDO_H__ )
71388 +#define __REISER4_PSEUDO_H__
71389 +
71390 +#include "../plugin_header.h"
71391 +#include "../../key.h"
71392 +
71393 +#include <linux/fs.h>
71394 +#include <linux/seq_file.h>
71395 +
71396 +/*
71397 + * tag used by wrappers in plugin/file/pseudo.c to perform actions for the
71398 + * particular flavor of pseudo file.
71399 + */
71400 +typedef enum {
71401 +       /* this pseudo file cannot be read */
71402 +       PSEUDO_READ_NONE,
71403 +       /* this pseudo file uses seq_* functions (fs/seq_file.c) to generate
71404 +        * its content */
71405 +       PSEUDO_READ_SEQ,
71406 +       /* this pseudo file contains a single value */
71407 +       PSEUDO_READ_SINGLE,
71408 +       /* this pseudo file has some special ->read() method that should be
71409 +        * called */
71410 +       PSEUDO_READ_FORWARD
71411 +} pseudo_read_type;
71412 +
71413 +typedef enum {
71414 +       /* this pseudo file cannot be written into */
71415 +       PSEUDO_WRITE_NONE,
71416 +       /* writes are served by the ->write.gets() method, which takes a string */
71417 +       PSEUDO_WRITE_STRING,
71418 +       /* this pseudo file has some special ->write() method that should be
71419 +        * called */
71420 +       PSEUDO_WRITE_FORWARD
71421 +} pseudo_write_type;
71422 +
71423 +/* low level operations on the pseudo files.
71424 +
71425 +   Methods from this interface are directly callable by reiser4 system call.
71426 +
71427 +   This operation structure looks suspiciously like yet another plugin
71428 +   type. Doing so would simplify some things. For example, there are already
71429 +   functions to look up plugin by name, etc.
71430 +
71431 +*/
71432 +struct pseudo_plugin;
71433 +typedef struct pseudo_plugin pseudo_plugin;
71434 +struct pseudo_plugin {
71435 +
71436 +       /* common fields */
71437 +       plugin_header h;
71438 +
71439 +       /*
71440 +        * id of plugin of the parent pseudo file in the directory
71441 +        * hierarchy. See comment for
71442 +        * plugin/pseudo/pseudo.c:lookup_of_plugin().
71443 +        */
71444 +       int parent;
71445 +
71446 +       /*
71447 +        * check whether this pseudo file matches name @name within @parent (may be NULL)
71448 +        */
71449 +       int (*try) (pseudo_plugin *pplug,
71450 +                   const struct inode *parent, const char *name);
71451 +       /*
71452 +        * non-zero if this pseudo file is visible in readdir.
71453 +        */
71454 +       int readdirable;
71455 +       /* lookup method applicable to this pseudo file by method name (may be NULL).
71456 +
71457 +          This is for something like "foo/..acl/dup", here "..acl" is the
71458 +          name of a pseudo file, and "dup" is name of an operation (method)
71459 +          applicable to "..acl". Once "..acl" is resolved to ACL object,
71460 +          ->lookup( "dup" ) can be called to get operation.
71461 +
71462 +       */
71463 +       int (*lookup)(struct inode *parent, struct dentry ** dentry);
71464 +
71465 +       /*
71466 +        * file type and rwx bits (e.g. S_IFREG | S_IRUGO) returned by stat(2) for this pseudo file
71467 +        */
71468 +       umode_t lookup_mode;
71469 +
71470 +       /* NOTE-NIKITA some other operations. Reiser4 syntax people should
71471 +          add something here. */
71472 +
71473 +       /*
71474 +        * how content of this pseudo file is generated; selects the valid member of ->read
71475 +        */
71476 +       pseudo_read_type read_type;
71477 +       union {
71478 +               /* for PSEUDO_READ_SEQ */
71479 +               struct seq_operations ops;
71480 +               /* for PSEUDO_READ_SINGLE */
71481 +               int (*single_show) (struct seq_file *, void *);
71482 +               /* for PSEUDO_READ_FORWARD */
71483 +               ssize_t (*read)(struct file *, char __user *, size_t , loff_t *);
71484 +       } read;
71485 +
71486 +       /*
71487 +        * how this pseudo file reacts to write(2) calls; selects the valid member of ->write
71488 +        */
71489 +       pseudo_write_type write_type;
71490 +       union {
71491 +               /* for PSEUDO_WRITE_STRING */
71492 +               int (*gets)(struct file *, const char *);
71493 +               /* for PSEUDO_WRITE_FORWARD */
71494 +               ssize_t (*write)(struct file *,
71495 +                                const char __user *, size_t , loff_t *);
71496 +       } write;
71497 +       /*
71498 +        * ->readdir method (may be NULL)
71499 +        */
71500 +       int (*readdir)(struct file *f, void *dirent, filldir_t filld);
71501 +};
71502 +
71503 +/* portion of reiser4_inode specific for pseudo files */
71504 +typedef struct pseudo_info {
71505 +       /* pseudo file plugin controlling this file */
71506 +       pseudo_plugin *plugin;
71507 +       /* host object: for /etc/passwd/..oid, this is a pointer to the inode
71508 +        * of /etc/passwd */
71509 +       struct inode  *host;
71510 +       /* immediate parent object. This is different from ->host for deeply
71511 +        * nested pseudo files like foo/..plugin/foo */
71512 +       struct inode  *parent;
71513 +       /* opaque value for private use of the pseudo file plugin */
71514 +       unsigned long  datum;
71515 +} pseudo_info_t;
71516 +
71517 +extern int lookup_pseudo_file(struct inode *parent, struct dentry **dentry);
71518 +
71519 +/*
71520 + * ids of pseudo files. See plugin/pseudo/pseudo.c for more details on each
71521 + * particular pseudo file.
71522 + */
71523 +typedef enum {
71524 +       PSEUDO_METAS_ID,
71525 +       PSEUDO_UID_ID,
71526 +       PSEUDO_GID_ID,
71527 +       PSEUDO_RWX_ID,
71528 +       PSEUDO_OID_ID,
71529 +       PSEUDO_KEY_ID,
71530 +       PSEUDO_SIZE_ID,
71531 +       PSEUDO_NLINK_ID,
71532 +       PSEUDO_LOCALITY_ID,
71533 +       PSEUDO_PSEUDOS_ID,
71534 +       PSEUDO_BMAP_ID,
71535 +       PSEUDO_READDIR_ID,
71536 +       PSEUDO_PLUGIN_ID,
71537 +       PSEUDO_PLUGINS_ID,
71538 +       PSEUDO_PLUGIN_FIELD_ID,
71539 +       PSEUDO_ITEMS_ID,
71540 +       PSEUDO_NEW_ID,
71541 +       LAST_PSEUDO_ID  /* not a pseudo file: the number of ids above */
71542 +} reiser4_pseudo_id;
71543 +
71544 +/* __REISER4_PSEUDO_H__ */
71545 +#endif
71546 +
71547 +/* Make Linus happy.
71548 +   Local variables:
71549 +   c-indentation-style: "K&R"
71550 +   mode-name: "LC"
71551 +   c-basic-offset: 8
71552 +   tab-width: 8
71553 +   fill-column: 120
71554 +   scroll-step: 1
71555 +   End:
71556 +*/
71557 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/security/perm.c linux-2.6.8-rc3-a/fs/reiser4/plugin/security/perm.c
71558 --- linux-2.6.8-rc3/fs/reiser4/plugin/security/perm.c   1970-01-01 03:00:00.000000000 +0300
71559 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/security/perm.c 2004-08-05 21:20:52.974683414 +0400
71560 @@ -0,0 +1,76 @@
71561 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
71562 +/* NIKITA-FIXME-HANS: this comment describes what code? */
71563 +/* definition of perm (permission checking) plugins. */
71564 +
71565 +#include "../plugin.h"
71566 +#include "../plugin_header.h"
71567 +#include "../../debug.h"
71568 +
71569 +#include <linux/fs.h>
71570 +#include <linux/dcache.h>      /* for struct dentry */
71571 +#include <linux/quotaops.h>
71572 +
71573 +static int
71574 +mask_ok_common(struct inode *inode, int mask)
71575 +{
71576 +       return vfs_permission(inode, mask); /* ->mask_ok of the "rwx" plugin: delegate to vfs_permission() */
71577 +}
71578 +
71579 +static int
71580 +setattr_ok_common(struct dentry *dentry, struct iattr *attr)
71581 +{
71582 +       struct inode *inode;
71583 +       int ret;
71584 +
71585 +       assert("nikita-2272", dentry != NULL);
71586 +       assert("nikita-2273", attr != NULL);
71587 +
71588 +       inode = dentry->d_inode;
71589 +       assert("nikita-2274", inode != NULL);
71590 +
71591 +       /* generic attribute checks first; their error is returned unchanged */
71592 +       ret = inode_change_ok(inode, attr);
71593 +       if (ret != 0)
71594 +               return ret;
71595 +       /* uid or gid really changes: a DQUOT_TRANSFER() failure becomes -EDQUOT */
71596 +       if ((((attr->ia_valid & ATTR_UID) && attr->ia_uid != inode->i_uid) ||
71597 +            ((attr->ia_valid & ATTR_GID) && attr->ia_gid != inode->i_gid)) &&
71598 +           DQUOT_TRANSFER(inode, attr))
71599 +               ret = -EDQUOT;
71600 +       return ret;
71601 +}
71602 +
71603 +perm_plugin perm_plugins[LAST_PERM_ID] = {
71604 +/* NIKITA-FIXME-HANS: what file contains rwx permissions methods code? */
71605 +       [RWX_PERM_ID] = {
71606 +                        .h = {
71607 +                              .type_id = REISER4_PERM_PLUGIN_TYPE,
71608 +                              .id = RWX_PERM_ID,
71609 +                              .pops = NULL,
71610 +                              .label = "rwx",
71611 +                              .desc = "standard UNIX permissions",
71612 +                              .linkage = TYPE_SAFE_LIST_LINK_ZERO
71613 +                        },
71614 +                        .read_ok = NULL, /* a NULL method is not called by perm_chk(), which then evaluates to 0 */
71615 +                        .write_ok = NULL,
71616 +                        .lookup_ok = NULL,
71617 +                        .create_ok = NULL,
71618 +                        .link_ok = NULL,
71619 +                        .unlink_ok = NULL,
71620 +                        .delete_ok = NULL,
71621 +                        .mask_ok = mask_ok_common, /* vfs_permission() */
71622 +                        .setattr_ok = setattr_ok_common, /* inode_change_ok(), then DQUOT_TRANSFER() on uid/gid change */
71623 +                        .getattr_ok = NULL,
71624 +                        .rename_ok = NULL,
71625 +       },
71626 +};
71627 +
71628 +/* Make Linus happy.
71629 +   Local variables:
71630 +   c-indentation-style: "K&R"
71631 +   mode-name: "LC"
71632 +   c-basic-offset: 8
71633 +   tab-width: 8
71634 +   fill-column: 120
71635 +   End:
71636 +*/
71637 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/security/perm.h linux-2.6.8-rc3-a/fs/reiser4/plugin/security/perm.h
71638 --- linux-2.6.8-rc3/fs/reiser4/plugin/security/perm.h   1970-01-01 03:00:00.000000000 +0300
71639 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/security/perm.h 2004-08-05 21:20:52.937691217 +0400
71640 @@ -0,0 +1,88 @@
71641 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
71642 +
71643 +/* Perm (short for "permissions") plugins common stuff. */
71644 +
71645 +#if !defined( __REISER4_PERM_H__ )
71646 +#define __REISER4_PERM_H__
71647 +
71648 +#include "../../forward.h"
71649 +#include "../plugin_header.h"
71650 +
71651 +#include <linux/types.h>
71652 +#include <linux/fs.h>          /* for struct file  */
71653 +#include <linux/dcache.h>      /* for struct dentry */
71654 +
71655 +/* interface for perm plugin.
71656 +
71657 +   Perm plugin method can be implemented through:
71658 +
71659 +    1. consulting ->i_mode bits in stat data
71660 +
71661 +    2. obtaining acl from the tree and inspecting it
71662 +
71663 +    3. asking some kernel module or user-level program to authorize access.
71664 +
71665 +   This allows for integration with things like capabilities, SELinux-style
71666 +   security contexts, etc.
71667 +
71668 +*/
71669 +/* NIKITA-FIXME-HANS: define what this is targeted for.  It does not seem to be intended for use with sys_reiser4.  Explain. */
71670 +typedef struct perm_plugin {
71671 +       /* generic plugin fields */
71672 +       plugin_header h;
71673 +       /* any method below may be NULL: perm_chk() then does not call it and evaluates to 0 */
71674 +       /* check permissions for read/write */
71675 +       int (*read_ok) (struct file * file, const char *buf, size_t size, loff_t * off);
71676 +       int (*write_ok) (struct file * file, const char *buf, size_t size, loff_t * off);
71677 +
71678 +       /* check permissions for lookup */
71679 +       int (*lookup_ok) (struct inode * parent, struct dentry * dentry);
71680 +
71681 +       /* check permissions for create */
71682 +       int (*create_ok) (struct inode * parent, struct dentry * dentry, reiser4_object_create_data * data);
71683 +
71684 +       /* check permissions for linking @where to @existing */
71685 +       int (*link_ok) (struct dentry * existing, struct inode * parent, struct dentry * where);
71686 +
71687 +       /* check permissions for unlinking @victim from @parent */
71688 +       int (*unlink_ok) (struct inode * parent, struct dentry * victim);
71689 +
71690 +       /* check permissions for deletion of @victim whose last reference is
71691 +          by @parent */
71692 +       int (*delete_ok) (struct inode * parent, struct dentry * victim);
71693 +       int (*mask_ok) (struct inode * inode, int mask); /* check @inode against access @mask */
71694 +       /* check whether attribute change is acceptable */
71695 +       int (*setattr_ok) (struct dentry * dentry, struct iattr * attr);
71696 +
71697 +       /* check whether stat(2) is allowed */
71698 +       int (*getattr_ok) (struct vfsmount * mnt UNUSED_ARG, struct dentry * dentry, struct kstat * stat);
71699 +       /* check whether rename(2) is allowed */
71700 +       int (*rename_ok) (struct inode * old_dir, struct dentry * old,
71701 +                         struct inode * new_dir, struct dentry * new);
71702 +} perm_plugin;
71703 +/* NIKITA-FIXME-HANS: I really hate things like this that kill the ability of Meta-. to work.  Please eliminate this macro, exce */
71704 +/* call ->check_ok method of perm plugin for inode; evaluates to 0 if inode has no perm plugin or the method is NULL */
71705 +#define perm_chk(inode, check, ...)                    \
71706 +({                                                     \
71707 +       perm_plugin *perm;                              \
71708 +                                                       \
71709 +       perm = inode_perm_plugin(inode);                \
71710 +       (perm == NULL || perm->check ## _ok == NULL) ?  \
71711 +               0 :                                     \
71712 +               perm->check ## _ok(__VA_ARGS__);        \
71713 +})
71714 +
71715 +typedef enum { RWX_PERM_ID, LAST_PERM_ID } reiser4_perm_id; /* LAST_PERM_ID is the size of perm_plugins[] */
71716 +
71717 +/* __REISER4_PERM_H__ */
71718 +#endif
71719 +
71720 +/* Make Linus happy.
71721 +   Local variables:
71722 +   c-indentation-style: "K&R"
71723 +   mode-name: "LC"
71724 +   c-basic-offset: 8
71725 +   tab-width: 8
71726 +   fill-column: 120
71727 +   End:
71728 +*/
71729 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/space/bitmap.c linux-2.6.8-rc3-a/fs/reiser4/plugin/space/bitmap.c
71730 --- linux-2.6.8-rc3/fs/reiser4/plugin/space/bitmap.c    1970-01-01 03:00:00.000000000 +0300
71731 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/space/bitmap.c  2004-08-05 21:20:53.355603069 +0400
71732 @@ -0,0 +1,1602 @@
71733 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
71734 +
71735 +#include "../../debug.h"
71736 +#include "../../dformat.h"
71737 +#include "../../txnmgr.h"
71738 +#include "../../jnode.h"
71739 +#include "../../block_alloc.h"
71740 +#include "../../tree.h"
71741 +#include "../../super.h"
71742 +#include "../../lib.h"
71743 +
71744 +#include "../plugin.h"
71745 +#include "../../diskmap.h"
71746 +
71747 +#include "space_allocator.h"
71748 +#include "bitmap.h"
71749 +
71750 +#include <linux/types.h>
71751 +#include <linux/fs.h>          /* for struct super_block  */
71752 +#include <asm/semaphore.h>
71753 +#include <linux/vmalloc.h>
71754 +
71755 +/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
71756 + * blocks
71757 +
71758 +   A useful optimization of reiser4 bitmap handling would be dynamic bitmap
71759 +   blocks loading/unloading which is different from v3.x where all bitmap
71760 +   blocks are loaded at mount time.
71761 +
71762 +   To implement bitmap blocks unloading we need to count bitmap block usage
71763 +   and detect currently unused blocks allowing them to be unloaded. It is not
71764 +   a simple task since we allow several threads to modify one bitmap block
71765 +   simultaneously.
71766 +
71767 +   Briefly speaking, the following schema is proposed: we count in special
71768 +   variable associated with each bitmap block. That is for counting of block
71769 +   alloc/dealloc operations on that bitmap block. With a deferred block
71770 +   deallocation feature of reiser4 all those operation will be represented in
71771 +   atom dirty/deleted lists as jnodes for freshly allocated or deleted
71772 +   nodes.
71773 +
71774 +   So, we increment usage counter for each new node allocated or deleted, and
71775 +   decrement it at atom commit one time for each node from the dirty/deleted
71776 +   atom's list.  Of course, freshly allocated node deletion and node reusing
71777 +   from atom deleted (if we do so) list should decrement bitmap usage counter
71778 +   also.
71779 +
71780 +   This schema seems to be working but that reference counting is
71781 +   not easy to debug. I think we should agree with Hans and do not implement
71782 +   it in v4.0. Current code implements "on-demand" bitmap blocks loading only.
71783 +
71784 +   For simplicity, all bitmap nodes (both commit and working bitmap blocks)
71785 +   are either loaded into memory at fs mount time, or each bitmap node is
71786 +   loaded at the first access to it; the "dont_load_bitmap" mount option
71787 +   controls whether bitmap nodes are loaded at mount time. Dynamic unloading
71788 +   of bitmap nodes is currently not supported. */
71789 +
71790 +#define CHECKSUM_SIZE    4 /* bytes at the start of a bitmap block taken by its checksum */
71791 +
71792 +#define BYTES_PER_LONG (sizeof(long))
71793 +/* log2(BITS_PER_LONG): bit offset >> LONG_INT_SHIFT is the index of the long holding the bit */
71794 +#if BITS_PER_LONG == 64
71795 +#  define LONG_INT_SHIFT (6)
71796 +#else
71797 +#  define LONG_INT_SHIFT (5)
71798 +#endif
71799 +/* bit offset & LONG_INT_MASK is the position of the bit inside its long */
71800 +#define LONG_INT_MASK (BITS_PER_LONG - 1)
71801 +
71802 +typedef unsigned long ulong_t;
71803 +
71804 +/* bytes, and bits (8 per byte), of a block that are left for the bitmap after the checksum */
71805 +#define bmap_size(blocksize)       ((blocksize) - CHECKSUM_SIZE)
71806 +#define bmap_bit_count(blocksize)   (bmap_size(blocksize) << 3)
71807 +
71808 +/* Block allocation/deallocation is done through special bitmap objects which
71809 +   are allocated in an array at fs mount (see get_barray(), get_bnode()). */
71810 +struct bitmap_node {
71811 +       struct semaphore sema;  /* long term lock object */
71812 +
71813 +       jnode *wjnode;          /* j-nodes for WORKING ... */
71814 +       jnode *cjnode;          /* ... and COMMIT bitmap blocks */
71815 +
71816 +       bmap_off_t first_zero_bit;      /* for skip_busy option implementation */
71817 +
71818 +       atomic_t loaded;        /* a flag which shows that bnode is loaded
71819 +                                * already */
71820 +};
71821 +
71822 +static inline char *
71823 +bnode_working_data(struct bitmap_node *bnode)
71824 +{
71825 +       char *data = jdata(bnode->wjnode);
71826 +
71827 +       /* the bit array of the WORKING block starts right after its checksum */
71828 +       assert("zam-429", data != NULL);
71829 +
71830 +       return &data[CHECKSUM_SIZE];
71831 +}
71832 +
71833 +static inline char *
71834 +bnode_commit_data(const struct bitmap_node *bnode)
71835 +{
71836 +       char *data = jdata(bnode->cjnode);
71837 +
71838 +       /* the bit array of the COMMIT block starts right after its checksum */
71839 +       assert("zam-430", data != NULL);
71840 +
71841 +       return &data[CHECKSUM_SIZE];
71842 +}
71843 +
71844 +static inline __u32
71845 +bnode_commit_crc(const struct bitmap_node *bnode)
71846 +{
71847 +       char *data = jdata(bnode->cjnode);
71848 +
71849 +       /* the checksum is the d32 at the very start of the COMMIT block */
71850 +       assert("vpf-261", data != NULL);
71851 +
71852 +       return d32tocpu((d32 *) data);
71853 +}
71854 +
71855 +static inline void
71856 +bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
71857 +{
71858 +       char *data = jdata(bnode->cjnode);
71859 +
71860 +       /* @crc is stored as the d32 at the very start of the COMMIT block */
71861 +       assert("vpf-261", data != NULL);
71862 +
71863 +       cputod32(crc, (d32 *) data);
71864 +}
71865 +
71866 +/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
71867 + * written the code, does this added abstraction still have */
71868 +/* ANSWER(Zam): No, the abstractions is in the level above (exact place is the
71869 + * reiser4_space_allocator structure) */
71870 +/* ZAM-FIXME-HANS: I don't understand your english in comment above. */
71871 +/* FIXME-HANS(Zam): I don't understand the questions like "might be a union
71872 + * someday?". What they about?  If there is a reason to have a union, it should
71873 + * be a union, if not, it should not be a union.  "..might be someday" means no
71874 + * reason. */
71875 +struct bitmap_allocator_data { /* reached through space_allocator.u.generic, see get_barray() */
71876 +       /* array of bitmap nodes; get_bnode(super, i) is the address of its i-th element */
71877 +       struct bitmap_node *bitmap;
71878 +};
71879 +
71880 +#define get_barray(super) /* array of struct bitmap_node kept for @super */ \
71881 +(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
71882 +/* address of the @i-th bitmap node of @super; @i is parenthesized so that an expression argument stays one operand */
71883 +#define get_bnode(super, i) (get_barray(super) + (i))
71884 +
71885 +/* get a jnode from jalloc() and, if that succeeded, set it up as JNODE_BITMAP */
71886 +static jnode *
71887 +bnew(void)
71888 +{
71889 +       jnode *node;
71890 +
71891 +       node = jalloc();
71892 +       if (node != NULL)
71893 +               jnode_init(node, current_tree, JNODE_BITMAP);
71894 +       return node;
71895 +}
71896 +
71897 +/* this file contains:
71898 +   - bitmap based implementation of space allocation plugin
71899 +   - all the helper functions like set bit, find_first_zero_bit, etc */
71900 +
71901 +/* Audited by: green(2002.06.12). Returns index of the first zero bit of @word at or above @start_bit, BITS_PER_LONG if none. */
71902 +static int
71903 +find_next_zero_bit_in_word(ulong_t word, int start_bit)
71904 +{
71905 +       /* the mask must be as wide as @word: an int-wide one cannot reach bits 32..63 of a 64-bit long */
71906 +       ulong_t mask = 1UL << start_bit;
71907 +       int i = start_bit;
71908 +       while ((word & mask) != 0) {
71909 +               mask <<= 1;
71910 +               if (++i >= BITS_PER_LONG)
71911 +                       break;
71912 +       }
71913 +
71914 +       return i;
71915 +}
71916 +
71917 +#include <asm/bitops.h>
71918 +
71919 +#define reiser4_set_bit(nr, addr)    ext2_set_bit(nr, addr) /* bit primitives map 1:1 onto the ext2_*_bit() ones */
71920 +#define reiser4_clear_bit(nr, addr)  ext2_clear_bit(nr, addr)
71921 +#define reiser4_test_bit(nr, addr)  ext2_test_bit(nr, addr)
71922 +
71923 +#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
71924 +ext2_find_next_zero_bit(addr, maxoffset, offset)
71925 +
71926 +/* Search for a set bit in the bit array [@start_offset, @max_offset[, offsets
71927 + * are counted in bits from @addr; return the offset of the first set bit if
71928 + * one is found, @max_offset otherwise. */
71929 +static bmap_off_t reiser4_find_next_set_bit(
71930 +       void *addr, bmap_off_t max_offset, bmap_off_t start_offset)
71931 +{
71932 +       ulong_t *base = addr;
71933 +        /* start_offset is in bits, convert it to the index of the long holding that bit. */
71934 +       int word_nr = start_offset >> LONG_INT_SHIFT;
71935 +       /* bit number within that long. */
71936 +       int bit_nr = start_offset & LONG_INT_MASK;
71937 +       int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
71938 +
71939 +       assert("zam-387", max_offset != 0);
71940 +
71941 +       /* Unaligned @start_offset case: a set bit of the word is a zero bit of its complement.  */
71942 +       if (bit_nr != 0) {
71943 +               bmap_nr_t nr;
71944 +
71945 +               nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
71946 +
71947 +               if (nr < BITS_PER_LONG)
71948 +                       return (word_nr << LONG_INT_SHIFT) + nr;
71949 +
71950 +               ++word_nr;
71951 +       }
71952 +
71953 +       /* Fast scan through aligned words. NOTE(review): whole words are tested, so a hit in the last word may lie at or past @max_offset -- confirm callers check. */
71954 +       while (word_nr <= max_word_nr) {
71955 +               if (base[word_nr] != 0) {
71956 +                       return (word_nr << LONG_INT_SHIFT)
71957 +                           + find_next_zero_bit_in_word(~(base[word_nr]), 0);
71958 +               }
71959 +
71960 +               ++word_nr;
71961 +       }
71962 +
71963 +       return max_offset;
71964 +}
71965 +
71966 +/* Return index of the highest set bit of @word at or below @start_bit, BITS_PER_LONG if there is none. */
71967 +static int find_last_set_bit_in_word (ulong_t word, int start_bit)
71968 +{
71969 +       ulong_t bit_mask;       /* as wide as @word: "unsigned" cannot reach bits 32..63 of a 64-bit long */
71970 +       int nr = start_bit;
71971 +
71972 +       assert ("zam-965", start_bit < BITS_PER_LONG);
71973 +       assert ("zam-966", start_bit >= 0);
71974 +
71975 +       bit_mask = (1UL << nr);
71976 +       /* walk down from @start_bit; the mask becomes 0 once it is shifted below bit 0 */
71977 +       while (bit_mask != 0) {
71978 +               if (bit_mask & word)
71979 +                       return nr;
71980 +               bit_mask >>= 1;
71981 +               nr --;
71982 +       }
71983 +       return BITS_PER_LONG;
71984 +}
71985 +
71986 +/* Search bitmap for a set bit in backward direction from the end to the
71987 + * beginning of given region
71988 + *
71989 + * @result: out: bit offset (from @addr) of the last set bit; written only when 0 is returned,
71990 + * @addr:   base memory address,
71991 + * @low_off:  low end of the search region, edge bit included into the region,
71992 + * @high_off: high end of the search region, edge bit included into the region,
71993 + * NOTE(review): the word holding @low_off is tested as a whole, so a bit below @low_off can be reported -- confirm.
71994 + * @return: 0 - set bit was found, -1 otherwise.
71995 + */
71996 +static int
71997 +reiser4_find_last_set_bit (bmap_off_t * result, void * addr, bmap_off_t low_off, bmap_off_t high_off)
71998 +{
71999 +       ulong_t * base = addr;
72000 +       int last_word;
72001 +       int first_word;
72002 +       int last_bit;
72003 +       int nr;
72004 +
72005 +       assert ("zam-961", high_off >= 0);
72006 +       assert ("zam-962", high_off >= low_off);
72007 +
72008 +       last_word = high_off >> LONG_INT_SHIFT;
72009 +       last_bit = high_off & LONG_INT_MASK;
72010 +       first_word = low_off >> LONG_INT_SHIFT;
72011 +       /* word holding @high_off first: only its bits at or below @high_off are looked at */
72012 +       if (last_bit < BITS_PER_LONG) { /* always true: last_bit was masked with LONG_INT_MASK */
72013 +               nr = find_last_set_bit_in_word(base[last_word], last_bit);
72014 +               if (nr < BITS_PER_LONG) {
72015 +                       *result = (last_word << LONG_INT_SHIFT) + nr;
72016 +                       return 0;
72017 +               }
72018 +               -- last_word;
72019 +       }
72020 +       while (last_word >= first_word) { /* the remaining words are tested as a whole */
72021 +               if (base[last_word] != 0x0) {
72022 +                       last_bit = find_last_set_bit_in_word(base[last_word], BITS_PER_LONG - 1);
72023 +                       assert ("zam-972", last_bit < BITS_PER_LONG);
72024 +                       *result = (last_word << LONG_INT_SHIFT) + last_bit;
72025 +                       return 0;
72026 +               }
72027 +               -- last_word;
72028 +       }
72029 +
72030 +       return -1;              /* set bit not found */
72031 +}
72032 +
72033 +/* Search bitmap for a clear bit in backward direction from the end (@high_off) to the
72034 + * beginning (@low_off) of given region; on success store its bit offset in *@result and return 0, else return -1 */
72035 +static int
72036 +reiser4_find_last_zero_bit (bmap_off_t * result, void * addr, bmap_off_t low_off, bmap_off_t high_off)
72037 +{
72038 +       ulong_t * base = addr;
72039 +       int last_word;
72040 +       int first_word;
72041 +       int last_bit;
72042 +       int nr;
72043 +
72044 +       last_word = high_off >> LONG_INT_SHIFT;
72045 +       last_bit = high_off & LONG_INT_MASK;
72046 +       first_word = low_off >> LONG_INT_SHIFT;
72047 +       /* a clear bit of a word is a set bit of its complement, hence the ~ below */
72048 +       if (last_bit < BITS_PER_LONG) {
72049 +               nr = find_last_set_bit_in_word(~base[last_word], last_bit);
72050 +               if (nr < BITS_PER_LONG) {
72051 +                        *result = (last_word << LONG_INT_SHIFT) + nr;
72052 +                        return 0;
72053 +               }
72054 +               -- last_word;
72055 +       }
72056 +       while (last_word >= first_word) {
72057 +               if (base[last_word] != (ulong_t)(-1)) {
72058 +                       *result =  (last_word << LONG_INT_SHIFT) +
72059 +                               find_last_set_bit_in_word(~base[last_word], BITS_PER_LONG - 1);
72060 +                       return 0;
72061 +               }
72062 +               -- last_word;
72063 +       }
72064 +
72065 +       return -1;      /* zero bit not found */
72066 +}
72067 +
72068 +/* Clear the bits [start, end) of the bitmap at @addr.  Audited by: green(2002.06.12) */
72069 +static void
72070 +reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
72071 +{
72072 +       unsigned char keep_below = 0xFF;        /* becomes: bits of the first byte below @start */
72073 +       unsigned char keep_above = 0xFF;        /* becomes: bits of the last byte at or above @end */
72074 +       int head;
72075 +       int tail;
72076 +
72077 +       assert("zam-410", start < end);
72078 +
72079 +       head = start >> 3;
72080 +       tail = (end - 1) >> 3;
72081 +
72082 +       keep_below >>= 8 - (start & 0x7);
72083 +       keep_above <<= ((end - 1) & 0x7) + 1;
72084 +
72085 +       /* zero the whole bytes lying strictly between the two edge bytes */
72086 +       if (tail > head + 1)
72087 +               xmemset(addr + head + 1, 0, (size_t) (tail - head - 1));
72088 +
72089 +       if (head != tail) {
72090 +               addr[head] &= keep_below;
72091 +               addr[tail] &= keep_above;
72092 +       } else {
72093 +               addr[head] &= (keep_below | keep_above);
72094 +       }
72095 +}
72096 +
72097 +/* Audited by: green(2002.06.12) */
72098 +/* Set every bit of the half-open range [start, end) in the bitmap at @addr */
72099 +static void
72100 +reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
72101 +{
72102 +       unsigned char from_start = 0xFF;        /* becomes: bits of the first byte at or above @start */
72103 +       unsigned char up_to_end = 0xFF;         /* becomes: bits of the last byte below @end */
72104 +       int head;
72105 +       int tail;
72106 +
72107 +       assert("zam-386", start < end);
72108 +
72109 +       head = start >> 3;
72110 +       tail = (end - 1) >> 3;
72111 +
72112 +       from_start <<= start & 0x7;
72113 +       up_to_end >>= 7 - ((end - 1) & 0x7);
72114 +
72115 +       /* fill the whole bytes lying strictly between the two edge bytes */
72116 +       if (tail > head + 1)
72117 +               xmemset(addr + head + 1, 0xFF, (size_t) (tail - head - 1));
72118 +
72119 +       if (head != tail) {
72120 +               addr[head] |= from_start;
72121 +               addr[tail] |= up_to_end;
72122 +       } else {
72123 +               addr[head] |= (from_start & up_to_end);
72124 +       }
72125 +}
72126 +
72127 +#define ADLER_BASE    65521 /* modulus applied to both adler32 sums (zlib: largest prime smaller than 65536) */
72128 +#define ADLER_NMAX    5552 /* most bytes adler32() adds up between two modulo reductions */
72129 +
72130 +/* Calculates the adler32 checksum for the data pointed by `data` of the
72131 +    length `len`. This function was originally taken from zlib, version 1.1.3,
72132 +    July 9th, 1998.
72133 +
72134 +    Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
72135 +
72136 +    This software is provided 'as-is', without any express or implied
72137 +    warranty.  In no event will the authors be held liable for any damages
72138 +    arising from the use of this software.
72139 +
72140 +    Permission is granted to anyone to use this software for any purpose,
72141 +    including commercial applications, and to alter it and redistribute it
72142 +    freely, subject to the following restrictions:
72143 +
72144 +    1. The origin of this software must not be misrepresented; you must not
72145 +       claim that you wrote the original software. If you use this software
72146 +       in a product, an acknowledgment in the product documentation would be
72147 +       appreciated but is not required.
72148 +    2. Altered source versions must be plainly marked as such, and must not be
72149 +       misrepresented as being the original software.
72150 +    3. This notice may not be removed or altered from any source distribution.
72151 +
72152 +    Jean-loup Gailly        Mark Adler
72153 +    jloup@gzip.org          madler@alumni.caltech.edu
72154 +
72155 +    The above comment applies only to the adler32 function.
72156 +*/
72157 +/* adler32 checksum of the @len bytes at @data: (sum of running sums) << 16 | (byte sum seeded with 1), both modulo ADLER_BASE */
72158 +static __u32
72159 +adler32(char *data, __u32 len)
72160 +{
72161 +       unsigned char *byte = (unsigned char *) data;
72162 +       __u32 low = 1;          /* sum of the bytes, seeded with 1 */
72163 +       __u32 high = 0;         /* sum of the successive values of @low */
72164 +       int chunk;
72165 +       int i;
72166 +
72167 +       /* the sums are reduced modulo ADLER_BASE once per chunk of at most ADLER_NMAX bytes */
72168 +       for (; len > 0; len -= chunk) {
72169 +               chunk = (len >= ADLER_NMAX) ? ADLER_NMAX : len;
72170 +               for (i = 0; i < chunk; i++) {
72171 +                       low += byte[i];
72172 +                       high += low;
72173 +               }
72174 +               byte += chunk;
72175 +               low = low % ADLER_BASE;
72176 +               high = high % ADLER_BASE;
72177 +       }
72178 +       return low | (high << 16);
72179 +}
72180 +
72181 +static __u32
72182 +bnode_calc_crc(const struct bitmap_node *bnode)
72183 +{
72184 +       /* adler32 over the commit bitmap data; its length follows from the block size */
72185 +       struct super_block *sb = jnode_get_tree(bnode->wjnode)->super;
72186 +       __u32 length = bmap_size(sb->s_blocksize);
72187 +       return adler32(bnode_commit_data(bnode), length);
72188 +}
72189 +
72190 +#define REISER4_CHECK_BMAP_CRC (0)
72191 +/* bnode_check_crc(): 0 when the recomputed commit-bitmap checksum equals the stored one, -EIO (after a warning) otherwise */
72192 +#if REISER4_CHECK_BMAP_CRC
72193 +static int
72194 +bnode_check_crc(const struct bitmap_node *bnode)
72195 +{
72196 +       if (bnode_calc_crc(bnode) != bnode_commit_crc (bnode)) {
72197 +               bmap_nr_t bmap;
72198 +               struct super_block *super;
72199 +
72200 +               super = jnode_get_tree(bnode->wjnode)->super;
72201 +               bmap = bnode - get_bnode(super, 0);     /* distance of @bnode from get_bnode(super, 0), reported as the bitmap number */
72202 +
72203 +               warning("vpf-263",
72204 +                       "Checksum for the bitmap block %llu is incorrect", bmap);
72205 +               return RETERR(-EIO);
72206 +       } else
72207 +               return 0;
72208 +}
72209 +
72210 +/* REISER4_CHECK_BMAP_CRC */
72211 +#else
72212 +/* checking is compiled out: always report success */
72213 +#define bnode_check_crc(bnode) (0)
72214 +
72215 +/* REISER4_CHECK_BMAP_CRC */
72216 +#endif
72217 +
72218 +/* Updates an adler32 checksum after exactly one byte of the data changed.
72219 +    adler - checksum computed before the change
72220 +    old_data, data - previous and new value of the changed byte
72221 +    tail == (chunk - offset): the length the checksum was calculated for,
72222 +    minus the offset of the changed byte within this chunk.
72223 +    This avoids recalculating the checksum over the whole chunk.
72224 +*/
72225 +
72226 +static __u32
72227 +adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data, __u32 tail)
72228 +{
72229 +       /* adding 2 * ADLER_BASE keeps the byte difference non-negative before the modulo */
72230 +       __u32 diff = 2 * ADLER_BASE + data - old_data;
72231 +       __u32 low = adler & 0xffff;
72232 +       __u32 high = (adler >> 16) & 0xffff;
72233 +
72234 +       low = (low + diff) % ADLER_BASE;
72235 +       high = (high + diff * tail) % ADLER_BASE;
72236 +       return low | (high << 16);
72237 +}
72238 +
72239 +/* LIMIT(val, boundary): the smaller of the two; an argument may be evaluated twice, so pass side-effect free expressions */
72240 +#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
72241 +
72242 +/* A number of bitmap blocks for given fs. This number can be stored on disk
72243 +   or calculated on fly; it depends on disk format.
72244 +VS-FIXME-HANS: explain calculation, using device with block count of 8 * 4096 blocks as an example.
72245 +   FIXME-VS: number of blocks in a filesystem is taken from reiser4
72246 +   super private data */
72247 +/* Audited by: green(2002.06.12) */
72248 +static bmap_nr_t
72249 +get_nr_bmap(const struct super_block *super)
72250 +{
72251 +       /* (block count - 1) / bits per bitmap block + 1; div64_32() is assumed to return the quotient */
72252 +       assert("zam-393", reiser4_block_count(super) != 0);
72253 +       return 1 + div64_32(reiser4_block_count(super) - 1,
72254 +                           bmap_bit_count(super->s_blocksize), NULL);
72255 +}
72256 +
72257 +/* Split block number *@block into the bitmap block number (*@bmap) and the bit offset within that bitmap block (*@offset) */
72258 +static void
72259 +parse_blocknr(const reiser4_block_nr * block, bmap_nr_t * bmap, bmap_off_t * offset)
72260 +{
72261 +       struct super_block *sb;
72262 +
72263 +       sb = get_current_context()->super;
72264 +       *bmap = div64_32(*block, bmap_bit_count(sb->s_blocksize), offset);
72265 +       assert("zam-433", *bmap < get_nr_bmap(sb));
72266 +}
72267 +
72268 +#if REISER4_DEBUG
72269 +/* Assert that *@start (and the range *@start + *@len when @len is given) is a real block range inside the fs.  Audited by: green(2002.06.12) */
72270 +static void
72271 +check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len)
72272 +{
72273 +       struct super_block *sb = reiser4_get_current_sb();
72274 +
72275 +       assert("zam-436", sb != NULL);
72276 +
72277 +       assert("zam-455", start != NULL);
72278 +       assert("zam-437", *start != 0);
72279 +       assert("zam-541", !blocknr_is_fake(start));
72280 +       assert("zam-441", *start < reiser4_block_count(sb));
72281 +
72282 +       if (len != NULL) { /* @len is optional */
72283 +               assert("zam-438", *len != 0);
72284 +               assert("zam-442", *start + *len <= reiser4_block_count(sb));
72285 +       }
72286 +}
72287 +/* Assert that both jnodes of @bnode (working and commit) have a page and are loaded */
72288 +static void
72289 +check_bnode_loaded(const struct bitmap_node *bnode)
72290 +{
72291 +       assert("zam-485", bnode != NULL);
72292 +       assert("zam-483", jnode_page(bnode->wjnode) != NULL);
72293 +       assert("zam-484", jnode_page(bnode->cjnode) != NULL);
72294 +       assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
72295 +       assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
72296 +}
72297 +
72298 +#else
72299 +/* without REISER4_DEBUG both checks expand to empty statements */
72300 +#  define check_block_range(start, len) do { /* nothing */} while(0)
72301 +#  define check_bnode_loaded(bnode)     do { /* nothing */} while(0)
72302 +
72303 +#endif
72304 +
72305 +/* Lower bnode->first_zero_bit to @offset when bits below it were freed; the caller must hold the bnode lock */
72306 +static inline void
72307 +adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
72308 +{
72309 +       if (bnode->first_zero_bit <= offset)
72310 +               return;
72311 +       bnode->first_zero_bit = offset;
72312 +}
72313 +
72314 +/* return a physical disk address for logical bitmap number @bmap */
72315 +/* FIXME-VS: this is somehow related to disk layout? */
72316 +/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
72317 + * per block allocation so that performance is not affected.  Probably this
72318 + * whole file should be considered part of the disk layout plugin, and other
72319 + * disk layouts can use other defines and efficiency will not be significantly
72320 + * affected.  */
72321 +
72322 +#define REISER4_FIRST_BITMAP_BLOCK \
72323 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2)
72324 +
72325 +/* Store in *@bnr the disk block number of bitmap block @bmap.  Audited by: green(2002.06.12) */
72326 +reiser4_internal void
72327 +get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap, reiser4_block_nr * bnr)
72328 +{
72329 +       /* NOTE(review): 0xc0e1<<16 below shifts into the sign bit of int; confirm the parameter type before adding a U suffix */
72330 +       assert("zam-390", bmap < get_nr_bmap(super));
72331 +
72332 +#ifdef CONFIG_REISER4_BADBLOCKS
72333 +#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
72334 +       /* Ask the diskmap first: it may already know where this bitmap is. */
72335 +       if ( reiser4_get_diskmap_value( BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0 )
72336 +               return; /* Found it in diskmap */
72337 +#endif
72338 +       /* FIXME_ZAM: before discussing of disk layouts and disk format
72339 +          plugins I implement bitmap location scheme which is close to scheme
72340 +          used in reiser 3.6 */
72341 +       if (bmap != 0) {
72342 +               *bnr = bmap * bmap_bit_count(super->s_blocksize);
72343 +       } else {
72344 +               *bnr = REISER4_FIRST_BITMAP_BLOCK;
72345 +       }
72346 +}
72347 +
72348 +/* Build in *@bnr the fake block number of the shadow (WORKING BITMAP) block of bitmap @bmap.  Audited by: green(2002.06.12) */
72349 +reiser4_internal void
72350 +get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
72351 +{
72352 +       *bnr = (reiser4_block_nr) (REISER4_BITMAP_BLOCKS_STATUS_VALUE |
72353 +                                  (bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK));
72354 +}
72355 +
72356 +/* Put @bnode into its initial state: zeroed, not loaded, semaphore free.  @super and @bmap are unused */
72357 +static void
72358 +init_bnode(struct bitmap_node *bnode,
72359 +          struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
72360 +{
72361 +       xmemset(bnode, 0, sizeof *bnode);
72362 +
72363 +       atomic_set(&bnode->loaded, 0);
72364 +       sema_init(&bnode->sema, 1);
72365 +}
72366 +/* Give up a bitmap jnode: jrelse() it, flag it JNODE_HEARD_BANSHEE, then jput() it */
72367 +static void
72368 +release(jnode *node)
72369 +{
72370 +       jrelse(node); /* counterpart of the jload_gfp()/jinit_new() done in prepare_bnode() */
72371 +       JF_SET(node, JNODE_HEARD_BANSHEE); /* presumably marks the jnode as going away -- confirm against jnode.h */
72372 +       jput(node); /* counterpart of the jref() done in prepare_bnode() */
72373 +}
72374 +
72375 +/* Tear down @bnode: mark it not loaded and release both jnodes.  For internal bitmap.c
72376 +   use only, because it assumes the jnodes are under full control of this thread */
72377 +static void
72378 +done_bnode(struct bitmap_node *bnode)
72379 +{
72380 +       if (bnode == NULL)
72381 +               return;
72382 +       atomic_set(&bnode->loaded, 0);
72383 +       if (bnode->wjnode != NULL)
72384 +               release(bnode->wjnode);
72385 +       if (bnode->cjnode != NULL)
72386 +               release(bnode->cjnode);
72387 +       bnode->wjnode = bnode->cjnode = NULL;
72388 +}
72389 +
72390 +/* Allocate the working and commit jnodes of @bnode and load their data.  Called only by load_and_lock_bnode() */
72391 +static int
72392 +prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret, jnode **wjnode_ret)
72393 +{
72394 +       struct super_block *super = reiser4_get_current_sb();
72395 +       jnode *cjnode;
72396 +       jnode *wjnode;
72397 +       bmap_nr_t bmap;
72398 +       int ret;
72399 +
72400 +       /* the caller looks at both outputs even when we fail, so define them up front */
72401 +       *wjnode_ret = *cjnode_ret = NULL;
72402 +       *wjnode_ret = wjnode = bnew();
72403 +       if (wjnode == NULL)
72404 +               return RETERR(-ENOMEM);
72405 +
72406 +       *cjnode_ret = cjnode = bnew();
72407 +       if (cjnode == NULL) /* NOTE(review): *wjnode_ret stays set and the caller release()s it -- confirm that is valid for a never-loaded jnode */
72408 +               return RETERR(-ENOMEM);
72409 +
72410 +       bmap = bnode - get_bnode(super, 0); /* bitmap number: distance of @bnode from get_bnode(super, 0) */
72411 +
72412 +       get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
72413 +       get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
72414 +
72415 +       jref(cjnode);
72416 +       jref(wjnode);
72417 +
72418 +       /* load commit bitmap */
72419 +       ret = jload_gfp(cjnode, GFP_NOFS, 1);
72420 +
72421 +       if (ret)
72422 +               goto error;
72423 +
72424 +       /* allocate memory for working bitmap block. Note that for
72425 +        * bitmaps jinit_new() doesn't actually modify node content,
72426 +        * so parallel calls to this are ok. */
72427 +       ret = jinit_new(wjnode, GFP_NOFS);
72428 +
72429 +       if (ret != 0) {
72430 +               jrelse(cjnode); /* undo the successful jload_gfp() above */
72431 +               goto error;
72432 +       }
72433 +
72434 +       return 0; /* both jnodes are referenced and set up; the caller owns them now */
72435 +
72436 + error:
72437 +       jput(cjnode); /* drop the references taken by jref() above */
72438 +       jput(wjnode);
72439 +       *wjnode_ret = *cjnode_ret = NULL;
72440 +       return ret;
72441 +
72442 +}
72443 +/* Non-zero when the __u32 stored at the start of @jnode's data differs from the adler32 of the @size bytes that follow CHECKSUM_SIZE */
72444 +static int
72445 +check_adler32_jnode(jnode *jnode, unsigned long size) {
72446 +       return (*(__u32 *)jdata(jnode) != adler32(jdata(jnode) + CHECKSUM_SIZE, size));
72447 +}
72448 +
72449 +/* Validate a commit bitmap block after it was read: 0 if it looks sane, -EINVAL (after a warning) if not */
72450 +static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize) {
72451 +       jnode *commit = bnode->cjnode;
72452 +       void *bits;
72453 +
72454 +       /* the stored checksum has to match the bitmap data */
72455 +       if (check_adler32_jnode(commit, bmap_size(blksize)) != 0) {
72456 +               warning("vpf-1361", "Checksum for the bitmap block %llu "
72457 +                       "is incorrect", commit->blocknr);
72458 +
72459 +               return -EINVAL;
72460 +       }
72461 +
72462 +       bits = jdata(commit) + CHECKSUM_SIZE;
72463 +
72464 +       /* bit 0 has to be set -- the very first bit must be busy */
72465 +       if (reiser4_test_bit(0, bits))
72466 +               return 0;
72467 +
72468 +       warning("vpf-1362", "The allocator block %llu is not marked as used.",
72469 +               commit->blocknr);
72470 +
72471 +       return -EINVAL;
72472 +}
72473 +
72474 +/* load bitmap blocks "on-demand" */
72475 +static int
72476 +load_and_lock_bnode(struct bitmap_node *bnode)
72477 +{
72478 +       int ret;
72479 +
72480 +       jnode *cjnode;
72481 +       jnode *wjnode;
72482 +
72483 +       assert("nikita-3040", schedulable());
72484 +
72485 +/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not
72486 + * need to be atomic, right? Just leave a comment that if bitmaps were
72487 + * unloadable, this would need to be atomic.  */
72488 +       if (atomic_read(&bnode->loaded)) {
72489 +               /* bitmap is already loaded, nothing to do */
72490 +               check_bnode_loaded(bnode);
72491 +               down(&bnode->sema);
72492 +               assert("nikita-2827", atomic_read(&bnode->loaded));
72493 +               return 0;
72494 +       }
72495 +
72496 +       ret = prepare_bnode(bnode, &cjnode, &wjnode);
72497 +       if (ret == 0) {
72498 +               down(&bnode->sema);
72499 +
72500 +               if (!atomic_read(&bnode->loaded)) {
72501 +                       assert("nikita-2822", cjnode != NULL);
72502 +                       assert("nikita-2823", wjnode != NULL);
72503 +                       assert("nikita-2824", jnode_is_loaded(cjnode));
72504 +                       assert("nikita-2825", jnode_is_loaded(wjnode));
72505 +
72506 +                       bnode->wjnode = wjnode;
72507 +                       bnode->cjnode = cjnode;
72508 +
72509 +                       ret = check_struct_bnode(bnode, current_blocksize);
72510 +                       if (!ret) {
72511 +                               cjnode = wjnode = NULL;
72512 +                               atomic_set(&bnode->loaded, 1);
72513 +                               /* working bitmap is initialized by on-disk
72514 +                                * commit bitmap. This should be performed
72515 +                                * under semaphore. */
72516 +                               xmemcpy(bnode_working_data(bnode),
72517 +                                       bnode_commit_data(bnode),
72518 +                                       bmap_size(current_blocksize));
72519 +                       } else {
72520 +                               up(&bnode->sema);
72521 +                       }
72522 +               } else
72523 +                       /* race: someone already loaded bitmap while we were
72524 +                        * busy initializing data. */
72525 +                       check_bnode_loaded(bnode);
72526 +       }
72527 +
72528 +       if (wjnode != NULL)
72529 +               release(wjnode);
72530 +       if (cjnode != NULL)
72531 +               release(cjnode);
72532 +
72533 +       return ret;
72534 +}
72535 +/* Counterpart of a successful load_and_lock_bnode(): releases bnode->sema (the bitmap itself stays loaded) */
72536 +static void
72537 +release_and_unlock_bnode(struct bitmap_node *bnode)
72538 +{
72539 +       check_bnode_loaded(bnode);
72540 +       up(&bnode->sema);
72541 +}
72542 +
72543 +/* Search bitmap block @bmap from *@offset up to @max_offset for @min_len..@max_len clear bits and mark them busy.
72544 +   Returns the run length with *@offset set to its start, 0 if nothing fits, or the load_and_lock_bnode() error.*/
72545 +/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap
72546 +   block responsibility zone boundaries. This had no sense in v3.6 but may
72547 +   have it in v4.x */
72548 +/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */
72549 +static int
72550 +search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset, bmap_off_t max_offset,
72551 +                         int min_len, int max_len)
72552 +{
72553 +       struct super_block *super = get_current_context()->super;
72554 +       struct bitmap_node *bnode = get_bnode(super, bmap);
72555 +
72556 +       char *data;
72557 +
72558 +       bmap_off_t search_end;
72559 +       bmap_off_t start;
72560 +       bmap_off_t end;
72561 +
72562 +       int set_first_zero_bit = 0;
72563 +
72564 +       int ret;
72565 +
72566 +       assert("zam-364", min_len > 0);
72567 +       assert("zam-365", max_len >= min_len);
72568 +       assert("zam-366", *offset < max_offset);
72569 +
72570 +       ret = load_and_lock_bnode(bnode);
72571 +
72572 +       if (ret)
72573 +               return ret;
72574 +       /* ret is 0 from here on and stays 0 unless a run is allocated below */
72575 +       data = bnode_working_data(bnode);
72576 +
72577 +       start = *offset;
72578 +
72579 +       if (bnode->first_zero_bit >= start) { /* no point in starting before the cached first clear bit */
72580 +               start = bnode->first_zero_bit;
72581 +               set_first_zero_bit = 1; /* the first search below refreshes the cached value */
72582 +       }
72583 +
72584 +       while (start + min_len < max_offset) { /* NOTE(review): a run ending exactly at max_offset with start + min_len == max_offset is never tried -- confirm whether <= was meant */
72585 +
72586 +               start = reiser4_find_next_zero_bit((long *) data, max_offset, start);
72587 +               if (set_first_zero_bit) {
72588 +                       bnode->first_zero_bit = start;
72589 +                       set_first_zero_bit = 0;
72590 +               }
72591 +               if (start >= max_offset)
72592 +                       break;
72593 +
72594 +               search_end = LIMIT(start + max_len, max_offset); /* never take more than max_len bits */
72595 +               end = reiser4_find_next_set_bit((long *) data, search_end, start);
72596 +               if (end >= start + min_len) {
72597 +                       /* we can't trust find_next_set_bit result if set bit
72598 +                          was not found, result may be bigger than
72599 +                          max_offset */
72600 +                       if (end > search_end)
72601 +                               end = search_end;
72602 +
72603 +                       ret = end - start;
72604 +                       *offset = start;
72605 +
72606 +                       reiser4_set_bits(data, start, end);
72607 +
72608 +                       /* FIXME: we may advance first_zero_bit if [start,
72609 +                          end] region overlaps the first_zero_bit point */
72610 +
72611 +                       break;
72612 +               }
72613 +
72614 +               start = end + 1; /* the run was too short: continue past the set bit that ended it */
72615 +       }
72616 +
72617 +       release_and_unlock_bnode(bnode);
72618 +
72619 +       return ret;
72620 +}
72621 +/* Backward variant: search bitmap block @bmap from *@start_offset down to @end_offset for @min_len..@max_len clear bits and mark them busy; returns the run length with *@start_offset set to its lowest bit, 0 if nothing fits, or the load error */
72622 +static int
72623 +search_one_bitmap_backward (bmap_nr_t bmap, bmap_off_t * start_offset, bmap_off_t end_offset,
72624 +                           int min_len, int max_len)
72625 +{
72626 +       struct super_block *super = get_current_context()->super;
72627 +       struct bitmap_node *bnode = get_bnode(super, bmap);
72628 +       char *data;
72629 +       bmap_off_t start;
72630 +       int ret;
72631 +
72632 +       assert("zam-958", min_len > 0);
72633 +       assert("zam-959", max_len >= min_len);
72634 +       assert("zam-960", *start_offset >= end_offset);
72635 +
72636 +       ret = load_and_lock_bnode(bnode);
72637 +       if (ret)
72638 +               return ret;
72639 +       /* ret is 0 from here on and stays 0 unless a run is allocated below */
72640 +       data = bnode_working_data(bnode);
72641 +       start = *start_offset;
72642 +
72643 +       while (1) {
72644 +               bmap_off_t end, search_end;
72645 +
72646 +               /* Find the beginning of the zero filled region */
72647 +               if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
72648 +                       break;
72649 +               /* Is there more than `min_len' bits from `start' to
72650 +                * `end_offset'?  */
72651 +               if (start < end_offset + min_len - 1)
72652 +                       break;
72653 +
72654 +               /* Do not search to `end_offset' if we need to find less than
72655 +                * `max_len' zero bits. */
72656 +               if (end_offset + max_len - 1 < start)
72657 +                       search_end = start - max_len + 1;
72658 +               else
72659 +                       search_end = end_offset;
72660 +
72661 +               if (reiser4_find_last_set_bit(&end, data, search_end, start))
72662 +                       end = search_end; /* no set bit: the clear run reaches search_end */
72663 +               else
72664 +                       end ++; /* the run starts just above the set bit that was found */
72665 +
72666 +               if (end + min_len <= start + 1) { /* [end, start] holds at least min_len bits */
72667 +                       if (end < search_end) /* the word-level search may report a bit below search_end */
72668 +                               end = search_end;
72669 +                       ret = start - end + 1;
72670 +                       *start_offset = end; /* `end' is lowest offset */
72671 +                       assert ("zam-987", reiser4_find_next_set_bit(data, start + 1, end) >= start + 1);
72672 +                       reiser4_set_bits(data, end, start + 1);
72673 +                       break;
72674 +               }
72675 +
72676 +               if (end <= end_offset)
72677 +                       /* left search boundary reached. */
72678 +                       break;
72679 +               start = end - 1; /* the run was too short: continue below the set bit that ended it */
72680 +       }
72681 +
72682 +       release_and_unlock_bnode(bnode);
72683 +       return ret;
72684 +}
72685 +
72686 +/* Allocate a contiguous run of @min_len..@max_len blocks in [*@start, *@end), one bitmap block at a time; returns what search_one_bitmap_forward() returned and rewrites *@start */
72687 +static int bitmap_alloc_forward(reiser4_block_nr * start, const reiser4_block_nr * end,
72688 +                               int min_len, int max_len)
72689 +{
72690 +       bmap_nr_t bmap, end_bmap;
72691 +       bmap_off_t offset, end_offset;
72692 +       int len;
72693 +
72694 +       reiser4_block_nr tmp;
72695 +
72696 +       struct super_block *super = get_current_context()->super;
72697 +       const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
72698 +
72699 +       parse_blocknr(start, &bmap, &offset);
72700 +       /* @end is exclusive: locate the last block of the range, then step one past it */
72701 +       tmp = *end - 1;
72702 +       parse_blocknr(&tmp, &end_bmap, &end_offset);
72703 +       ++end_offset;
72704 +
72705 +       assert("zam-358", end_bmap >= bmap);
72706 +       assert("zam-359", ergo(end_bmap == bmap, end_offset > offset));
72707 +
72708 +       for (; bmap < end_bmap; bmap++, offset = 0) { /* every bitmap block before the last one is searched up to its end */
72709 +               len = search_one_bitmap_forward(bmap, &offset, max_offset, min_len, max_len);
72710 +               if (len != 0) /* a run was found (> 0) or the bitmap could not be loaded (< 0) */
72711 +                       goto out;
72712 +       }
72713 +
72714 +       len = search_one_bitmap_forward(bmap, &offset, end_offset, min_len, max_len);
72715 +out:
72716 +       *start = bmap * max_offset + offset; /* written on every path, also when nothing was allocated */
72717 +       return len;
72718 +}
72719 +
72720 +/* allocate contiguous range of blocks in bitmap (from @start to @end in
72721 + * backward direction); returns what search_one_bitmap_backward() returned and rewrites *@start */
72722 +static int bitmap_alloc_backward(reiser4_block_nr * start, const reiser4_block_nr * end,
72723 +                                int min_len, int max_len)
72724 +{
72725 +       bmap_nr_t bmap, end_bmap;
72726 +       bmap_off_t offset, end_offset;
72727 +       int len;
72728 +       struct super_block *super = get_current_context()->super;
72729 +       const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
72730 +
72731 +       parse_blocknr(start, &bmap, &offset);
72732 +       parse_blocknr(end, &end_bmap, &end_offset); /* here *@end itself is the lowest block searched */
72733 +
72734 +       assert("zam-961", end_bmap <= bmap);
72735 +       assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
72736 +
72737 +       for (; bmap > end_bmap; bmap --, offset = max_offset - 1) { /* every bitmap block above the lowest one is searched down to bit 0 */
72738 +               len = search_one_bitmap_backward(bmap, &offset, 0, min_len, max_len);
72739 +               if (len != 0) /* a run was found (> 0) or the bitmap could not be loaded (< 0) */
72740 +                       goto out;
72741 +       }
72742 +
72743 +       len = search_one_bitmap_backward(bmap, &offset, end_offset, min_len, max_len);
72744 + out:
72745 +       *start = bmap * max_offset + offset; /* written on every path, also when nothing was allocated */
72746 +       return len;
72747 +}
72748 +
72749 +/* plugin->u.space_allocator.alloc_blocks() */
72750 +reiser4_internal int
72751 +alloc_blocks_forward(reiser4_blocknr_hint * hint, int needed,
72752 +                        reiser4_block_nr * start, reiser4_block_nr * len)
72753 +{
72754 +       struct super_block *super = get_current_context()->super;
72755 +       int actual_len;
72756 +
72757 +       reiser4_block_nr search_start;
72758 +       reiser4_block_nr search_end;
72759 +
72760 +       assert("zam-398", super != NULL);
72761 +       assert("zam-412", hint != NULL);
72762 +       assert("zam-397", hint->blk < reiser4_block_count(super));
72763 +
72764 +       if (hint->max_dist == 0)
72765 +               search_end = reiser4_block_count(super);
72766 +       else
72767 +               search_end = LIMIT(hint->blk + hint->max_dist, reiser4_block_count(super));
72768 +
72769 +       /* We use @hint -> blk as a search start and search from it to the end
72770 +          of the disk or in given region if @hint -> max_dist is not zero */
72771 +       search_start = hint->blk;
72772 +
72773 +       actual_len = bitmap_alloc_forward(&search_start, &search_end, 1, needed);
72774 +
72775 +       /* There is only one bitmap search if max_dist was specified or first
72776 +          pass was from the beginning of the bitmap. We also do one pass for
72777 +          scanning bitmap in backward direction. */
72778 +       if (!(actual_len != 0 || hint->max_dist != 0  || search_start == 0)) {
72779 +               /* next step is a scanning from 0 to search_start */
72780 +               search_end = search_start;
72781 +               search_start = 0;
72782 +               actual_len = bitmap_alloc_forward(&search_start, &search_end, 1, needed);
72783 +       }
72784 +       if (actual_len == 0)
72785 +               return RETERR(-ENOSPC);
72786 +       if (actual_len < 0)
72787 +               return RETERR(actual_len);
72788 +       *len = actual_len;
72789 +       *start = search_start;
72790 +       return 0;
72791 +}
72792 +
72793 +static int alloc_blocks_backward (reiser4_blocknr_hint * hint, int needed,
72794 +                                 reiser4_block_nr * start, reiser4_block_nr * len)
72795 +{
72796 +       reiser4_block_nr high;  /* the search starts here and moves down */
72797 +       reiser4_block_nr low;   /* lowest block searched: 0, or max_dist below the start */
72798 +       int found;
72799 +
72800 +       ON_DEBUG(struct super_block * super = reiser4_get_current_sb());
72801 +
72802 +       assert ("zam-969", super != NULL);
72803 +       assert ("zam-970", hint != NULL);
72804 +       assert ("zam-971", hint->blk < reiser4_block_count(super));
72805 +
72806 +       high = hint->blk;
72807 +       if (hint->max_dist != 0 && high > hint->max_dist)
72808 +               low = high - hint->max_dist;
72809 +       else
72810 +               low = 0;
72811 +
72812 +       found = bitmap_alloc_backward(&high, &low, 1, needed);
72813 +       if (found < 0)
72814 +               return RETERR(found);
72815 +       if (found == 0)
72816 +               return RETERR(-ENOSPC);
72817 +       *start = high;
72818 +       *len = found;
72819 +       return 0;
72820 +}
72821 +
72822 +/* plugin->u.space_allocator.alloc_blocks(): picks the search direction requested by @hint->backward */
72823 +reiser4_internal int
72824 +alloc_blocks_bitmap(reiser4_space_allocator * allocator UNUSED_ARG,
72825 +                       reiser4_blocknr_hint * hint, int needed,
72826 +                       reiser4_block_nr * start, reiser4_block_nr * len)
72827 +{
72828 +       if (!hint->backward)
72829 +               return alloc_blocks_forward(hint, needed, start, len);
72830 +       return alloc_blocks_backward(hint, needed, start, len);
72831 +}
72832 +
72833 +/* plugin->u.space_allocator.dealloc_blocks(). */
72834 +/* It just frees blocks in WORKING BITMAP. Usually formatted and unformatted
72835 +   nodes deletion is deferred until transaction commit.  However, deallocation
72836 +   of temporary objects like wandered blocks and transaction commit records
72837 +   requires immediate node deletion from WORKING BITMAP.*/
72838 +reiser4_internal void
72839 +dealloc_blocks_bitmap(reiser4_space_allocator * allocator UNUSED_ARG, reiser4_block_nr start, reiser4_block_nr len)
72840 +{
72841 +       struct super_block *super = reiser4_get_current_sb();
72842 +
72843 +       bmap_nr_t bmap;
72844 +       bmap_off_t offset;
72845 +
72846 +       struct bitmap_node *bnode;
72847 +       int ret;
72848 +
72849 +       assert("zam-468", len != 0);
72850 +       check_block_range(&start, &len);
72851 +
72852 +       parse_blocknr(&start, &bmap, &offset);
72853 +       assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
72854 +
72855 +       bnode = get_bnode(super, bmap);
72856 +       assert("zam-470", bnode != NULL);
72857 +
72858 +       ret = load_and_lock_bnode(bnode);
72859 +       assert("zam-481", ret == 0);
72860 +       /* on failure the bitmap is neither loaded nor locked: touch nothing (the blocks stay marked busy) */
72861 +       if (ret != 0)
72862 +               return;
72863 +
72864 +       reiser4_clear_bits(bnode_working_data(bnode), offset, (bmap_off_t) (offset + len));
72865 +       adjust_first_zero_bit(bnode, offset); /* the freed range may now hold the first clear bit */
72866 +
72867 +       release_and_unlock_bnode(bnode);
72868 +}
72869 +
72870 +
72871 +/* plugin->u.space_allocator.check_blocks(): debug-only check that the range is all set (@desired) or all clear. */
72872 +reiser4_internal void
72873 +check_blocks_bitmap(const reiser4_block_nr * start, const reiser4_block_nr * len, int desired)
72874 +{
72875 +#if REISER4_DEBUG
72876 +       struct super_block *super = reiser4_get_current_sb();
72877 +
72878 +       bmap_nr_t bmap;
72879 +       bmap_off_t start_offset;
72880 +       bmap_off_t end_offset;
72881 +
72882 +       struct bitmap_node *bnode;
72883 +       int ret;
72884 +
72885 +       assert("zam-622", len != NULL);
72886 +       check_block_range(start, len);
72887 +       parse_blocknr(start, &bmap, &start_offset);
72888 +       /* end_offset is the bit offset just past the checked range */
72889 +       end_offset = start_offset + *len;
72890 +       assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
72891 +
72892 +       bnode = get_bnode(super, bmap);
72893 +
72894 +       assert("nikita-2215", bnode != NULL);
72895 +
72896 +       ret = load_and_lock_bnode(bnode);
72897 +       assert("zam-626", ret == 0);
72898 +
72899 +       assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
72900 +       /* @desired != 0: no zero bit may be found before end_offset; otherwise no set bit may be found */
72901 +       if (desired) {
72902 +               assert("zam-623", reiser4_find_next_zero_bit(bnode_working_data(bnode), end_offset, start_offset)
72903 +                      >= end_offset);
72904 +       } else {
72905 +               assert("zam-624", reiser4_find_next_set_bit(bnode_working_data(bnode), end_offset, start_offset)
72906 +                      >= end_offset);
72907 +       }
72908 +
72909 +       release_and_unlock_bnode(bnode);
72910 +#endif
72911 +}
72912 +
72913 +/* put @node into the overwrite set of @atom unless the node is already captured */
72914 +static void
72915 +cond_add_to_overwrite_set (txn_atom * atom, jnode * node)
72916 +{
72917 +       assert("zam-546", atom != NULL);
72918 +       assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
72919 +       assert("zam-548", node != NULL);
72920 +
72921 +       LOCK_ATOM(atom);
72922 +       LOCK_JNODE(node);
72923 +       /* a node that already has an atom is expected to belong to @atom */
72924 +       if (node->atom != NULL) {
72925 +               assert("zam-549", node->atom == atom);
72926 +       } else {
72927 +               JF_SET(node, JNODE_OVRWR);
72928 +               insert_into_atom_ovrwr_list(atom, node);
72929 +       }
72930 +
72931 +       UNLOCK_JNODE(node);
72932 +       UNLOCK_ATOM(atom);
72933 +}
72934 +
72935 +/* blocknr_set_iterator() actor: clears the range (@start, @len) in the COMMIT
72936 +   bitmap and adds its length to the long long counter that @data points to */
72937 +static int
72938 +apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start, const reiser4_block_nr * len, void *data)
72939 +{
72940 +       bmap_nr_t bmap;
72941 +       bmap_off_t offset;
72942 +       int ret;
72943 +       long long *blocks_freed_p = data;
72944 +       struct bitmap_node *bnode;
72945 +       struct super_block *sb = reiser4_get_current_sb();
72946 +
72947 +       check_block_range(start, len);
72948 +
72949 +       parse_blocknr(start, &bmap, &offset);
72950 +
72951 +       /* FIXME-ZAM: we assume that all block ranges are allocated by this
72952 +          bitmap-based allocator and each block range can't go over a zone of
72953 +          responsibility of one bitmap block; same assumption is used in
72954 +          other journal hooks in bitmap code. */
72955 +       bnode = get_bnode(sb, bmap);
72956 +       assert("zam-448", bnode != NULL);
72957 +
72958 +       /* it is safe to unlock atom which is in ASTAGE_PRE_COMMIT */
72959 +       assert ("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
72960 +       ret = load_and_lock_bnode(bnode);
72961 +       if (ret)
72962 +               return ret;
72963 +
72964 +       /* put bnode into atom's overwrite set */
72965 +       cond_add_to_overwrite_set (atom, bnode->cjnode);
72966 +
72967 +       data = bnode_commit_data(bnode);
72968 +
72969 +       ret = bnode_check_crc(bnode);
72970 +       if (ret != 0) {
72971 +               /* the bnode was loaded and locked above: drop it before bailing out */
72972 +               release_and_unlock_bnode(bnode);
72973 +               return ret;
72974 +       }
72975 +
72976 +       if (len != NULL) {
72977 +               /* FIXME-ZAM: a check that all bits are set should be there */
72978 +               assert("zam-443", offset + *len <= bmap_bit_count(sb->s_blocksize));
72979 +               reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
72980 +
72981 +               (*blocks_freed_p) += *len;
72982 +       } else {
72983 +               /* a NULL @len stands for the single block at @start */
72984 +               reiser4_clear_bit(offset, data);
72985 +               (*blocks_freed_p)++;
72986 +       }
72987 +
72988 +       bnode_set_commit_crc(bnode, bnode_calc_crc(bnode));
72989 +
72990 +       release_and_unlock_bnode(bnode);
72991 +
72992 +       return 0;
72993 +}
72994 +
72995 +/* plugin->u.space_allocator.pre_commit_hook(). */
72996 +/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the
72997 +   rest is done by transaction manager (allocate wandered locations for COMMIT
72998 +   BITMAP blocks, copy COMMIT BITMAP blocks data). */
72999 +/* Only one instance of this function can be running at one given time, because
73000 +   only one transaction can be committed a time, therefore it is safe to access
73001 +   some global variables without any locking */
73002 +
73003 +#if REISER4_COPY_ON_CAPTURE
73004 +
73005 +extern spinlock_t scan_lock;
73006 +
73007 +reiser4_internal int
73008 +pre_commit_hook_bitmap(void)
73009 +{
73010 +       struct super_block * super = reiser4_get_current_sb();
73011 +       txn_atom *atom;
73012 +
73013 +       long long blocks_freed = 0;
73014 +
73015 +       atom = get_current_atom_locked ();
73016 +       BUG_ON(atom->stage != ASTAGE_PRE_COMMIT);
73017 +       assert ("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
73018 +       spin_unlock_atom(atom);
73019 +
73020 +       /* returns 0, or the first error reported by bnode_check_crc() or
73021 +          load_and_lock_bnode(); the rest of the scan is then abandoned */
73022 +       {                       /* scan atom's captured list and find all freshly allocated nodes,
73023 +                                * mark corresponded bits in COMMIT BITMAP as used */
73024 +               /* how cpu significant is this scan, should we someday have a freshly_allocated list? -Hans */
73025 +               capture_list_head *head = ATOM_CLEAN_LIST(atom);
73026 +               jnode *node;
73027 +
73028 +               spin_lock(&scan_lock);
73029 +               node = capture_list_front(head);
73030 +
73031 +               while (!capture_list_end(head, node)) {
73032 +                       int ret;
73033 +
73034 +                       assert("vs-1445", NODE_LIST(node) == CLEAN_LIST);
73035 +                       BUG_ON(node->atom != atom);
73036 +                       JF_SET(node, JNODE_SCANNED);
73037 +                       spin_unlock(&scan_lock);
73038 +                       /* NOTE(review): the early returns below leave JNODE_SCANNED set on @node */
73039 +                       /* we detect freshly allocated jnodes */
73040 +                       if (JF_ISSET(node, JNODE_RELOC)) {
73041 +                               bmap_nr_t bmap;
73042 +
73043 +                               bmap_off_t offset;
73044 +                               bmap_off_t index;
73045 +                               struct bitmap_node *bn;
73046 +                               __u32 size = bmap_size(super->s_blocksize);
73047 +                               char byte;
73048 +                               __u32 crc;
73049 +
73050 +                               assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
73051 +                               assert("zam-460", !blocknr_is_fake(&node->blocknr));
73052 +
73053 +                               parse_blocknr(&node->blocknr, &bmap, &offset);
73054 +                               bn = get_bnode(super, bmap);
73055 +
73056 +                               index = offset >> 3;
73057 +                               assert("vpf-276", index < size);
73058 +
73059 +                               ret = bnode_check_crc(bn);
73060 +                               if (ret != 0)
73061 +                                       return ret;
73062 +
73063 +                               check_bnode_loaded(bn);
73064 +                               ret = load_and_lock_bnode(bn);
73065 +                               if (ret != 0)
73066 +                                       return ret;
73067 +                               byte = *(bnode_commit_data(bn) + index);
73068 +                               reiser4_set_bit(offset, bnode_commit_data(bn));
73069 +                               /* fold the one changed byte (old value, new value) into the stored checksum */
73070 +                               crc = adler32_recalc(bnode_commit_crc(bn), byte,
73071 +                                                    *(bnode_commit_data(bn) + index),
73072 +                                                    size - index);
73073 +
73074 +                               bnode_set_commit_crc(bn, crc);
73075 +
73076 +                               release_and_unlock_bnode(bn);
73077 +
73078 +                               ret = bnode_check_crc(bn);
73079 +                               if (ret != 0)
73080 +                                       return ret;
73081 +
73082 +                               /* working of this depends on how it inserts
73083 +                                  new j-node into clean list, because we are
73084 +                                  scanning the same list now. It is OK, if
73085 +                                  insertion is done to the list front */
73086 +                               cond_add_to_overwrite_set (atom, bn->cjnode);
73087 +                       }
73088 +
73089 +                       spin_lock(&scan_lock);
73090 +                       JF_CLR(node, JNODE_SCANNED);
73091 +                       node = capture_list_next(node);
73092 +               }
73093 +               spin_unlock(&scan_lock);
73094 +       }
73095 +
73096 +       blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap, &blocks_freed, 0);
73097 +
73098 +       blocks_freed -= atom->nr_blocks_allocated;
73099 +
73100 +       {
73101 +               reiser4_super_info_data *sbinfo;
73102 +
73103 +               sbinfo = get_super_private(super);
73104 +
73105 +               reiser4_spin_lock_sb(sbinfo);
73106 +               sbinfo->blocks_free_committed += blocks_freed;
73107 +               reiser4_spin_unlock_sb(sbinfo);
73108 +       }
73109 +
73110 +       return 0;
73111 +}
73112 +
73113 +#else /* ! REISER4_COPY_ON_CAPTURE */
73114 +
73115 +reiser4_internal int
73116 +pre_commit_hook_bitmap(void)
73117 +{
73118 +       struct super_block * super = reiser4_get_current_sb();
73119 +       txn_atom *atom;
73120 +
73121 +       long long blocks_freed = 0;
73122 +
73123 +       atom = get_current_atom_locked ();
73124 +       assert ("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
73125 +       spin_unlock_atom(atom);
73126 +       /* returns 0, or the first bnode_check_crc()/load_and_lock_bnode() error (the scan then stops) */
73127 +       {                       /* scan atom's captured list and find all freshly allocated nodes,
73128 +                                * mark corresponded bits in COMMIT BITMAP as used */
73129 +               capture_list_head *head = ATOM_CLEAN_LIST(atom);
73130 +               jnode *node = capture_list_front(head);
73131 +
73132 +               while (!capture_list_end(head, node)) {
73133 +                       /* we detect freshly allocated jnodes */
73134 +                       if (JF_ISSET(node, JNODE_RELOC)) {
73135 +                               int ret;
73136 +                               bmap_nr_t bmap;
73137 +
73138 +                               bmap_off_t offset;
73139 +                               bmap_off_t index;
73140 +                               struct bitmap_node *bn;
73141 +                               __u32 size = bmap_size(super->s_blocksize);
73142 +                               __u32 crc;
73143 +                               char byte;
73144 +
73145 +                               assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
73146 +                               assert("zam-460", !blocknr_is_fake(&node->blocknr));
73147 +
73148 +                               parse_blocknr(&node->blocknr, &bmap, &offset);
73149 +                               bn = get_bnode(super, bmap);
73150 +
73151 +                               index = offset >> 3;
73152 +                               assert("vpf-276", index < size);
73153 +
73154 +                               ret = bnode_check_crc(bn);
73155 +                               if (ret != 0)
73156 +                                       return ret;
73157 +
73158 +                               check_bnode_loaded(bn);
73159 +                               ret = load_and_lock_bnode(bn);
73160 +                               if (ret != 0)
73161 +                                       return ret;
73162 +                               byte = *(bnode_commit_data(bn) + index);
73163 +                               reiser4_set_bit(offset, bnode_commit_data(bn));
73164 +                               /* fold the one changed byte (old value, new value) into the stored checksum */
73165 +                               crc = adler32_recalc(bnode_commit_crc(bn), byte,
73166 +                                                    *(bnode_commit_data(bn) + index),
73167 +                                                    size - index);
73168 +
73169 +                               bnode_set_commit_crc(bn, crc);
73170 +
73171 +                               release_and_unlock_bnode(bn);
73172 +
73173 +                               ret = bnode_check_crc(bn);
73174 +                               if (ret != 0)
73175 +                                       return ret;
73176 +
73177 +                               /* working of this depends on how it inserts
73178 +                                  new j-node into clean list, because we are
73179 +                                  scanning the same list now. It is OK, if
73180 +                                  insertion is done to the list front */
73181 +                               cond_add_to_overwrite_set (atom, bn->cjnode);
73182 +                       }
73183 +
73184 +                       node = capture_list_next(node);
73185 +               }
73186 +       }
73187 +
73188 +       blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap, &blocks_freed, 0);
73189 +
73190 +       blocks_freed -= atom->nr_blocks_allocated;
73191 +
73192 +       {
73193 +               reiser4_super_info_data *sbinfo;
73194 +
73195 +               sbinfo = get_super_private(super);
73196 +
73197 +               reiser4_spin_lock_sb(sbinfo);
73198 +               sbinfo->blocks_free_committed += blocks_freed;
73199 +               reiser4_spin_unlock_sb(sbinfo);
73200 +       }
73201 +
73202 +       return 0;
73203 +}
73204 +#endif /* ! REISER4_COPY_ON_CAPTURE */
73205 +
73206 +/* plugin->u.space_allocator.init_allocator: constructor of reiser4_space_allocator
73207 +    object, called on fs mount. Returns 0, -ENOMEM, or a load_and_lock_bnode() error */
73208 +reiser4_internal int
73209 +init_allocator_bitmap(reiser4_space_allocator * allocator, struct super_block *super, void *arg UNUSED_ARG)
73210 +{
73211 +       struct bitmap_allocator_data *data = NULL;
73212 +       bmap_nr_t bitmap_blocks_nr;
73213 +       bmap_nr_t i;
73214 +
73215 +       assert("nikita-3039", schedulable());
73216 +
73217 +       /* getting memory for bitmap allocator private data holder */
73218 +       data = reiser4_kmalloc(sizeof (struct bitmap_allocator_data), GFP_KERNEL);
73219 +
73220 +       if (data == NULL)
73221 +               return RETERR(-ENOMEM);
73222 +
73223 +       /* allocation and initialization for the array of bnodes */
73224 +       bitmap_blocks_nr = get_nr_bmap(super);
73225 +
73226 +       /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps
73227 +          which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17,
73228 +          may I never meet someone who still uses the ia32 architecture when
73229 +          storage devices of that size enter the market, and wants to use ia32
73230 +          with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and,
73231 +          probably, another dynamic data structure should replace a static
73232 +          array of bnodes. */
73233 +       /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL);*/
73234 +       data->bitmap = vmalloc(sizeof (struct bitmap_node) * bitmap_blocks_nr);
73235 +       if (data->bitmap == NULL) {
73236 +               reiser4_kfree(data);
73237 +               return RETERR(-ENOMEM);
73238 +       }
73239 +
73240 +       for (i = 0; i < bitmap_blocks_nr; i++)
73241 +               init_bnode(data->bitmap + i, super, i);
73242 +       /* publish the private data; destroy_allocator_bitmap() reads it back from u.generic */
73243 +       allocator->u.generic = data;
73244 +
73245 +#if REISER4_DEBUG
73246 +       get_super_private(super)->min_blocks_used += bitmap_blocks_nr;
73247 +#endif
73248 +
73249 +       /* Load all bitmap blocks at mount time. */
73250 +       if (!test_bit(REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) {
73251 +               __u64 start_time, elapsed_time;
73252 +               struct bitmap_node * bnode;
73253 +               int ret;
73254 +
73255 +               if (REISER4_DEBUG)
73256 +                       printk(KERN_INFO "loading reiser4 bitmap...");
73257 +               start_time = jiffies;
73258 +               /* load and release every bnode once; the first failure tears the allocator down again */
73259 +               for (i = 0; i < bitmap_blocks_nr; i++) {
73260 +                       bnode = data->bitmap + i;
73261 +                       ret = load_and_lock_bnode(bnode);
73262 +                       if (ret) {
73263 +                               destroy_allocator_bitmap(allocator, super);
73264 +                               return ret;
73265 +                       }
73266 +                       release_and_unlock_bnode(bnode);
73267 +               }
73268 +
73269 +               elapsed_time = jiffies - start_time;
73270 +               if (REISER4_DEBUG)
73271 +                       printk("...done (%llu jiffies)\n",
73272 +                              (unsigned long long)elapsed_time);
73273 +       }
73274 +
73275 +       return 0;
73276 +}
73277 +
73278 +/* plugin->u.space_allocator.destroy_allocator: destructor, called on fs unmount.
73279 +   Finishes every bnode, frees the bnode array and the private data; returns 0 */
73280 +reiser4_internal int
73281 +destroy_allocator_bitmap(reiser4_space_allocator * allocator, struct super_block *super)
73282 +{
73283 +       bmap_nr_t bitmap_blocks_nr;
73284 +       bmap_nr_t i;
73285 +
73286 +       struct bitmap_allocator_data *data = allocator->u.generic;
73287 +
73288 +       assert("zam-414", data != NULL);
73289 +       assert("zam-376", data->bitmap != NULL);
73290 +
73291 +       bitmap_blocks_nr = get_nr_bmap(super);
73292 +
73293 +       for (i = 0; i < bitmap_blocks_nr; i++) {
73294 +               struct bitmap_node *bnode = data->bitmap + i;
73295 +
73296 +               down(&bnode->sema);
73297 +
73298 +#if REISER4_DEBUG
73299 +               if (atomic_read(&bnode->loaded)) {
73300 +                       jnode *wj = bnode->wjnode;
73301 +                       jnode *cj = bnode->cjnode;
73302 +
73303 +                       assert("zam-480", jnode_page(cj) != NULL);
73304 +                       assert("zam-633", jnode_page(wj) != NULL);
73305 +                       /* the working bitmap has to match the commit bitmap (not itself) */
73306 +                       assert("zam-634",
73307 +                              memcmp(jdata(wj), jdata(cj),
73308 +                                     bmap_size(super->s_blocksize)) == 0);
73309 +
73310 +               }
73311 +#endif
73312 +               done_bnode(bnode);
73313 +               up(&bnode->sema);
73314 +       }
73315 +
73316 +       /*reiser4_kfree(data->bitmap);*/
73317 +       vfree(data->bitmap);
73318 +       reiser4_kfree(data);
73319 +
73320 +       allocator->u.generic = NULL;
73321 +
73322 +       return 0;
73323 +}
73324 +
73325 +/*
73326 +   Local variables:
73327 +   c-indentation-style: "K&R"
73328 +   mode-name: "LC"
73329 +   c-basic-offset: 8
73330 +   tab-width: 8
73331 +   fill-column: 80
73332 +   scroll-step: 1
73333 +   End:
73334 +*/
73335 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/space/bitmap.h linux-2.6.8-rc3-a/fs/reiser4/plugin/space/bitmap.h
73336 --- linux-2.6.8-rc3/fs/reiser4/plugin/space/bitmap.h    1970-01-01 03:00:00.000000000 +0300
73337 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/space/bitmap.h  2004-08-05 21:20:53.330608341 +0400
73338 @@ -0,0 +1,44 @@
73339 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
73340 +
73341 +#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
73342 +#define __REISER4_PLUGIN_SPACE_BITMAP_H__
73343 +
73344 +#include "../../dformat.h"
73345 +#include "../../block_alloc.h"
73346 +
73347 +#include <linux/types.h>       /* for __u??  */
73348 +#include <linux/fs.h>          /* for struct super_block  */
73349 +/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
73350 +/* declarations of functions implementing methods of space allocator plugin for
73351 +   bitmap based allocator. The functions themselves are in bitmap.c */
73352 +extern int init_allocator_bitmap(reiser4_space_allocator *, struct super_block *, void *);
73353 +extern int destroy_allocator_bitmap(reiser4_space_allocator *, struct super_block *);
73354 +extern int alloc_blocks_bitmap(reiser4_space_allocator *,
73355 +                              reiser4_blocknr_hint *, int needed, reiser4_block_nr * start, reiser4_block_nr * len);
73356 +extern void check_blocks_bitmap(const reiser4_block_nr *, const reiser4_block_nr *, int);
73357 +
73358 +extern void dealloc_blocks_bitmap(reiser4_space_allocator *, reiser4_block_nr, reiser4_block_nr);
73359 +extern int pre_commit_hook_bitmap(void);
73360 +
73361 +#define post_commit_hook_bitmap() do{}while(0)
73362 +#define post_write_back_hook_bitmap() do{}while(0)
73363 +#define print_info_bitmap(pref, al) do{}while(0)
73364 +
73365 +typedef __u64 bmap_nr_t;
73366 +typedef __u32 bmap_off_t;
73367 +
73368 +/* exported for user-level simulator */
73369 +extern void get_bitmap_blocknr(struct super_block *, bmap_nr_t, reiser4_block_nr *);
73370 +
73371 +#endif                         /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
73372 +
73373 +/* Make Linus happy.
73374 +   Local variables:
73375 +   c-indentation-style: "K&R"
73376 +   mode-name: "LC"
73377 +   c-basic-offset: 8
73378 +   tab-width: 8
73379 +   fill-column: 120
73380 +   scroll-step: 1
73381 +   End:
73382 +*/
73383 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/space/space_allocator.h linux-2.6.8-rc3-a/fs/reiser4/plugin/space/space_allocator.h
73384 --- linux-2.6.8-rc3/fs/reiser4/plugin/space/space_allocator.h   1970-01-01 03:00:00.000000000 +0300
73385 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/space/space_allocator.h 2004-08-05 21:20:52.972683836 +0400
73386 @@ -0,0 +1,80 @@
73387 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
73388 +
73389 +#ifndef __SPACE_ALLOCATOR_H__
73390 +#define __SPACE_ALLOCATOR_H__
73391 +
73392 +#include "../../forward.h"
73393 +#include "bitmap.h"
73394 +/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now,
73395 + * but... DEF_SPACE_ALLOCATOR(x) emits static inline sa_*() wrappers, each forwarding to the matching *_x() function. */
73396 +#define DEF_SPACE_ALLOCATOR(allocator)                                                                                 \
73397 +                                                                                                                       \
73398 +static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque)               \
73399 +{                                                                                                                      \
73400 +       return init_allocator_##allocator (al, s, opaque);                                                              \
73401 +}                                                                                                                      \
73402 +                                                                                                                       \
73403 +static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s)                           \
73404 +{                                                                                                                      \
73405 +       destroy_allocator_##allocator (al, s);                                                                          \
73406 +}                                                                                                                      \
73407 +                                                                                                                       \
73408 +static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint,                           \
73409 +                                  int needed, reiser4_block_nr * start, reiser4_block_nr * len)                        \
73410 +{                                                                                                                      \
73411 +       return alloc_blocks_##allocator (al, hint, needed, start, len);                                                 \
73412 +}                                                                                                                      \
73413 +static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len)      \
73414 +{                                                                                                                      \
73415 +       dealloc_blocks_##allocator (al, start, len);                                                                    \
73416 +}                                                                                                                      \
73417 +                                                                                                                       \
73418 +static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired)                 \
73419 +{                                                                                                                      \
73420 +       check_blocks_##allocator (start, end, desired);                                                                 \
73421 +}                                                                                                                      \
73422 +                                                                                                                       \
73423 +static inline void sa_pre_commit_hook (void)                                                                           \
73424 +{                                                                                                                      \
73425 +       pre_commit_hook_##allocator ();                                                                                 \
73426 +}                                                                                                                      \
73427 +                                                                                                                       \
73428 +static inline void sa_post_commit_hook (void)                                                                          \
73429 +{                                                                                                                      \
73430 +       post_commit_hook_##allocator ();                                                                                \
73431 +}                                                                                                                      \
73432 +                                                                                                                       \
73433 +static inline void sa_post_write_back_hook (void)                                                                      \
73434 +{                                                                                                                      \
73435 +       post_write_back_hook_##allocator();                                                                             \
73436 +}                                                                                                                      \
73437 +                                                                                                                       \
73438 +static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al)                                    \
73439 +{                                                                                                                      \
73440 +       print_info_##allocator (prefix, al);                                                                            \
73441 +}
73442 +/* bitmap instantiation; NOTE(review): sa_pre_commit_hook() is void, so the int result of pre_commit_hook_bitmap() is dropped */
73443 +DEF_SPACE_ALLOCATOR(bitmap)
73444 +
73445 +/* this object is part of reiser4 private in-core super block; allocator-private state is reached through u.generic */
73446 +struct reiser4_space_allocator {
73447 +       union {
73448 +               /* space allocators might use this pointer to reference their
73449 +                * data. */
73450 +               void *generic;
73451 +       } u;
73452 +};
73453 +
73454 +/* __SPACE_ALLOCATOR_H__ */
73455 +#endif
73456 +
73457 +/* Make Linus happy.
73458 +   Local variables:
73459 +   c-indentation-style: "K&R"
73460 +   mode-name: "LC"
73461 +   c-basic-offset: 8
73462 +   tab-width: 8
73463 +   fill-column: 120
73464 +   scroll-step: 1
73465 +   End:
73466 +*/
73467 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/symlink.c linux-2.6.8-rc3-a/fs/reiser4/plugin/symlink.c
73468 --- linux-2.6.8-rc3/fs/reiser4/plugin/symlink.c 1970-01-01 03:00:00.000000000 +0300
73469 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/symlink.c       2004-08-05 21:20:52.910696911 +0400
73470 @@ -0,0 +1,85 @@
73471 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
73472 +
73473 +#include "../forward.h"
73474 +#include "../debug.h"
73475 +#include "item/static_stat.h"
73476 +#include "plugin.h"
73477 +#include "../tree.h"
73478 +#include "../vfs_ops.h"
73479 +#include "../inode.h"
73480 +#include "object.h"
73481 +
73482 +#include <linux/types.h>
73483 +#include <linux/fs.h>          /* for struct inode */
73484 +
73485 +/* symlink plugin's specific functions */
73486 +
73487 +reiser4_internal int
73488 +create_symlink(struct inode *symlink,  /* inode of symlink */
73489 +              struct inode *dir UNUSED_ARG,    /* parent directory */
73490 +              reiser4_object_create_data * data        /* info passed
73491 +                                                          * to us, this
73492 +                                                          * is filled by
73493 +                                                          * reiser4()
73494 +                                                          * syscall in
73495 +                                                          * particular */ )
73496 +{
73497 +       int result;
73498 +       /* returns the result of write_sd_by_inode_common(); i_size is reset to 0 when that fails */
73499 +       assert("nikita-680", symlink != NULL);
73500 +       assert("nikita-681", S_ISLNK(symlink->i_mode));
73501 +       assert("nikita-685", inode_get_flag(symlink, REISER4_NO_SD));
73502 +       assert("nikita-682", dir != NULL);
73503 +       assert("nikita-684", data != NULL);
73504 +       assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
73505 +
73506 +       /*
73507 +        * stat data of symlink has symlink extension in which we store
73508 +        * symlink content, that is, path symlink is pointing to.
73509 +        */
73510 +       reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
73511 +       /* generic_ip points at the caller's data->name from here on; see FIXME-VS for the failure path */
73512 +       assert("vs-838", symlink->u.generic_ip == 0);
73513 +       symlink->u.generic_ip = (void *) data->name;
73514 +       /* the symlink size is the length of the target path, without the terminating NUL */
73515 +       assert("vs-843", symlink->i_size == 0);
73516 +       INODE_SET_FIELD(symlink, i_size, strlen(data->name));
73517 +
73518 +       /* insert stat data appended with data->name */
73519 +       result = write_sd_by_inode_common(symlink);
73520 +       if (result) {
73521 +               /* FIXME-VS: Make sure that symlink->u.generic_ip is not attached
73522 +                  to kmalloced data */
73523 +               INODE_SET_FIELD(symlink, i_size, 0);
73524 +       } else {
73525 +               assert("vs-849", symlink->u.generic_ip && inode_get_flag(symlink, REISER4_GENERIC_PTR_USED));
73526 +               assert("vs-850", !memcmp((char *) symlink->u.generic_ip, data->name, (size_t) symlink->i_size + 1));
73527 +       }
73528 +       return result;
73529 +}
73530 +
73531 +/* plugin->destroy_inode(): release the buffer hanging off inode->u.generic_ip and clear REISER4_GENERIC_PTR_USED */
73532 +reiser4_internal void
73533 +destroy_inode_symlink(struct inode * inode)
73534 +{
73535 +       assert("edward-799", inode_file_plugin(inode) == file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
73536 +       assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
73537 +       assert("edward-801", inode_get_flag(inode, REISER4_GENERIC_PTR_USED));
73538 +       assert("vs-839", S_ISLNK(inode->i_mode));
73539 +
73540 +       reiser4_kfree_in_sb(inode->u.generic_ip, inode->i_sb);
73541 +       inode->u.generic_ip = 0;        /* do not leave a pointer to freed memory behind */
73542 +       inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
73543 +}
73544 +
73545 +/* Make Linus happy.
73546 +   Local variables:
73547 +   c-indentation-style: "K&R"
73548 +   mode-name: "LC"
73549 +   c-basic-offset: 8
73550 +   tab-width: 8
73551 +   fill-column: 120
73552 +   scroll-step: 1
73553 +   End:
73554 +*/
73555 +
73556 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/symlink.h linux-2.6.8-rc3-a/fs/reiser4/plugin/symlink.h
73557 --- linux-2.6.8-rc3/fs/reiser4/plugin/symlink.h 1970-01-01 03:00:00.000000000 +0300
73558 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/symlink.h       2004-08-05 21:20:53.294615933 +0400
73559 @@ -0,0 +1,24 @@
73560 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
73561 +
73562 +#if !defined( __REISER4_SYMLINK_H__ )
73563 +#define __REISER4_SYMLINK_H__
73564 +
73565 +#include "../forward.h"
73566 +#include <linux/fs.h>          /* for struct inode */
73567 +/* entry points of the symlink file plugin */
73568 +int create_symlink(struct inode *symlink, struct inode *dir, reiser4_object_create_data * data);
73569 +void destroy_inode_symlink(struct inode * inode);
73570 +
73571 +/* __REISER4_SYMLINK_H__ */
73572 +#endif
73573 +
73574 +/* Make Linus happy.
73575 +   Local variables:
73576 +   c-indentation-style: "K&R"
73577 +   mode-name: "LC"
73578 +   c-basic-offset: 8
73579 +   tab-width: 8
73580 +   fill-column: 120
73581 +   scroll-step: 1
73582 +   End:
73583 +*/
73584 diff -rupN linux-2.6.8-rc3/fs/reiser4/plugin/tail_policy.c linux-2.6.8-rc3-a/fs/reiser4/plugin/tail_policy.c
73585 --- linux-2.6.8-rc3/fs/reiser4/plugin/tail_policy.c     1970-01-01 03:00:00.000000000 +0300
73586 +++ linux-2.6.8-rc3-a/fs/reiser4/plugin/tail_policy.c   2004-08-05 21:20:53.334607498 +0400
73587 @@ -0,0 +1,109 @@
73588 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
73589 + * reiser4/README */
73590 +
73591 +/* Tail policy plugins */
73592 +
73593 +/* Tail policy is used by object plugin (of regular file) to convert file
73594 +   between two representations. TO BE CONTINUED.
73595 +NIKITA-FIXME-HANS: the "TO BE CONTINUED" means what?
73596 +GREV-FIXME-HANS: why the references to tails above?  fix comments and website.... tail implies it is less than the whole file that is formatted, and it is not.... not in v4....
73597 +
73598 +   Currently the following policies are implemented:
73599 +
73600 +       never tail
73601 +
73602 +       always tail
73603 +
73604 +       only tail if file is smaller than 4 blocks (default).
73605 +*/
73606 +
73607 +#include "../tree.h"
73608 +#include "../inode.h"
73609 +#include "../super.h"
73610 +#include "object.h"
73611 +#include "plugin.h"
73612 +#include "node/node.h"
73613 +#include "plugin_header.h"
73614 +#include "../lib.h"
73615 +
73616 +#include <linux/pagemap.h>
73617 +#include <linux/fs.h>          /* For struct inode */
73618 +
73619 +/* "never" policy: never store file's tail as direct item; returns 0 whatever @inode and @size are */
73620 +/* Audited by: green(2002.06.12) */
73621 +static int
73622 +have_formatting_never(const struct inode *inode UNUSED_ARG /* inode to operate on */ ,
73623 +               loff_t size UNUSED_ARG /* new object size */ )
73624 +{
73625 +       return 0;
73626 +}
73627 +
73628 +/* "always" policy: always store file's tail as direct item; returns 1 whatever @inode and @size are */
73629 +/* Audited by: green(2002.06.12) */
73630 +static int
73631 +have_formatting_always(const struct inode *inode UNUSED_ARG    /* inode to operate on */ ,
73632 +                loff_t size UNUSED_ARG /* new object size */ )
73633 +{
73634 +       return 1;
73635 +}
73636 +
73637 +/* Default policy: returns 1 (store as tails) while @size does not exceed four blocks of
73638 +   the file system @inode belongs to, and 0 (store as extents) for anything larger. */
73639 +static int
73640 +have_formatting_default(const struct inode *inode UNUSED_ARG   /* inode to operate on */ ,
73641 +                 loff_t size /* new object size */ )
73642 +{
73643 +       assert("umka-1253", inode != NULL);
73644 +
73645 +       /*
73646 +        * the threshold is counted in blocks of the super block @inode lives on
73647 +        */
73648 +       return (size > inode->i_sb->s_blocksize * 4) ? 0 : 1;
73649 +}
73650 +
73651 +/* tail plugins: table indexed by formatting plugin id; each entry pairs a plugin header with its ->have_tail test */
73652 +formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
73653 +       [NEVER_TAILS_FORMATTING_ID] = {
73654 +               .h = {
73655 +                       .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
73656 +                       .id = NEVER_TAILS_FORMATTING_ID,
73657 +                       .pops = NULL,
73658 +                       .label = "never",
73659 +                       .desc = "Never store file's tail",
73660 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
73661 +               },
73662 +               .have_tail = have_formatting_never      /* always answers 0 */
73663 +       },
73664 +       [ALWAYS_TAILS_FORMATTING_ID] = {
73665 +               .h = {
73666 +                       .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
73667 +                       .id = ALWAYS_TAILS_FORMATTING_ID,
73668 +                       .pops = NULL,
73669 +                       .label = "always",
73670 +                       .desc = "Always store file's tail",
73671 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
73672 +               },
73673 +               .have_tail = have_formatting_always     /* always answers 1 */
73674 +       },
73675 +       [SMALL_FILE_FORMATTING_ID] = {
73676 +               .h = {
73677 +                       .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
73678 +                       .id = SMALL_FILE_FORMATTING_ID,
73679 +                       .pops = NULL,
73680 +                       .label = "4blocks",
73681 +                       .desc = "store files shorter than 4 blocks in tail items",
73682 +                       .linkage = TYPE_SAFE_LIST_LINK_ZERO
73683 +               },
73684 +               .have_tail = have_formatting_default    /* answers 1 unless size exceeds 4 blocks */
73685 +       }
73686 +};
73687 +
73688 +/* Make Linus happy.
73689 +   Local variables:
73690 +   c-indentation-style: "K&R"
73691 +   mode-name: "LC"
73692 +   c-basic-offset: 8
73693 +   tab-width: 8
73694 +   fill-column: 120
73695 +   End:
73696 +*/
73697 diff -rupN linux-2.6.8-rc3/fs/reiser4/pool.c linux-2.6.8-rc3-a/fs/reiser4/pool.c
73698 --- linux-2.6.8-rc3/fs/reiser4/pool.c   1970-01-01 03:00:00.000000000 +0300
73699 +++ linux-2.6.8-rc3-a/fs/reiser4/pool.c 2004-08-05 21:20:52.941690373 +0400
73700 @@ -0,0 +1,232 @@
73701 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
73702 + * reiser4/README */
73703 +
73704 +/* Fast pool allocation.
73705 +
73706 +   There are situations when some sub-system normally asks memory allocator
73707 +   for only few objects, but under some circumstances could require much
73708 +   more. Typical and actually motivating example is tree balancing. It needs
73709 +   to keep track of nodes that were involved into it, and it is well-known
73710 +   that in reasonable packed balanced tree most (92.938121%) percent of all
73711 +   balancings end up after working with only few nodes (3.141592 on
73712 +   average). But in rare cases balancing can involve much more nodes
73713 +   (3*tree_height+1 in extremal situation).
73714 +
73715 +   On the one hand, we don't want to resort to dynamic allocation (slab,
73716 +    malloc(), etc.) to allocate data structures required to keep track of
73717 +   nodes during balancing. On the other hand, we cannot statically allocate
73718 +   required amount of space on the stack, because first: it is useless wastage
73719 +   of precious resource, and second: this amount is unknown in advance (tree
73720 +   height can change).
73721 +
73722 +   Pools, implemented in this file are solution for this problem:
73723 +
73724 +    - some configurable amount of objects is statically preallocated on the
73725 +    stack
73726 +
73727 +    - if this preallocated pool is exhausted and more objects are requested,
73728 +    they are allocated dynamically.
73729 +
73730 +   Pools encapsulate distinction between statically and dynamically allocated
73731 +   objects. Both allocation and recycling look exactly the same.
73732 +
73733 +   To keep track of dynamically allocated objects, pool adds its own linkage
73734 +   to each object.
73735 +
73736 +   NOTE-NIKITA This linkage also contains some balancing-specific data. This
73737 +   is not perfect. On the other hand, balancing is currently the only client
73738 +   of pool code.
73739 +
73740 +   NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
73741 +   functions in the style of tslist/tshash, i.e., make them unreadable, but
73742 +   type-safe.
73743 +
73744 +
73745 +*/
73746 +
73747 +#include "debug.h"
73748 +#include "pool.h"
73749 +#include "super.h"
73750 +
73751 +#include <linux/types.h>
73752 +#include <linux/err.h>
73753 +
73754 +/* initialise new pool object: reset each of its three list linkages (usage, level, extra) with *_list_clean() */
73755 +static void
73756 +reiser4_init_pool_obj(reiser4_pool_header * h  /* pool object to
73757 +                                                * initialise */ )
73758 +{
73759 +       pool_usage_list_clean(h);
73760 +       pool_level_list_clean(h);
73761 +       pool_extra_list_clean(h);
73762 +}
73763 +
73764 +/* initialise new pool: zero @pool and @data, then split @data into @num_of_objs objects and queue them as free */
73765 +reiser4_internal void
73766 +reiser4_init_pool(reiser4_pool * pool /* pool to initialise */ ,
73767 +                 size_t obj_size /* size of objects in @pool */ ,
73768 +                 int num_of_objs /* number of preallocated objects */ ,
73769 +                 char *data /* area for preallocated objects */ )
73770 +{
73771 +       reiser4_pool_header *h;
73772 +       int i;
73773 +
73774 +       assert("nikita-955", pool != NULL);
73775 +       assert("nikita-1044", obj_size > 0);
73776 +       assert("nikita-956", num_of_objs >= 0);
73777 +       assert("nikita-957", data != NULL);
73778 +
73779 +       xmemset(pool, 0, sizeof *pool);
73780 +       pool->obj_size = obj_size;
73781 +       pool->data = data;
73782 +       pool_usage_list_init(&pool->free);
73783 +       pool_usage_list_init(&pool->used);
73784 +       pool_extra_list_init(&pool->extra);
73785 +       xmemset(data, 0, obj_size * num_of_objs);       /* @data is assumed to span obj_size * num_of_objs bytes */
73786 +       for (i = 0; i < num_of_objs; ++i) {
73787 +               h = (reiser4_pool_header *) (data + i * obj_size);      /* i-th slot; its first bytes serve as header */
73788 +               reiser4_init_pool_obj(h);
73789 +               pool_usage_list_push_back(&pool->free, h);
73790 +       }
73791 +}
73792 +
73793 +/* release pool resources
73794 +
73795 +   Intended to release all resources acquired by this pool, specifically,
73796 +   dynamically allocated objects. NOTE: the body is currently empty, so
73797 +   nothing is released here; objects are given back by reiser4_pool_free().
73798 +*/
73799 +reiser4_internal void
73800 +reiser4_done_pool(reiser4_pool * pool UNUSED_ARG /* pool to destroy */ )
73801 +{
73802 +}
73803 +
73804 +/* allocate carry object from pool
73805 +
73806 +   A preallocated object from the free list is preferred; only when that
73807 +   list is empty is a new object kmalloc-ed and linked into ->extra.
73808 +   Returns the object, or ERR_PTR(-ENOMEM) when the allocation fails.
73809 +*/
73810 +reiser4_internal void *
73811 +reiser4_pool_alloc(reiser4_pool * pool /* pool to allocate object
73812 +                                        * from */ )
73813 +{
73814 +       reiser4_pool_header *obj;
73815 +
73816 +       assert("nikita-959", pool != NULL);
73817 +       trace_stamp(TRACE_CARRY);
73818 +       reiser4_stat_inc(pool.alloc);
73819 +
73820 +       if (pool_usage_list_empty(&pool->free)) {
73821 +               reiser4_stat_inc(pool.kmalloc);
73822 +               /* no preallocated object is left. Such extra allocations are
73823 +                  expected to be rare, so they are served by plain kmalloc
73824 +                  rather than from a dedicated slab. */
73825 +               obj = reiser4_kmalloc(pool->obj_size, GFP_KERNEL);
73826 +               if (obj == NULL)
73827 +                       return ERR_PTR(RETERR(-ENOMEM));
73828 +               reiser4_init_pool_obj(obj);
73829 +               pool_extra_list_push_front(&pool->extra, obj);
73830 +       } else {
73831 +               obj = pool_usage_list_pop_front(&pool->free);
73832 +               pool_usage_list_clean(obj);
73833 +               assert("nikita-965", pool_extra_list_is_clean(obj));
73834 +       }
73835 +       ++pool->objs;
73836 +       pool_level_list_clean(obj);
73837 +       pool_usage_list_push_front(&pool->used, obj);
73838 +       xmemset(obj + 1, 0, pool->obj_size - sizeof *obj);
73839 +       return obj;
73840 +}
73841 +
73842 +/* give @h back to @pool: objects linked into ->extra are kfree-d, all others are recycled onto the free list */
73843 +reiser4_internal void
73844 +reiser4_pool_free(reiser4_pool * pool,
73845 +                 reiser4_pool_header * h       /* object that is no
73846 +                                                * longer needed */ )
73847 +{
73848 +       assert("nikita-961", h != NULL);
73849 +       assert("nikita-962", pool != NULL);
73850 +       trace_stamp(TRACE_CARRY);
73851 +
73852 +       pool->objs--;
73853 +       assert("nikita-963", pool->objs >= 0);
73854 +
73855 +       pool_usage_list_remove_clean(h);
73856 +       pool_level_list_remove_clean(h);
73857 +       if (!pool_extra_list_is_clean(h)) {
73858 +               /* linked into ->extra, i.e. obtained from reiser4_kmalloc() */
73859 +               pool_extra_list_remove_clean(h);
73860 +               reiser4_kfree(h);
73861 +       } else
73862 +               pool_usage_list_push_front(&pool->free, h);
73863 +}
73864 +
73865 +/* add new object to the carry level list
73866 +
73867 +   Carry level is FIFO most of the time, but not always. Complications arise
73868 +   when make_space() function tries to go to the left neighbor and thus adds
73869 +   carry node before existing nodes, and also, when updating delimiting keys
73870 +   after moving data between two nodes, we want left node to be locked before
73871 +   right node.
73872 +
73873 +   The latter case is confusing at first glance. The problem is that the COP_UPDATE
73874 +   operation that updates delimiting keys is sometimes called with two nodes
73875 +   (when data are moved between two nodes) and sometimes with only one node
73876 +   (when leftmost item is deleted in a node). In any case operation is
73877 +   supplied with at least node whose left delimiting key is to be updated
73878 +   (that is "right" node).
73879 +
73880 +*/
73881 +reiser4_internal reiser4_pool_header *
73882 +add_obj(reiser4_pool * pool    /* pool from which to
73883 +                                * allocate new object */ ,
73884 +       pool_level_list_head * list     /* list where to add
73885 +                                        * object */ ,
73886 +       pool_ordering order /* where to add */ ,
73887 +       reiser4_pool_header * reference /* after (or
73888 +                                        * before) which
73889 +                                        * existing
73890 +                                        * object to
73891 +                                        * add */ )
73892 +{
73893 +       reiser4_pool_header *result;    /* new object, or the error pointer from reiser4_pool_alloc() */
73894 +
73895 +       assert("nikita-972", pool != NULL);
73896 +
73897 +       trace_stamp(TRACE_CARRY);
73898 +
73899 +       result = reiser4_pool_alloc(pool);
73900 +       if (IS_ERR(result))
73901 +               return result;  /* allocation failure is handed to the caller unchanged */
73902 +
73903 +       assert("nikita-973", result != NULL);
73904 +
73905 +       switch (order) {
73906 +       case POOLO_BEFORE:      /* placed relative to @reference; @list is not consulted */
73907 +               pool_level_list_insert_before(reference, result);
73908 +               break;
73909 +       case POOLO_AFTER:       /* placed relative to @reference; @list is not consulted */
73910 +               pool_level_list_insert_after(reference, result);
73911 +               break;
73912 +       case POOLO_LAST:        /* appended to @list; @reference is not consulted */
73913 +               pool_level_list_push_back(list, result);
73914 +               break;
73915 +       case POOLO_FIRST:       /* prepended to @list; @reference is not consulted */
73916 +               pool_level_list_push_front(list, result);
73917 +               break;
73918 +       default:
73919 +               wrong_return_value("nikita-927", "order");
73920 +       }
73921 +       return result;
73922 +}
73923 +
73924 +/* Make Linus happy.
73925 +   Local variables:
73926 +   c-indentation-style: "K&R"
73927 +   mode-name: "LC"
73928 +   c-basic-offset: 8
73929 +   tab-width: 8
73930 +   fill-column: 120
73931 +   End:
73932 +*/
73933 diff -rupN linux-2.6.8-rc3/fs/reiser4/pool.h linux-2.6.8-rc3-a/fs/reiser4/pool.h
73934 --- linux-2.6.8-rc3/fs/reiser4/pool.h   1970-01-01 03:00:00.000000000 +0300
73935 +++ linux-2.6.8-rc3-a/fs/reiser4/pool.h 2004-08-05 21:20:52.947689108 +0400
73936 @@ -0,0 +1,70 @@
73937 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
73938 +
73939 +/* Fast pool allocation */
73940 +
73941 +#ifndef __REISER4_POOL_H__
73942 +#define __REISER4_POOL_H__
73943 +
73944 +#include "type_safe_list.h"
73945 +#include <linux/types.h>
73946 +
73947 +/* each pool object is either on a "used" or "free" list. */
73948 +TYPE_SAFE_LIST_DECLARE(pool_usage);
73949 +
73950 +/* list of extra pool objects */
73951 +TYPE_SAFE_LIST_DECLARE(pool_extra);
73952 +
73953 +/* list of pool objects on a given level */
73954 +TYPE_SAFE_LIST_DECLARE(pool_level);
73955 +/* pool descriptor: bookkeeping for a set of preallocated objects plus dynamically allocated extra ones */
73956 +typedef struct reiser4_pool {
73957 +       size_t obj_size;        /* size of one object, as passed to reiser4_init_pool() */
73958 +       int objs;               /* incremented by reiser4_pool_alloc(), decremented by reiser4_pool_free() */
73959 +       char *data;             /* caller-supplied area for preallocated objects */
73960 +       pool_usage_list_head free;      /* objects available for allocation */
73961 +       pool_usage_list_head used;      /* objects handed out by reiser4_pool_alloc() */
73962 +       pool_extra_list_head extra;     /* objects obtained from reiser4_kmalloc() when @free was empty */
73963 +} reiser4_pool;
73964 +/* list linkage kept at the start of every pool object; pool.c casts the start of each object to this type */
73965 +typedef struct reiser4_pool_header {
73966 +       /* object is either on free or "used" lists */
73967 +       pool_usage_list_link usage_linkage;
73968 +       pool_level_list_link level_linkage;     /* linkage in a per-level list, see add_obj() */
73969 +       pool_extra_list_link extra_linkage;     /* linked only for dynamically allocated objects */
73970 +} reiser4_pool_header;
73971 +/* position at which add_obj() links a newly allocated object into a level list */
73972 +typedef enum {
73973 +       POOLO_BEFORE,   /* immediately before the reference object */
73974 +       POOLO_AFTER,    /* immediately after the reference object */
73975 +       POOLO_LAST,     /* at the tail of the list */
73976 +       POOLO_FIRST     /* at the head of the list */
73977 +} pool_ordering;
73978 +
73979 +/* each pool object is either on a "used" or "free" list. */
73980 +TYPE_SAFE_LIST_DEFINE(pool_usage, reiser4_pool_header, usage_linkage);
73981 +/* list of extra pool objects */
73982 +TYPE_SAFE_LIST_DEFINE(pool_extra, reiser4_pool_header, extra_linkage);
73983 +/* list of pool objects on a given level */
73984 +TYPE_SAFE_LIST_DEFINE(pool_level, reiser4_pool_header, level_linkage);
73985 +
73986 +/* pool manipulation functions */
73987 +
73988 +extern void reiser4_init_pool(reiser4_pool * pool, size_t obj_size, int num_of_objs, char *data);
73989 +extern void reiser4_done_pool(reiser4_pool * pool);
73990 +extern void *reiser4_pool_alloc(reiser4_pool * pool);
73991 +extern void reiser4_pool_free(reiser4_pool * pool, reiser4_pool_header * h);
73992 +reiser4_pool_header *add_obj(reiser4_pool * pool, pool_level_list_head * list,
73993 +                            pool_ordering order, reiser4_pool_header * reference);
73994 +
73995 +/* __REISER4_POOL_H__ */
73996 +#endif
73997 +
73998 +/* Make Linus happy.
73999 +   Local variables:
74000 +   c-indentation-style: "K&R"
74001 +   mode-name: "LC"
74002 +   c-basic-offset: 8
74003 +   tab-width: 8
74004 +   fill-column: 120
74005 +   End:
74006 +*/
74007 diff -rupN linux-2.6.8-rc3/fs/reiser4/prof.c linux-2.6.8-rc3-a/fs/reiser4/prof.c
74008 --- linux-2.6.8-rc3/fs/reiser4/prof.c   1970-01-01 03:00:00.000000000 +0300
74009 +++ linux-2.6.8-rc3-a/fs/reiser4/prof.c 2004-08-05 21:20:53.181639762 +0400
74010 @@ -0,0 +1,273 @@
74011 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74012 + * reiser4/README */
74013 +
74014 +/* profiling facilities. */
74015 +
74016 +/*
74017 + * This code is used to collect statistics about how many times particular
74018 + * function (or part of function) was called, and how long average call
74019 + * took. In addition (or, in the first place, depending on one's needs), it
74020 + * also keeps track of the call-chain through which the profiled piece of code was
74021 + * entered. The latter is done by having a list of call-chains. Call-chains are
74022 + * obtained by series of calls to __builtin_return_address() (hence, this
74023 + * functionality requires kernel to be compiled with frame pointers). Whenever
74024 + * profiled region is just about to be left, call-chain is constructed and
74025 + * then compared against all chains already in the list. If match is found
74026 + * (cache hit!), its statistics are updated, otherwise (cache miss), entry
74027 + * with the smallest hit count is selected and re-used for the new call-chain.
74028 + *
74029 + * NOTE: this replacement policy has obvious deficiencies: after some time
74030 + * entries in the list accumulate high hit counts and will effectively prevent
74031 + * any new call-chain from finding a place in the list, even if this
74032 + * call-chain is frequently activated. Probably LRU should be used instead
74033 + * (this is not that hard, /proc/<pid>/sleep patch does this), but nobody
74034 + * complained so far.
74035 + *
74036 + */
74037 +
74038 +
74039 +#include "kattr.h"
74040 +#include "reiser4.h"
74041 +#include "context.h"
74042 +#include "super.h"
74043 +#include "prof.h"
74044 +
74045 +#include <linux/sysfs.h>
74046 +#include <linux/slab.h>
74047 +#include <linux/types.h>
74048 +#include <linux/fs.h>
74049 +#include <linux/spinlock.h>
74050 +#include <linux/kallsyms.h>
74051 +
74052 +#if REISER4_PROF
74053 +/* record current call-chain in @cnt->bt[]: bump the slot with the same hash, else overwrite the least-hit slot */
74054 +#ifdef CONFIG_FRAME_POINTER
74055 +static void
74056 +update_prof_trace(reiser4_prof_cnt *cnt, int depth, int shift)
74057 +{
74058 +       int i;
74059 +       int minind;     /* index of the slot with the fewest hits seen so far */
74060 +       __u64 minhit;   /* hit count of that slot */
74061 +       unsigned long hash;
74062 +       backtrace_path bt;
74063 +
74064 +       fill_backtrace(&bt, depth, shift);
74065 +
74066 +       for (i = 0, hash = 0 ; i < REISER4_BACKTRACE_DEPTH ; ++ i) {
74067 +               hash += (unsigned long)bt.trace[i];     /* hash is the plain sum of the trace entries */
74068 +       }
74069 +       minhit = ~0ull;
74070 +       minind = 0;
74071 +       for (i = 0 ; i < REISER4_PROF_TRACE_NUM ; ++ i) {
74072 +               if (hash == cnt->bt[i].hash) {  /* chains are compared by hash only, not entry by entry */
74073 +                       ++ cnt->bt[i].hits;
74074 +                       return;
74075 +               }
74076 +               if (cnt->bt[i].hits < minhit) {
74077 +                       minhit = cnt->bt[i].hits;
74078 +                       minind = i;
74079 +               }
74080 +       }
74081 +       cnt->bt[minind].path = bt;      /* miss: the victim slot starts over with a single hit */
74082 +       cnt->bt[minind].hash = hash;
74083 +       cnt->bt[minind].hits = 1;
74084 +}
74085 +#else
74086 +#define update_prof_trace(cnt, depth, shift) noop
74087 +#endif
74088 +/* account one pass through a profiled region that started at stamp @then and ended at @now; @start_jif is unused */
74089 +void update_prof_cnt(reiser4_prof_cnt *cnt, __u64 then, __u64 now,
74090 +                    unsigned long swtch_mark, __u64 start_jif,
74091 +                    int depth, int shift)
74092 +{
74093 +       __u64 delta;    /* duration of this pass, in the units of @then and @now */
74094 +
74095 +       delta = now - then;
74096 +       cnt->nr ++;
74097 +       cnt->total += delta;
74098 +       cnt->max = max(cnt->max, delta);
74099 +       if (swtch_mark == nr_context_switches()) {      /* counter did not move since @swtch_mark was sampled */
74100 +               cnt->noswtch_nr ++;
74101 +               cnt->noswtch_total += delta;
74102 +               cnt->noswtch_max = max(cnt->noswtch_max, delta);
74103 +       }
74104 +       update_prof_trace(cnt, depth, shift);
74105 +}
74106 +/* sysfs attribute bundled with storage for its name; one such attribute exists per back-trace slot */
74107 +struct prof_attr_entry {
74108 +       struct attribute attr;  /* keep first: show_prof_attr() casts struct attribute * to this structure */
74109 +       char name[10];          /* decimal slot index, filled in by init_prof_kobject() */
74110 +};
74111 +
74112 +static struct prof_attr_entry prof_attr[REISER4_PROF_TRACE_NUM];
74113 +/* sysfs ->show(): print the counters of the prof entry owning @kobj and, if recorded, the back-trace slot of @attr */
74114 +static ssize_t
74115 +show_prof_attr(struct kobject *kobj, struct attribute *attr, char *buf)
74116 +{
74117 +       char *p;
74118 +       reiser4_prof_entry *entry;
74119 +       reiser4_prof_cnt   *val;
74120 +#ifdef CONFIG_FRAME_POINTER
74121 +       int pos;
74122 +       int j;
74123 +
74124 +       pos = ((struct prof_attr_entry *)attr) - prof_attr;     /* index of @attr in prof_attr[] == slot in bt[] */
74125 +#endif
74126 +       entry = container_of(kobj, reiser4_prof_entry, kobj);
74127 +       val = &entry->cnt;
74128 +       p = buf;
74129 +       KATTR_PRINT(p, buf, "%llu %llu %llu %llu %llu %llu\n",
74130 +                   val->nr, val->total, val->max,
74131 +                   val->noswtch_nr, val->noswtch_total, val->noswtch_max);
74132 +#ifdef CONFIG_FRAME_POINTER
74133 +       if (val->bt[pos].hash != 0) {   /* zero hash is taken to mean that the slot was never filled */
74134 +               KATTR_PRINT(p, buf, "\t%llu: ", val->bt[pos].hits);
74135 +               for (j = 0 ; j < REISER4_BACKTRACE_DEPTH ; ++ j) {
74136 +                       char         *module;
74137 +                       const char   *name;
74138 +                       char          namebuf[128];
74139 +                       unsigned long address;
74140 +                       unsigned long offset;
74141 +                       unsigned long size;
74142 +
74143 +                       address = (unsigned long) val->bt[pos].path.trace[j];
74144 +                       name = kallsyms_lookup(address, &size,
74145 +                                              &offset, &module, namebuf);
74146 +                       KATTR_PRINT(p, buf, "\n\t\t%#lx ", address);
74147 +                       if (name != NULL)       /* symbolic form is added only when the lookup succeeded */
74148 +                               KATTR_PRINT(p, buf, "%s+%#lx/%#lx",
74149 +                                           name, offset, size);
74150 +               }
74151 +               KATTR_PRINT(p, buf, "\n");
74152 +       }
74153 +#endif
74154 +       return (p - buf);       /* presumably KATTR_PRINT advances p, making this the output length -- confirm in kattr.h */
74155 +}
74156 +
74157 +/* sysfs ->store(): any write zeroes the counters of the prof entry owning @kobj; @attr and @buf are ignored */
74158 +static ssize_t
74159 +store_prof_attr(struct kobject *kobj, struct attribute *attr, const char *buf, size_t size)
74160 +{
74161 +       reiser4_prof_entry *entry;
74162 +
74163 +       entry = container_of(kobj, reiser4_prof_entry, kobj);
74164 +       memset(&entry->cnt, 0, sizeof(reiser4_prof_cnt));
74165 +       return size;    /* ->store() reports the number of bytes consumed from @buf, not the size of the counters */
74166 +}
74167 +/* sysfs glue: attributes of prof entry kobjects are served by show_prof_attr() and store_prof_attr() */
74168 +static struct sysfs_ops prof_attr_ops = {
74169 +       .show = show_prof_attr,
74170 +       .store = store_prof_attr
74171 +};
74172 +
74173 +static struct kobj_type ktype_reiser4_prof = {
74174 +       .sysfs_ops      = &prof_attr_ops,
74175 +       .default_attrs  = NULL  /* attribute files are created explicitly in init_prof_kobject() */
74176 +};
74177 +
74178 +static decl_subsys(prof, &ktype_reiser4_prof, NULL);
74179 +/* kobject that init_prof_kobject() registers in the prof subsystem and uses as parent of all prof entries */
74180 +static struct kobject cpu_prof;
74181 +/* designated initializer for member @field_name of reiser4_prof: sets only the kobject name, to @attr_name */
74182 +#define DEFINE_PROF_ENTRY_0(attr_name,field_name)      \
74183 +       .field_name = {                                 \
74184 +               .kobj = {                               \
74185 +                       .name = attr_name       \
74186 +               }                                       \
74187 +       }
74188 +
74189 +/* shorthand: the kobject is named after the structure member itself */
74190 +#define DEFINE_PROF_ENTRY(name)                                \
74191 +       DEFINE_PROF_ENTRY_0(#name,name)
74192 +/* all profiler entries; only fuse_wait is compiled in, the remaining ones are disabled by the #if 0 below */
74193 +reiser4_prof reiser4_prof_defs = {
74194 +       DEFINE_PROF_ENTRY(fuse_wait),
74195 +#if 0
74196 +       DEFINE_PROF_ENTRY(cbk),
74197 +       DEFINE_PROF_ENTRY(init_context),
74198 +       DEFINE_PROF_ENTRY(jlook),
74199 +       DEFINE_PROF_ENTRY(writepage),
74200 +       DEFINE_PROF_ENTRY(jload),
74201 +       DEFINE_PROF_ENTRY(jrelse),
74202 +       DEFINE_PROF_ENTRY(flush_alloc),
74203 +       DEFINE_PROF_ENTRY(forward_squalloc),
74204 +       DEFINE_PROF_ENTRY(atom_wait_event),
74205 +       DEFINE_PROF_ENTRY(zget),
74206 +       /* write profiling */
74207 +       DEFINE_PROF_ENTRY(extent_write),
74208 +       /* read profiling */
74209 +       DEFINE_PROF_ENTRY(file_read)
74210 +#endif
74211 +};
74212 +/* log how many rdtsc ticks make up one second, extrapolated from a sleep of HZ/100 jiffies; must be able to sleep */
74213 +void calibrate_prof(void)
74214 +{
74215 +       __u64 start, end;
74216 +
74217 +       rdtscll(start);
74218 +       set_current_state(TASK_UNINTERRUPTIBLE);        /* in TASK_RUNNING state schedule_timeout() would not sleep */
74219 +       schedule_timeout(HZ/100);
74220 +       rdtscll(end);
74221 +       warning("nikita-2923", "1 sec. == %llu rdtsc.", (end - start) * 100);
74222 +}
74223 +
74224 +/* Register the "prof" subsystem, the cpu_prof kobject and, for every entry of reiser4_prof_defs, a kobject with
74225 +   REISER4_PROF_TRACE_NUM attribute files. Returns 0, or the first error after undoing the top-level registrations. */
74226 +int init_prof_kobject(void)
74227 +{
74228 +       int result;
74229 +       int i;
74230 +       reiser4_prof_entry *array;
74231 +
74232 +       /* attribute files are named after the index of the back-trace slot they show */
74233 +       for (i = 0; i < REISER4_PROF_TRACE_NUM; ++ i) {
74234 +               sprintf(prof_attr[i].name, "%i", i);
74235 +               prof_attr[i].attr.name = prof_attr[i].name;
74236 +               prof_attr[i].attr.mode = 0644;
74237 +       }
74238 +
74239 +       result = subsystem_register(&prof_subsys);
74240 +       if (result != 0)
74241 +               return result;
74242 +
74243 +       cpu_prof.kset = &prof_subsys.kset;
74244 +       snprintf(cpu_prof.name, KOBJ_NAME_LEN, "cpu_prof");
74245 +       result = kobject_register(&cpu_prof);
74246 +       if (result != 0)
74247 +               goto out_subsys;
74248 +
74249 +       /* populate; stop at the first failure so that its error code is not overwritten by a later iteration */
74250 +       array = (reiser4_prof_entry *)&reiser4_prof_defs;
74251 +       for (i = 0; result == 0 && i < sizeof(reiser4_prof_defs)/sizeof(reiser4_prof_entry); ++ i) {
74252 +               struct kobject *kobj = &array[i].kobj;
74253 +               int j;
74254 +
74255 +               kobj->ktype = &ktype_reiser4_prof;
74256 +               kobj->parent = kobject_get(&cpu_prof);
74257 +               result = kobject_register(kobj);
74258 +               if (result != 0)
74259 +                       break;
74260 +               for (j = 0; result == 0 && j < REISER4_PROF_TRACE_NUM; ++ j)
74261 +                       result = sysfs_create_file(kobj, &prof_attr[j].attr);
74262 +       }
74263 +       if (result == 0)
74264 +               return 0;
74265 +
74266 +       kobject_unregister(&cpu_prof);  /* NOTE: entry kobjects registered so far are still not unregistered here */
74267 +out_subsys:
74268 +       /* the subsystem was registered by this function, so a failed init must not leave it behind */
74269 +       subsystem_unregister(&prof_subsys);
74270 +       return result;
74271 +}
74272 +/* tear down what init_prof_kobject() registered at top level: the cpu_prof kobject first, then the subsystem */
74273 +void done_prof_kobject(void)
74274 +{
74275 +       kobject_unregister(&cpu_prof);
74276 +       subsystem_unregister(&prof_subsys);
74277 +}
74278 +
74279 +/* REISER4_PROF */
74280 +#else
74281 +
74282 +/* REISER4_PROF */
74283 +#endif
74284 diff -rupN linux-2.6.8-rc3/fs/reiser4/prof.h linux-2.6.8-rc3-a/fs/reiser4/prof.h
74285 --- linux-2.6.8-rc3/fs/reiser4/prof.h   1970-01-01 03:00:00.000000000 +0300
74286 +++ linux-2.6.8-rc3-a/fs/reiser4/prof.h 2004-08-05 21:20:52.945689530 +0400
74287 @@ -0,0 +1,130 @@
74288 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74289 + * reiser4/README */
74290 +
74291 +/* profiling. This is i386, rdtsc-based profiling. See prof.c for comments. */
74292 +
74293 +#if !defined( __REISER4_PROF_H__ )
74294 +#define __REISER4_PROF_H__
74295 +
74296 +#include "kattr.h"
74297 +
74298 +#if (defined(__i386__) || defined(CONFIG_USERMODE)) && defined(CONFIG_REISER4_PROF)
74299 +#define REISER4_PROF (1)
74300 +#else
74301 +#define REISER4_PROF (0)
74302 +#endif
74303 +
74304 +#if REISER4_PROF
74305 +
74306 +#include <asm-i386/msr.h>
74307 +
74308 +#define REISER4_PROF_TRACE_NUM (30)
74309 +
74310 +/* data structure to keep call trace: one slot of reiser4_prof_cnt.bt[] */
74311 +typedef struct {
74312 +       /* hash of trace---used for fast comparison; prof.c prints a slot only when this is non-zero */
74313 +       unsigned long hash;
74314 +       /* call trace proper---return addresses collected by
74315 +        * __builtin_return_address() */
74316 +       backtrace_path path;
74317 +       /* number of times profiled code was entered through this call
74318 +        * chain */
74319 +       __u64 hits;
74320 +} reiser4_trace;
74321 +
74322 +/* statistics for profiled region of code; times are in units of the stamps given to update_prof_cnt() */
74323 +typedef struct {
74324 +       /* number of times region was entered */
74325 +       __u64 nr;
74326 +       /* total time spent in this region */
74327 +       __u64 total;
74328 +       /* maximal time of a single pass through the region */
74329 +       __u64 max;
74330 +       /* number of times region was executed without context switch */
74331 +       __u64 noswtch_nr;
74332 +       /* total time spent in executions without context switch */
74333 +       __u64 noswtch_total;
74334 +       /* maximal time of execution without context switch */
74335 +       __u64 noswtch_max;
74336 +       /* array of back traces, REISER4_PROF_TRACE_NUM slots */
74337 +       reiser4_trace bt[REISER4_PROF_TRACE_NUM];
74338 +} reiser4_prof_cnt;
74339 +
74340 +/* profiler entry. */
74341 +typedef struct {
74342 +       /* sysfs placeholder */
74343 +       struct kobject kobj;
74344 +       /* statistics, see above */
74345 +       reiser4_prof_cnt cnt;
74346 +} reiser4_prof_entry;
74347 +
74348 +typedef struct {
74349 +       reiser4_prof_entry fuse_wait;
74350 +#if 0
74351 +       reiser4_prof_entry cbk;
74352 +       reiser4_prof_entry init_context;
74353 +       reiser4_prof_entry jlook;
74354 +       reiser4_prof_entry writepage;
74355 +       reiser4_prof_entry jload;
74356 +       reiser4_prof_entry jrelse;
74357 +       reiser4_prof_entry flush_alloc;
74358 +       reiser4_prof_entry forward_squalloc;
74359 +       reiser4_prof_entry atom_wait_event;
74360 +       reiser4_prof_entry zget;
74361 +       /* write profiling */
74362 +       reiser4_prof_entry extent_write;
74363 +       /* read profiling */
74364 +       reiser4_prof_entry file_read;
74365 +#endif
74366 +} reiser4_prof;
74367 +
74368 +extern reiser4_prof reiser4_prof_defs;
74369 +
74370 +extern unsigned long nr_context_switches(void);
74371 +void update_prof_cnt(reiser4_prof_cnt *cnt, __u64 then, __u64 now,
74372 +                    unsigned long swtch_mark, __u64 start_jif,
74373 +                    int delta, int shift);
74374 +void calibrate_prof(void);
74375 +
74376 +#define PROF_BEGIN(aname)                                                      \
74377 +       unsigned long __swtch_mark__ ## aname = nr_context_switches();          \
74378 +        __u64 __prof_jiffies ## aname = jiffies;                               \
74379 +       __u64 __prof_cnt__ ## aname = ({ __u64 __tmp_prof ;                     \
74380 +                                       rdtscll(__tmp_prof) ; __tmp_prof; })
74381 +
74382 +#define PROF_END(aname) __PROF_END(aname, REISER4_BACKTRACE_DEPTH, 0)
74383 +
74384 +#define __PROF_END(aname, depth, shift)                        \
74385 +({                                                     \
74386 +       __u64 __prof_end;                               \
74387 +                                                       \
74388 +       rdtscll(__prof_end);                            \
74389 +       update_prof_cnt(&reiser4_prof_defs.aname.cnt,   \
74390 +                       __prof_cnt__ ## aname,          \
74391 +                       __prof_end,                     \
74392 +                       __swtch_mark__ ## aname,        \
74393 +                       __prof_jiffies ## aname,        \
74394 +                       depth, shift );                 \
74395 +})
74396 +
74397 +extern int init_prof_kobject(void);
74398 +extern void done_prof_kobject(void);
74399 +
74400 +/* REISER4_PROF */
74401 +#else
74402 +
74403 +typedef struct reiser4_prof_cnt {} reiser4_prof_cnt;
74404 +typedef struct reiser4_prof {} reiser4_prof;
74405 +
74406 +#define PROF_BEGIN(aname) noop
74407 +#define PROF_END(aname) noop
74408 +#define __PROF_END(aname, depth, shift) noop
74409 +#define calibrate_prof() noop
74410 +
74411 +#define init_prof_kobject() (0)
74412 +#define done_prof_kobject() noop
74413 +
74414 +#endif
74415 +
74416 +/* __REISER4_PROF_H__ */
74417 +#endif
74418 diff -rupN linux-2.6.8-rc3/fs/reiser4/readahead.c linux-2.6.8-rc3-a/fs/reiser4/readahead.c
74419 --- linux-2.6.8-rc3/fs/reiser4/readahead.c      1970-01-01 03:00:00.000000000 +0300
74420 +++ linux-2.6.8-rc3-a/fs/reiser4/readahead.c    2004-08-05 21:20:52.854708720 +0400
74421 @@ -0,0 +1,383 @@
74422 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74423 + * reiser4/README */
74424 +
74425 +#include "forward.h"
74426 +#include "tree.h"
74427 +#include "tree_walk.h"
74428 +#include "super.h"
74429 +#include "inode.h"
74430 +#include "key.h"
74431 +#include "znode.h"
74432 +
74433 +/* for nr_free_pagecache_pages(), totalram_pages */
74434 +#include <linux/swap.h>
74435 +
74436 +reiser4_internal void init_ra_info(ra_info_t * rai)
74437 +{
74438 +       rai->key_to_stop = *min_key();
74439 +}
74440 +
74441 +/* global formatted node readahead parameter. It can be set by mount option -o readahead:NUM:1 */
74442 +static inline int ra_adjacent_only(int flags)
74443 +{
74444 +       return flags & RA_ADJACENT_ONLY;
74445 +}
74446 +
74447 +/* this is used by formatted_readahead to decide whether read for right neighbor of node is to be issued. It returns 1
74448 +   if right neighbor's first key is less than or equal to readahead's stop key */
74449 +static int
74450 +should_readahead_neighbor(znode *node, ra_info_t *info)
74451 +{
74452 +       return (UNDER_RW(dk, ZJNODE(node)->tree, read,
74453 +                        keyle(znode_get_rd_key(node), &info->key_to_stop)));
74454 +}
74455 +
74456 +#define LOW_MEM_PERCENTAGE (5)
74457 +
74458 +static int
74459 +low_on_memory(void)
74460 +{
74461 +       const unsigned int nr_free = nr_free_pagecache_pages();
74462 +
74463 +       /* low on memory: fewer than LOW_MEM_PERCENTAGE percent of totalram_pages are free */
74464 +       return (totalram_pages * LOW_MEM_PERCENTAGE / 100) > nr_free;
74465 +}
74466 +
74467 +/* start read for @node and for a few of its right neighbors */
74468 +reiser4_internal void
74469 +formatted_readahead(znode *node, ra_info_t *info)
74470 +{
74471 +       ra_params_t *ra_params;
74472 +       znode *cur;
74473 +       int i;
74474 +       int grn_flags;
74475 +       lock_handle next_lh;
74476 +
74477 +       /* do nothing if node block number has not been assigned to node (which means it is still in cache). */
74478 +       if (blocknr_is_fake(znode_get_block(node)))
74479 +               return;
74480 +
74481 +       ra_params = get_current_super_ra_params();
74482 +
74483 +       if (znode_page(node) == NULL)
74484 +               jstartio(ZJNODE(node));
74485 +
74486 +       if (znode_get_level(node) != LEAF_LEVEL)
74487 +               return;
74488 +
74489 +       /* don't waste memory for read-ahead when low on memory */
74490 +       if (low_on_memory())
74491 +               return;
74492 +
74493 +       write_current_logf(READAHEAD_LOG, "...readahead\n");
74494 +
74495 +       /* We can have locked nodes on upper tree levels, in this situation lock
74496 +          priorities do not help to resolve deadlocks, we have to use TRY_LOCK
74497 +          here. */
74498 +       grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
74499 +
74500 +       i = 0;
74501 +       cur = zref(node);
74502 +       init_lh(&next_lh);
74503 +       while (i < ra_params->max) {
74504 +               const reiser4_block_nr *nextblk;
74505 +
74506 +               if (!should_readahead_neighbor(cur, info))
74507 +                       break;
74508 +
74509 +               if (reiser4_get_right_neighbor(&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
74510 +                       break;
74511 +
74512 +               if (JF_ISSET(ZJNODE(next_lh.node), JNODE_EFLUSH)) {
74513 +                       /* emergency flushed znode is encountered. That means we are low on memory. Do not readahead
74514 +                          then */
74515 +                       break;
74516 +               }
74517 +
74518 +               nextblk = znode_get_block(next_lh.node);
74519 +               if (blocknr_is_fake(nextblk) ||
74520 +                   (ra_adjacent_only(ra_params->flags) && *nextblk != *znode_get_block(cur) + 1)) {
74521 +                       break;
74522 +               }
74523 +
74524 +               zput(cur);
74525 +               cur = zref(next_lh.node);
74526 +               done_lh(&next_lh);
74527 +               if (znode_page(cur) == NULL)
74528 +                       jstartio(ZJNODE(cur));
74529 +               else
74530 +                       /* Do not scan read-ahead window if pages already
74531 +                        * allocated (and i/o already started). */
74532 +                       break;
74533 +
74534 +               i ++;
74535 +       }
74536 +       zput(cur);
74537 +       done_lh(&next_lh);
74538 +
74539 +       write_current_logf(READAHEAD_LOG, "...readahead exits\n");
74540 +}
74541 +
74542 +static inline loff_t get_max_readahead(struct reiser4_file_ra_state *ra)
74543 +{
74544 +       /* NOTE: ra->max_window_size is initialized in
74545 +        * reiser4_get_file_fsdata(). */
74546 +       return ra->max_window_size;
74547 +}
74548 +
74549 +static inline loff_t get_min_readahead(struct reiser4_file_ra_state *ra)
74550 +{
74551 +       return VM_MIN_READAHEAD * 1024;
74552 +}
74553 +
74554 +
74555 +/* Start read for the given window [offset, offset + size). Returns the error of a failing helper, 0 if the start key is not found on the twig level, otherwise the length of the window that was covered. */
74556 +static loff_t do_reiser4_file_readahead (struct inode * inode, loff_t offset, loff_t size)
74557 +{
74558 +       reiser4_tree * tree = current_tree;
74559 +       reiser4_inode * object;
74560 +       reiser4_key start_key;
74561 +       reiser4_key stop_key;
74562 +
74563 +       lock_handle lock;
74564 +       lock_handle next_lock;
74565 +
74566 +       coord_t coord;
74567 +       tap_t tap;
74568 +
74569 +       loff_t result;
74570 +
74571 +       assert("zam-994", lock_stack_isclean(get_current_lock_stack()));
74572 +
74573 +       object = reiser4_inode_data(inode); /* NOTE(review): @object is never read below */
74574 +       key_by_inode_unix_file(inode, offset, &start_key);
74575 +       key_by_inode_unix_file(inode, offset + size, &stop_key);
74576 +
74577 +       init_lh(&lock);
74578 +       init_lh(&next_lock);
74579 +
74580 +       /* Stop on twig level: both the lock level and the stop level are TWIG_LEVEL */
74581 +       result = coord_by_key(
74582 +               current_tree, &start_key, &coord, &lock, ZNODE_READ_LOCK,
74583 +               FIND_EXACT, TWIG_LEVEL, TWIG_LEVEL, 0, NULL);
74584 +       if (result < 0)
74585 +               goto error;
74586 +       if (result != CBK_COORD_FOUND) {
74587 +               result = 0;
74588 +               goto error;
74589 +       }
74590 +
74591 +       tap_init(&tap, &coord, &lock, ZNODE_WRITE_LOCK); /* NOTE(review): tap mode is ZNODE_WRITE_LOCK while @lock and @next_lock are taken with ZNODE_READ_LOCK -- confirm this is intended */
74592 +       result = tap_load(&tap);
74593 +       if (result)
74594 +               goto error0;
74595 +
74596 +       /* Advance coord to right (even across node boundaries) while coord key
74597 +        * less than stop_key.  */
74598 +       while (1) {
74599 +               reiser4_key key;
74600 +               znode * child;
74601 +               reiser4_block_nr blk;
74602 +
74603 +               /* Currently this read-ahead is for formatted nodes only */
74604 +               if (!item_is_internal(&coord))
74605 +                       break;
74606 +
74607 +               item_key_by_coord(&coord, &key);
74608 +               if (keyge(&key, &stop_key))
74609 +                       break;
74610 +
74611 +               result = item_utmost_child_real_block(&coord, LEFT_SIDE, &blk);
74612 +               if (result || blk == 0)
74613 +                       break;
74614 +
74615 +               child = zget(tree, &blk, lock.node, LEAF_LEVEL, GFP_KERNEL);
74616 +
74617 +               if (IS_ERR(child)) {
74618 +                       result = PTR_ERR(child);
74619 +                       break;
74620 +               }
74621 +
74622 +               /* If znode's page is present that usually means that i/o was
74623 +                * already started for the page. */
74624 +               if (znode_page(child) == NULL) {
74625 +                       result = jstartio(ZJNODE(child));
74626 +                       if (result) {
74627 +                               zput(child);
74628 +                               break;
74629 +                       }
74630 +               }
74631 +               zput(child);
74632 +
74633 +               /* Advance coord by one unit ... */
74634 +               result = coord_next_unit(&coord);
74635 +               if (result == 0)
74636 +                       continue;
74637 +
74638 +               /* ... and continue on the right neighbor if needed. */
74639 +               result = reiser4_get_right_neighbor (
74640 +                       &next_lock, lock.node, ZNODE_READ_LOCK,
74641 +                       GN_CAN_USE_UPPER_LEVELS);
74642 +               if (result)
74643 +                       break;
74644 +
74645 +               if (znode_page(next_lock.node) == NULL) {
74646 +                       loff_t end_offset;
74647 +
74648 +                       result = jstartio(ZJNODE(next_lock.node));
74649 +                       if (result)
74650 +                               break;
74651 +
74652 +                       read_lock_dk(tree);
74653 +                       end_offset = get_key_offset(znode_get_ld_key(next_lock.node));
74654 +                       read_unlock_dk(tree);
74655 +
74656 +                       result = end_offset - offset;
74657 +                       break;
74658 +               }
74659 +
74660 +               result = tap_move(&tap, &next_lock);
74661 +               if (result)
74662 +                       break;
74663 +
74664 +               done_lh(&next_lock);
74665 +               coord_init_first_unit(&coord, lock.node);
74666 +       }
74667 +
74668 +       if (! result || result == -E_NO_NEIGHBOR)
74669 +               result = size;
74670 + error0:
74671 +       tap_done(&tap);
74672 + error:
74673 +       done_lh(&lock);
74674 +       done_lh(&next_lock);
74675 +       return result;
74676 +}
74677 +
74678 +typedef unsigned long long int ull_t;
74679 +#define PRINTK(...) noop
74680 +/* This is derived from the linux original read-ahead code (mm/readahead.c), and
74681 + * cannot be licensed from Namesys in its current state.  */
74682 +/* Update per-file read-ahead state for a read of @size bytes at @offset and
74683 + * start i/o for the current and ahead windows. Always returns 0. */
74684 +int reiser4_file_readahead (struct file * file, loff_t offset, size_t size)
74685 +{
74686 +       loff_t min;
74687 +       loff_t max;
74688 +       loff_t orig_next_size;
74689 +       loff_t actual;
74690 +       struct reiser4_file_ra_state * ra;
74691 +       struct inode * inode = file->f_dentry->d_inode;
74692 +
74693 +       assert ("zam-995", inode != NULL);
74694 +
74695 +       PRINTK ("R/A REQ: off=%llu, size=%llu\n", (ull_t)offset, (ull_t)size);
74696 +       ra = &reiser4_get_file_fsdata(file)->ra;
74697 +
74698 +       max = get_max_readahead(ra);
74699 +       if (max == 0)
74700 +               goto out;
74701 +
74702 +       min = get_min_readahead(ra);
74703 +       orig_next_size = ra->next_size;
74704 +
74705 +       if (!ra->slow_start) {
74706 +               ra->slow_start = 1;
74707 +               /*
74708 +                * Special case - first read from first page.
74709 +                * We'll assume it's a whole-file read, and
74710 +                * grow the window fast.
74711 +                */
74712 +               ra->next_size = max / 2;
74713 +               goto do_io;
74714 +       }
74715 +
74716 +       /*
74717 +        * Is this request outside the current window?
74718 +        */
74719 +       if (offset < ra->start || offset > (ra->start + ra->size)) {
74720 +               /* R/A miss. */
74721 +
74722 +               /* next r/a window size is shrunk by fixed offset and enlarged
74723 +                * by 2 * size of read request. The arithmetic is done in
74724 +                * loff_t: @size is unsigned, so on 32-bit the negative
74725 +                * adjustment would otherwise wrap to a huge positive value. */
74726 +               ra->next_size += 2 * (loff_t)size - 2 * (loff_t)PAGE_CACHE_SIZE;
74727 +               if (ra->next_size < 0)
74728 +                       ra->next_size = 0;
74729 +do_io:
74730 +               ra->start = offset;
74731 +               ra->size = size + orig_next_size;
74732 +               actual = do_reiser4_file_readahead(inode, offset, ra->size);
74733 +               if (actual > 0)
74734 +                       ra->size = actual;
74735 +
74736 +               ra->ahead_start = ra->start + ra->size;
74737 +               ra->ahead_size = ra->next_size;
74738 +
74739 +               actual =  do_reiser4_file_readahead(inode, ra->ahead_start, ra->ahead_size);
74740 +               if (actual > 0)
74741 +                       ra->ahead_size = actual;
74742 +
74743 +               PRINTK ("R/A MISS: cur = [%llu, +%llu[, ahead = [%llu, +%llu[\n",
74744 +                       (ull_t)ra->start, (ull_t)ra->size,
74745 +                       (ull_t)ra->ahead_start, (ull_t)ra->ahead_size);
74746 +       } else {
74747 +               /* R/A hit. */
74748 +
74749 +               /* Enlarge r/a window size (widened to loff_t before doubling). */
74750 +               ra->next_size += 2 * (loff_t)size;
74751 +               if (ra->next_size > max)
74752 +                       ra->next_size = max;
74753 +
74754 +               PRINTK("R/A HIT\n");
74755 +               while (offset + size >= ra->ahead_start) {
74756 +                       ra->start = ra->ahead_start;
74757 +                       ra->size = ra->ahead_size;
74758 +
74759 +                       ra->ahead_start = ra->start + ra->size;
74760 +                       ra->ahead_size = ra->next_size;
74761 +
74762 +                       actual = do_reiser4_file_readahead(
74763 +                               inode, ra->ahead_start, ra->ahead_size);
74764 +                       if (actual > 0) {
74765 +                               ra->ahead_size = actual;
74766 +                       }
74767 +
74768 +                       PRINTK ("R/A ADVANCE: cur = [%llu, +%llu[, ahead = [%llu, +%llu[\n",
74769 +                               (ull_t)ra->start, (ull_t)ra->size,
74770 +                               (ull_t)ra->ahead_start, (ull_t)ra->ahead_size);
74771 +               }
74772 +       }
74773 +
74774 +out:
74775 +       return 0;
74776 +}
74777 +
74778 +reiser4_internal void
74779 +reiser4_readdir_readahead_init(struct inode *dir, tap_t *tap)
74780 +{
74781 +       reiser4_key *stop_key;
74782 +
74783 +       assert("nikita-3542", dir != NULL);
74784 +       assert("nikita-3543", tap != NULL);
74785 +
74786 +       stop_key = &tap->ra_info.key_to_stop;
74787 +       /* initialize readdir readahead information: include into readahead
74788 +        * stat data of all files of the directory */
74789 +       set_key_locality(stop_key, get_inode_oid(dir));
74790 +       set_key_type(stop_key, KEY_SD_MINOR);
74791 +       set_key_ordering(stop_key, get_key_ordering(max_key()));
74792 +       set_key_objectid(stop_key, get_key_objectid(max_key()));
74793 +       set_key_offset(stop_key, get_key_offset(max_key()));
74794 +}
74795 +
74796 +/*
74797 +   Local variables:
74798 +   c-indentation-style: "K&R"
74799 +   mode-name: "LC"
74800 +   c-basic-offset: 8
74801 +   tab-width: 8
74802 +   fill-column: 80
74803 +   End:
74804 +*/
74805 diff -rupN linux-2.6.8-rc3/fs/reiser4/readahead.h linux-2.6.8-rc3-a/fs/reiser4/readahead.h
74806 --- linux-2.6.8-rc3/fs/reiser4/readahead.h      1970-01-01 03:00:00.000000000 +0300
74807 +++ linux-2.6.8-rc3-a/fs/reiser4/readahead.h    2004-08-05 21:20:52.858707877 +0400
74808 @@ -0,0 +1,50 @@
74809 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
74810 +
74811 +#ifndef __READAHEAD_H__
74812 +#define __READAHEAD_H__
74813 +
74814 +#include "key.h"
74815 +
74816 +typedef enum {
74817 +       RA_ADJACENT_ONLY = 1,       /* only requests nodes which are adjacent. Default is NO (not only adjacent) */
74818 +} ra_global_flags;
74819 +
74820 +/* reiser4 super block has a field of this type. It controls readahead during tree traversals */
74821 +typedef struct formatted_read_ahead_params {
74822 +       unsigned long max; /* request not more than this amount of nodes. Default is totalram_pages / 4 */
74823 +       int flags;
74824 +} ra_params_t;
74825 +
74826 +
74827 +typedef struct {
74828 +       reiser4_key key_to_stop;
74829 +} ra_info_t;
74830 +
74831 +void formatted_readahead(znode *, ra_info_t *);
74832 +void init_ra_info(ra_info_t * rai);
74833 +
74834 +struct reiser4_file_ra_state {
74835 +       loff_t  start;          /* Current window */
74836 +       loff_t  size;
74837 +       loff_t  next_size;      /* Next window size */
74838 +       loff_t  ahead_start;    /* Ahead window */
74839 +       loff_t  ahead_size;
74840 +       loff_t  max_window_size; /* Maximum readahead window */
74841 +       loff_t  slow_start;      /* enlarging r/a size algorithm. */
74842 +};
74843 +
74844 +extern int reiser4_file_readahead(struct file *, loff_t, size_t);
74845 +extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t *tap);
74846 +
74847 +/* __READAHEAD_H__ */
74848 +#endif
74849 +
74850 +/*
74851 +   Local variables:
74852 +   c-indentation-style: "K&R"
74853 +   mode-name: "LC"
74854 +   c-basic-offset: 8
74855 +   tab-width: 8
74856 +   fill-column: 120
74857 +   End:
74858 +*/
74859 diff -rupN linux-2.6.8-rc3/fs/reiser4/reiser4.h linux-2.6.8-rc3-a/fs/reiser4/reiser4.h
74860 --- linux-2.6.8-rc3/fs/reiser4/reiser4.h        1970-01-01 03:00:00.000000000 +0300
74861 +++ linux-2.6.8-rc3-a/fs/reiser4/reiser4.h      2004-08-05 21:20:53.420589362 +0400
74862 @@ -0,0 +1,501 @@
74863 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74864 + * reiser4/README */
74865 +
74866 +/* definitions of common constants used by reiser4 */
74867 +
74868 +#if !defined( __REISER4_H__ )
74869 +#define __REISER4_H__
74870 +
74871 +#include <linux/config.h>
74872 +#include <asm/param.h>         /* for HZ */
74873 +#include <linux/errno.h>
74874 +#include <linux/types.h>
74875 +#include <linux/fs.h>
74876 +#include <asm/hardirq.h>
74877 +#include <linux/sched.h>
74878 +
74879 +#if defined(CONFIG_4KSTACKS)
74880 +#error "Please turn 4k stack off"
74881 +#endif
74882 +
74883 +/*
74884 + * reiser4 compilation options.
74885 + */
74886 +
74887 +#if defined(CONFIG_REISER4_DEBUG)
74888 +/* turn on assertion checks */
74889 +#define REISER4_DEBUG (1)
74890 +#else
74891 +#define REISER4_DEBUG (0)
74892 +#endif
74893 +
74894 +#if defined(CONFIG_REISER4_DEBUG_MODIFY)
74895 +/*
74896 + * Turn on "znode modification checks". In this mode znode check-sum is
74897 + * maintained in special field added to znode. Check-sum is updated during
74898 + * znode_make_dirty() (or, during zload()). It is checked that check-sum is
74899 + * only ever updated between acquiring write lock on znode and calling
74900 + * znode_make_dirty(). This significantly slows down testing, but we should
74901 + * run our test-suite through with this every once in a while.  */
74902 +#define REISER4_DEBUG_MODIFY (1)
74903 +#else
74904 +#define REISER4_DEBUG_MODIFY (0)
74905 +#endif
74906 +
74907 +#if defined(CONFIG_REISER4_DEBUG_MEMCPY)
74908 +/* Provide our own memcpy/memmove to profile shifts. Reiser4 code uses
74909 + * xmem{cpy,move,set}() functions instead of mem{cpy,move,set}(). When
74910 + * REISER4_DEBUG_MEMCPY is on, care is taken to uninline xmem* functions so
74911 + * that they show up in CPU profiling (/proc/profile, for example) separately
74912 + * from calling functions. This is done to estimate CPU consumption of memory
74913 + * shifts. When this mode is off, xmem* functions are preprocessed into their
74914 + * mem* analogs. */
74915 +#define REISER4_DEBUG_MEMCPY (1)
74916 +#else
74917 +#define REISER4_DEBUG_MEMCPY (0)
74918 +#endif
74919 +
74920 +#if defined(CONFIG_REISER4_DEBUG_NODE)
74921 +/*
74922 + * Check consistency of internal node structures. When this mode is on, node
74923 + * consistency check (implemented by plugin/node/node.c:node_check() function)
74924 + * are invoked in many places (including start and end of most node plugin
74925 + * methods). node_check() includes a lot of checks, see it for details.
74926 + *
74927 + * Node consistency checking (which is off by default) has to be activated by
74928 + * setting REISER4_CHECK_NODE bit in ->debug_flags field of
74929 + * reiser4_super_info_data. This can be done with debug_flags mount option.
74930 + */
74931 +#define REISER4_DEBUG_NODE (1)
74932 +#else
74933 +#define REISER4_DEBUG_NODE (0)
74934 +#endif
74935 +
74936 +#if defined(CONFIG_REISER4_ZERO_NEW_NODE)
74937 +/* If this is non-zero, clear content of new node, otherwise leave whatever
74938 +   may happen to be here */
74939 +#define REISER4_ZERO_NEW_NODE (1)
74940 +#else
74941 +#define REISER4_ZERO_NEW_NODE (0)
74942 +#endif
74943 +
74944 +#if defined(CONFIG_REISER4_TRACE)
74945 +/* tracing facility. When this is on, {ON,IF}_TRACE statements are
74946 + * activated. They print (through printk()) information about control flow
74947 + * (what function is called, with what arguments, etc.). {ON,IF}_TRACE
74948 + * statements contain "trace mask" and output is done only when this mask
74949 + * matches current trace mask, calculated by get_current_trace_flags()
74950 + * function. Current trace mask is combined from per-thread context mask
74951 + * (stored in reiser4_context), and per-super-block mask (stored in
74952 + * ->trace_flags field of reiser4_super_info_data). Per-super-block trace mask
74953 + * can be adjusted through:
74954 + *
74955 + *     1. mount option "trace_flags"
74956 + *
74957 + *     2. /sys/fs/reiser4/<dev>/trace_flags file.
74958 + *
74959 + */
74960 +#define REISER4_TRACE (1)
74961 +#else
74962 +#define REISER4_TRACE (0)
74963 +#endif
74964 +
74965 +#if defined(CONFIG_REISER4_EVENT_LOG)
74966 +/*
74967 + * Collect event logs. When this is on, logging macros/functions declared in
74968 + * fs/reiser4/log.h are activated. Event-logging facility is designed to cope
74969 + * with large amount of output data. To this end, event descriptions are
74970 + * buffered in the internal buffer (of REISER4_TRACE_BUF_SIZE bytes) and then
74971 + * written into user-visible log file. Log file is specified through log_file
74972 + * mount option.
74973 + *
74974 + * Events which are logged are specified through log_flags mount option (or
74975 + * /sys/fs/reiser4/<dev>/log_flags file). See
74976 + * fs/reiser4/debug.h:reiser4_log_flags for possible values.
74977 + *
74978 + * Note that event-logging is for gathering statistics (as opposed to tracing,
74979 + * which is for debugging).
74980 + *
74981 + * When running experiments with event-logging on, it's important to minimize
74982 + * an impact of event-logging to the system. It was found that one of the most
74983 + * disturbing effects of event-logging is continuous generation of dirty
74984 + * memory that triggers premature write-back and, generally, affects system
74985 + * behavior in various ways. To work around this, set log file to a named pipe,
74986 + * and use netcat(1) to dump the log over the network.
74987 + *
74988 + */
74989 +#define REISER4_LOG (1)
74990 +#else
74991 +#define REISER4_LOG (0)
74992 +#endif
74993 +
74994 +#if defined(CONFIG_REISER4_STATS)
74995 +/*
74996 + * Collect statistics. In this mode reiser4 collects a lot of statistical
74997 + * information in the form of "stat-counters". There are global counters
74998 + * (per-super-block) and per-level counters collected separately for each
74999 + * level of the internal reiser4 tree. See fs/reiser4/stats.[ch] for the list
75000 + * of counters. Counters are exported under /sys/fs/reiser4/<dev>/stats/
75001 + *
75002 + * Note: this option consumes quite a bit of kernel memory.
75003 + */
75004 +#define REISER4_STATS (1)
75005 +#else
75006 +#define REISER4_STATS (0)
75007 +#endif
75008 +
75009 +#if defined(CONFIG_REISER4_DEBUG_OUTPUT)
75010 +/*
75011 + * In this mode various "debugging output" functions are compiled in. These
75012 + * functions output human readable representation of various reiser4 kernel
75013 + * data-structures (keys, tree nodes, items, etc.), which are used in error
75014 + * messages.
75015 + */
75016 +#define REISER4_DEBUG_OUTPUT (1)
75017 +#else
75018 +#define REISER4_DEBUG_OUTPUT (0)
75019 +#endif
75020 +
75021 +#if defined(CONFIG_REISER4_COPY_ON_CAPTURE)
75022 +/*
75023 + * Turns on copy-on-capture (COC) optimization. See
75024 + * http://www.namesys.com/v4/v4.html#cp_on_capture
75025 + */
75026 +#define REISER4_COPY_ON_CAPTURE (1)
75027 +#else
75028 +#define REISER4_COPY_ON_CAPTURE (0)
75029 +#endif
75030 +
75031 +#if defined(CONFIG_REISER4_LOCKPROF)
75032 +/*
75033 + * Turns on lock profiling mode. In this mode reiser4 spin-locks are
75034 + * instrumented to collect information about their contention and
75035 + * utilization. See fs/reiser4/spinprof.[ch] for details.
75036 + *
75037 + * Lock profiling results are exported as /sys/profregion/
75038 + */
75039 +#define REISER4_LOCKPROF (1)
75040 +#else
75041 +#define REISER4_LOCKPROF (0)
75042 +#endif
75043 +
75044 +#if defined(CONFIG_REISER4_LARGE_KEY)
75045 +/*
75046 + * Turn on large keys mode. In this mode (which is default), reiser4 key has 4
75047 + * 8-byte components. In the old "small key" mode, it's 3 8-byte
75048 + * components. Additional component, referred to as "ordering" is used to
75049 + * order items from which given object is composed of. As such, ordering is
75050 + * placed between locality and objectid. For directory item ordering contains
75051 + * initial prefix of the file name this item is for. This sorts all directory
75052 + * items within given directory lexicographically (but see
75053 + * fibration.[ch]). For file body and stat-data, ordering contains initial
75054 + * prefix of the name file was initially created with. In the common case
75055 + * (files with single name) this allows to order file bodies and stat-datas in
75056 + * the same order as their respective directory entries, thus speeding up
75057 + * readdir.
75058 + *
75059 + * Note, that kernel can only mount file system with the same key size as one
75060 + * it is compiled for, so flipping this option may render your data
75061 + * inaccessible.
75062 + */
75063 +#define REISER4_LARGE_KEY (1)
75064 +#else
75065 +#define REISER4_LARGE_KEY (0)
75066 +#endif
75067 +
75068 +#if defined(CONFIG_REISER4_ALL_IN_ONE)
75069 +/*
75070 + * Turn on all-in-one compilation mode. In this mode reiser4 is compiled as
75071 + * one single source file all-in-one.c that includes all other sources. This
75072 + * is supposed to result in better code, because compiler is free to perform
75073 + * all optimizations within the same compilation unit. To achieve this,
75074 + * (almost) all reiser4 functions are prefixed with reiser4_internal
75075 + * specifier. In normal compilation mode it expands to nothing, in all-in-one
75076 + * mode, it expands to "static", thus telling compiler that function is only
75077 + * used in this compilation unit (that is, in whole reiser4).
75078 + *
75079 + * Note-1: compilation in this mode would result in large number of warnings,
75080 + * because header files weren't updated.
75081 + *
75082 + * Note-2: in addition to generating better code this mode can be used to
75083 + * detect declared but not used functions, or declarations without definition.
75084 + *
75085 + * Note-3: this should be tried with -funit-at-a-time option of gcc 3.4
75086 + */
75087 +#define REISER4_ALL_IN_ONE (1)
75088 +#else
75089 +#define REISER4_ALL_IN_ONE (0)
75090 +#endif
75091 +
75092 +#if defined (CONFIG_REISER4_DEBUG_NODE_INVARIANT)
75093 +/*
75094 + * In this mode [zj]node invariants are checked. This mode is not usually on,
75095 + * because it consumes a lot of CPU. See [zj]node_invariant() and
75096 + * doc/lock-ordering for description of invariants checked.
75097 + */
75098 +#define REISER4_DEBUG_NODE_INVARIANT (1)
75099 +#else
75100 +#define REISER4_DEBUG_NODE_INVARIANT (0)
75101 +#endif
75102 +
75103 +#if defined(CONFIG_REISER4_DEBUG_SPIN_LOCKS) && defined(CONFIG_REISER4_DEBUG)
75104 +/*
75105 + * Turns on spin-lock debugging. Many (but not all) spin-locks used by reiser4
75106 + * are accessed through special wrapper macros defined in spin_macros.h. These
75107 + * macros allow, among other things, to specify for a given spin-lock type its
75108 + * "lock ordering predicate" that specifies what other locks may or may not be
75109 + * held simultaneously with this one. Spin-lock debugging checks for these
75110 + * ordering constraints along with trivial checks for proper lock/unlock
75111 + * nesting, etc. Note that spin_macros.h also supports spin-lock profiling
75112 + * described above (CONFIG_REISER4_LOCKPROF).
75113 + *
75114 + * Note: this is not available through fs/Kconfig. Adjust manually.
75115 + */
75116 +#define REISER4_DEBUG_SPIN_LOCKS (1)
75117 +#else
75118 +#define REISER4_DEBUG_SPIN_LOCKS (0)
75119 +#endif
75120 +
75121 +#define CONFIG_REISER4_DEBUG_CONTEXTS y
75122 +#if defined(CONFIG_REISER4_DEBUG_CONTEXTS) && defined(CONFIG_REISER4_DEBUG)
75123 +/*
75124 + * In this mode reiser4_context debugging is activated. reiser4_context is a
75125 + * data-structure created on stack at the beginning of reiser4 entry. In this
75126 + * mode, list of all "active" contexts is maintained, and periodically
75127 + * checked. This is to catch various hard-to-debug bugs like exiting without
75128 + * destroying context, or stack overflowing.
75129 + *
75130 + * Note: this is not available through fs/Kconfig. Adjust manually.
75131 + */
75132 +#define REISER4_DEBUG_CONTEXTS (1)
75133 +#else
75134 +#define REISER4_DEBUG_CONTEXTS (0)
75135 +#endif
75136 +
75137 +#if defined(CONFIG_REISER4_DEBUG_SIBLING_LIST) && defined(CONFIG_REISER4_DEBUG)
75138 +/*
75139 + * Turn on sibling-list debugging. In this mode consistency of sibling lists
75140 + * of reiser4 internal tree is checked.
75141 + *
75142 + * Note: this is not available through fs/Kconfig. Adjust manually.
75143 + */
75144 +#define REISER4_DEBUG_SIBLING_LIST (1)
75145 +#else
75146 +#define REISER4_DEBUG_SIBLING_LIST (0)
75147 +#endif
75148 +
75149 +#if defined(CONFIG_CRYPTO_DEFLATE)
75150 +#define REISER4_GZIP_TFM (1)
75151 +#else
75152 +#define REISER4_GZIP_TFM (0)
75153 +#endif
75154 +
75155 +/*
75156 + * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
75157 + * option
75158 + */
75159 +
75160 +
75161 +extern const char *REISER4_SUPER_MAGIC_STRING;
75162 +extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
75163 +                                        * beginning of device */
75164 +
75165 +/* here go tunable parameters that are not worth special entry in kernel
75166 +   configuration */
75167 +
75168 +/* default number of slots in coord-by-key caches */
75169 +#define CBK_CACHE_SLOTS    (16)
75170 +/* how many elementary tree operations to carry on the next level */
75171 +#define CARRIES_POOL_SIZE        (5)
75172 +/* size of pool of preallocated nodes for carry process. */
75173 +#define NODES_LOCKED_POOL_SIZE   (5)
75174 +
75175 +#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
75176 +#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
75177 +#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
75178 +#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
75179 +
75180 +/* we are supporting reservation of disk space on uid basis */
75181 +#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
75182 +/* we are supporting reservation of disk space for groups */
75183 +#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
75184 +/* we are supporting reservation of disk space for root */
75185 +#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
75186 +/* we use rapid flush mode, see flush.c for comments.  */
75187 +#define REISER4_USE_RAPID_FLUSH (1)
75188 +
75189 +/*
75190 + * set this to 0 if you don't want to use wait-for-flush in ->writepage().
75191 + */
75192 +#define REISER4_USE_ENTD (1)
75193 +
75194 +/* Using of emergency flush is an option. */
75195 +#define REISER4_USE_EFLUSH (1)
75196 +
75197 +/* key allocation is Plan-A */
75198 +#define REISER4_PLANA_KEY_ALLOCATION (1)
75199 +/* key allocation follows good old 3.x scheme */
75200 +#define REISER4_3_5_KEY_ALLOCATION (0)
75201 +
75202 +/* size of hash-table for znodes */
75203 +#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
75204 +
75205 +/* number of buckets in lnode hash-table */
75206 +#define LNODE_HTABLE_BUCKETS (1024)
75207 +
75208 +/* some ridiculously high maximal limit on height of znode tree. This
75209 +    is used in declaration of various per level arrays and
75210 +    to allocate statistics gathering array for per-level stats. */
75211 +#define REISER4_MAX_ZTREE_HEIGHT     (8)
75212 +
75213 +#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
75214 +
75215 +/* If array contains less than REISER4_SEQ_SEARCH_BREAK elements then,
75216 +   sequential search is on average faster than binary. This is because
75217 +   of better optimization and because sequential search is more CPU
75218 +   cache friendly. This number (25) was found by experiments on dual AMD
75219 +   Athlon(tm), 1400MHz.
75220 +
75221 +   NOTE: testing in kernel has shown that binary search is more effective than
75222 +   implied by results of the user level benchmarking. Probably because in the
75223 +   node keys are separated by other data. So value was adjusted after few
75224 +   tests. More thorough tuning is needed.
75225 +*/
75226 +#define REISER4_SEQ_SEARCH_BREAK      (3)
75227 +
75228 +/* don't allow tree to be lower than this */
75229 +#define REISER4_MIN_TREE_HEIGHT       (TWIG_LEVEL)
75230 +
75231 +/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
75232 + * available memory. */
75233 +/* Default value of maximal atom size. Can be overwritten by
75234 +   tmgr.atom_max_size mount option. By default infinity. */
75235 +#define REISER4_ATOM_MAX_SIZE         ((unsigned)(~0))
75236 +
75237 +/* Default value of maximal atom age (in jiffies). After reaching this age
75238 +   atom will be forced to commit, either synchronously or asynchronously. Can
75239 +   be overwritten by tmgr.atom_max_age mount option. */
75240 +#define REISER4_ATOM_MAX_AGE          (600 * HZ)
75241 +
75242 +/* sleeping period for ktxnmgrd */
75243 +#define REISER4_TXNMGR_TIMEOUT  (5 * HZ)
75244 +
75245 +/* timeout to wait for ent thread in writepage. Default: 3 ms. NOTE(review): evaluates to 0 if HZ < 334. */
75246 +#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
75247 +
75248 +/* start complaining after that many restarts in coord_by_key().
75249 +
75250 +   This either means incredibly heavy contention for this part of a tree, or
75251 +   some corruption or bug.
75252 +*/
75253 +#define REISER4_CBK_ITERATIONS_LIMIT  (100)
75254 +
75255 +/* return -EIO after that many iterations in coord_by_key().
75256 +
75257 +   I have witnessed more than 800 iterations (in 30 thread test) before cbk
75258 +   finished. --nikita
75259 +*/
75260 +#define REISER4_MAX_CBK_ITERATIONS    ((unsigned)~0)
75261 +
75262 +/* put a per-inode limit on maximal number of directory entries with identical
75263 +   keys in hashed directory.
75264 +
75265 +   Disable this until inheritance interfaces stabilize: we need some way to
75266 +   set per directory limit.
75267 +*/
75268 +#define REISER4_USE_COLLISION_LIMIT    (0)
75269 +
75270 +/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level blocks it
75271 +   will force them to be relocated. */
75272 +#define FLUSH_RELOCATE_THRESHOLD 64
75273 +/* If flush can find a block allocation closer than at most FLUSH_RELOCATE_DISTANCE
75274 +   from the preceder it will relocate to that position. */
75275 +#define FLUSH_RELOCATE_DISTANCE  64
75276 +
75277 +/* If we have written this many or more blocks before encountering busy jnode
75278 +   in flush list - abort flushing hoping that next time we get called
75279 +   this jnode will be clean already, and we will save some seeks. */
75280 +#define FLUSH_WRITTEN_THRESHOLD 50
75281 +
75282 +/* The maximum number of nodes to scan left on a level during flush. */
75283 +#define FLUSH_SCAN_MAXNODES 10000
75284 +
75285 +/* default tracing buffer size */
75286 +#define REISER4_TRACE_BUF_SIZE (1 << 15)
75287 +
75288 +/* what size units of IO we would like cp, etc., to use, in writing to
75289 +   reiser4. In bytes.
75290 +
75291 +   Can be overwritten by optimal_io_size mount option.
75292 +*/
75293 +#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
75294 +
75295 +/* see comments in inode.c:oid_to_uino() */
75296 +#define REISER4_UINO_SHIFT (1 << 30)
75297 +
75298 +/* Mark function argument as unused to avoid compiler warnings. */
75299 +#define UNUSED_ARG __attribute__((unused))
75300 +
75301 +#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
75302 +#define NONNULL __attribute__((nonnull))
75303 +#else
75304 +#define NONNULL
75305 +#endif
75306 +
75307 +/* master super block offset in bytes.*/
75308 +#define REISER4_MASTER_OFFSET 65536
75309 +
75310 +/* size of VFS block */
75311 +#define VFS_BLKSIZE 512
75312 +/* number of bits in size of VFS block (512==2^9) */
75313 +#define VFS_BLKSIZE_BITS 9
75314 +
75315 +#define REISER4_I reiser4_inode_data
75316 +
75317 +/* logical implication: false only when antecedent is true and consequent is false */
75318 +#define ergo( antecedent, consequent ) ( !( antecedent ) || ( consequent ) )
75319 +/* logical equivalence: both implications hold; note p1 and p2 may each be evaluated twice */
75320 +#define equi( p1, p2 ) ( ergo( ( p1 ), ( p2 ) ) && ergo( ( p2 ), ( p1 ) ) )
75321 +/* number of elements in array x, cast to int; only meaningful for true arrays, not pointers */
75322 +#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
75323 +
75324 +
75325 +#define NOT_YET                       (0)
75326 +
75327 +/** Reiser4 specific error codes **/
75328 +
75329 +#define REISER4_ERROR_CODE_BASE 500
75330 +
75331 +/* Neighbor is not available (side neighbor or parent) */
75332 +#define E_NO_NEIGHBOR  (REISER4_ERROR_CODE_BASE)
75333 +
75334 +/* Node was not found in cache */
75335 +#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
75336 +
75337 +/* node does not have enough free space to complete the balancing operation */
75338 +#define E_NODE_FULL    (REISER4_ERROR_CODE_BASE + 2)
75339 +
75340 +/* repeat operation */
75341 +#define E_REPEAT       (REISER4_ERROR_CODE_BASE + 3)
75342 +
75343 +/* deadlock happens */
75344 +#define E_DEADLOCK     (REISER4_ERROR_CODE_BASE + 4)
75345 +
75346 +/* operation cannot be performed, because it would block and non-blocking mode
75347 + * was requested. */
75348 +#define E_BLOCK        (REISER4_ERROR_CODE_BASE + 5)
75349 +
75350 +/* wait for some event (depends on context), then repeat */
75351 +#define E_WAIT         (REISER4_ERROR_CODE_BASE + 6)
75352 +
75353 +#endif                         /* __REISER4_H__ */
75354 +
75355 +/* Make Linus happy.
75356 +   Local variables:
75357 +   c-indentation-style: "K&R"
75358 +   mode-name: "LC"
75359 +   c-basic-offset: 8
75360 +   tab-width: 8
75361 +   fill-column: 120
75362 +   End:
75363 +*/
75364 diff -rupN linux-2.6.8-rc3/fs/reiser4/reiser4_roadmap.html linux-2.6.8-rc3-a/fs/reiser4/reiser4_roadmap.html
75365 --- linux-2.6.8-rc3/fs/reiser4/reiser4_roadmap.html     1970-01-01 03:00:00.000000000 +0300
75366 +++ linux-2.6.8-rc3-a/fs/reiser4/reiser4_roadmap.html   2004-08-05 21:20:53.290616776 +0400
75367 @@ -0,0 +1,1120 @@
75368 +<p> <H1> Reiser4 (Version 4 of ReiserFS)</H1>
75369 +
75370 +<H2> Primary sponsor www.DARPA.mil, regular sponsors applianceware.com and
75371 +bigstorage.com.  DARPA does not endorse this project, it merely
75372 +sponsors it.  </H2>
75373 +<p>Table of Contents:
75374 +
75375 +<p><a href="#new_ext">
75376 +New Extensibility Infrastructure</a>
75377 +<br>&nbsp;&nbsp;
75378 +<a href="#file_plg">
75379 +File Plugins</a>
75380 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75381 +<a href="#dir_plg">
75382 +Directory Plugins</a>
75383 +<br>&nbsp;&nbsp;
75384 +<a href="#hash_plg">
75385 +Hash Plugins</a>
75386 +<br>&nbsp;&nbsp;
75387 +<a href="#security_plg">
75388 +Security Plugins</a>
75389 +<br>&nbsp;&nbsp;
75390 +<a href="#new_plg">
75391 +Putting Your New Plugin To Work Will Mean Recompiling</a>
75392 +<br>&nbsp;&nbsp;
75393 +<a href="#item_plg">
75394 +Item Plugins</a>
75395 +<br>&nbsp;&nbsp;
75396 +<a href="#key_plg">
75397 +Key Assignment Plugins</a>
75398 +<br>&nbsp;&nbsp;
75399 +<a href="#search_plg">
75400 +Node Search and Item Search Plugins</a>
75401 +<br>&nbsp;&nbsp;
75402 +<a href="#backup">
75403 +Backup</a>
75404 +<br>&nbsp;&nbsp;
75405 +<a href="#without_plg">
75406 +Without Plugins We Will Drown</a>
75407 +<br>&nbsp;&nbsp;
75408 +<a href="#steps_crt">
75409 +Steps For Creating A Security Attribute</a>
75410 +<br>&nbsp;&nbsp;
75411 +<a href="#lazy">
75412 +Plugins: FS Programming For The Lazy</a>
75413 +<p>
75414 +<a href="#new_funct">
75415 +New Functionality</a>
75416 +<br>&nbsp;&nbsp;
75417 +<a href="#why_scrt">
75418 +Why Linux Needs To Be Secure</a>
75419 +<br>&nbsp;&nbsp;
75420 +<a href="#fine_scrt">
75421 +Fine Graining Security</a>
75422 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75423 +<a href="#good_scrt">
75424 +Good Security Requires Precision In Specification Of Security</a>
75425 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75426 +<a href="#imprecise_scrt">
75427 +Space Efficiency Concerns Motivate Imprecise Security</a>
75428 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75429 +<a href="#scrt_def">
75430 +Security Definition Units And Data Access Patterns Sometimes Inherently Don't Align</a>
75431 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75432 +<a href="#etc_passwd">
75433 +/etc/passwd As Example</a>
75434 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75435 +<a href="#aggr_files">
75436 +Aggregating Files Can Improve The User Interface To Them</a>
75437 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75438 +<a href="#aggr_modif">
75439 +How Do We Write Modifications To An Aggregation</a>
75440 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75441 +<a href="#inheritance">
75442 +Aggregation Is Best Implemented As Inheritance</a>
75443 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75444 +<a href="#constr">
75445 +Constraints</a>
75446 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75447 +<a href="#audit">
75448 +Auditing</a>
75449 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75450 +<a href="#incr_scrt">
75451 +Increasing the Allowed  Granularity of Security</a>
75452 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75453 +<a href="#files_dirs">
75454 +Files That Are Also Directories</a>
75455 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75456 +<a href="#hidden_dir">
75457 +Hidden Directory Entries</a>
75458 +<br>&nbsp;&nbsp;
75459 +<a href="#new_scrt">
75460 +New Security Attributes and Set Theoretic Semantic Purity</a>
75461 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75462 +<a href="#min_num">
75463 +Minimizing Number Of Primitives Is Important In Abstract Constructions</a>
75464 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75465 +<a href="#compose_streams">
75466 +Can We Get By Using Just Files and Directories (Composing Streams And Attributes From Files And Directories)?</a>
75467 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75468 +<a href="#list_features">
75469 +List Of Features Needed To Get Attribute And Stream Functionality From Files And Directories</a>
75470 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75471 +<a href="#mnt_fs">
75472 +Mounting FS Flavors</a>
75473 +<br>&nbsp;&nbsp;
75474 +<a href="#api">
75475 +API Suitable For Accessing Files That Store Security Attributes</a>
75476 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75477 +<a href="#flaws">
75478 +Flaws In Traditional File API When Applied To Security Attributes</a>
75479 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75480 +<a href="#resolution">
75481 +The Usual Resolution Of These Flaws Is A One-Off Solution</a>
75482 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75483 +<a href="#solution">
75484 +One-Off Solutions Are A Lot of Work To Do A Lot Of</a>
75485 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75486 +<a href="#syscall">
75487 +reiser4() System Call Description</a>
75488 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75489 +<a href="#tr_tr">
75490 +Transactions and Transcrashes:</a>
75491 +<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
75492 +<a href="#tr_necessary">
75493 +Transactions Are Necessary Safeguard Against Certain Race Condition Exploits</a>
75494 +<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
75495 +<a href="#transcr">
75496 +Transcrashes</a>
75497 +<br>
75498 +<a href="#performance">
75499 +Performance Enhancements</a>
75500 +<br>&nbsp;&nbsp;
75501 +<a href="#dancing">
75502 +Dancing Trees Are Faster Than Balanced Trees</a>
75503 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75504 +<a href="#if_ram">
75505 +If It Is In RAM, Dirty, and Contiguous, Then Squeeze It ALL
75506 +Together Just Before Writing</a>
75507 +<br>&nbsp;&nbsp;
75508 +<a href="#repacker">
75509 +Repacker</a>
75510 +<br>&nbsp;&nbsp;
75511 +<a href="#commit">
75512 +Encryption On Commit</a>
75513 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75514 +<a href="#wand_lgs">
75515 +Wandering Logs</a>
75516 +<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
75517 +<a href="#more_detailed">
75518 +(More detailed treatment soon to be available at www.namesys.com/transactions.html by Joshua MacDonald.)</a>
75519 +<br>&nbsp;&nbsp;&nbsp;&nbsp;
75520 +<a href="#conclusion">
75521 +Conclusion</a>
75522 +
75523 +
75524 +<p>
75525 +<a name="new_ext"></a>
75526 +<H1> New Extensibility Infrastructure </H1>
75527 +<p>
75528 +It takes more than a license to make source code open, it takes a design.
75529 +
75530 +<p> Reiser4 will focus on extensibility.  Plugins ala photoshop but
75531 +for files and directories.  This is necessary if we are to enable
75532 +vendors to DARPA (including ourselves) to cost effectively add
75533 +substantial numbers of new security features to Reiser4.
75534 +
75535 +<p>
75536 +Imagine that you were an experimental physicist who had spent his life
75537 +using only the tools that were in his local hardware store.  Then one
75538 +day you joined a major research lab with a machine shop, and a whole
75539 +bunch of other physicists.  All of a sudden you are not using just
75540 +whatever tools the large tool companies who have never heard of you
75541 +have made for you, you are part of a cooperative of physicists all
75542 +making your own tools, swapping tools with each other, suddenly
75543 +empowered to have tools that are exactly what you want them to be, or
75544 +even merely exactly what your colleagues want them to be, rather than
75545 +what some big tool company wants them to be.  That is the transition
75546 +you will make when you go from version 3 to version 4 of ReiserFS.
75547 +The tools your colleagues and sysadmins (your machinists) make are
75548 +going to be much better for what you need.
75549 +<a name="file_plg"></a>
75550 +<h2>File Plugins</H2>
75551 +<p>
75552 +Every object (file or directory) will possess a plugin id.  This
75553 +plugin id will identify a set of methods.  The set of methods will
75554 +embody all of the different possible interactions with the object that
75555 +come from sources external to reiserfs.  It is a layer of indirection
75556 +added between the external interface to reiserfs, and the rest of
75557 +reiserfs.  Each method will have a methodid.  It will be usual to mix
75558 +and match methods from other plugins when composing plugins.
75559 +
75560 +<a name="dir_plg"></a>
75561 +<h3>Directory Plugins</h3> <p>Reiser4 will implement a plugin for
75562 +traditional directories, and it will implement directory style access
75563 +to file attributes as part of the plugin for regular files.  Later we
75564 +will describe why this is useful.  Other directory plugins we will
75565 +leave for later versions.  There is no deep reason for this
75566 +deferral. It is simply the randomness of what features attract sponsors
75567 +and make it into a release specification, and there are no sponsors at
75568 +the moment for additional directory plugins.  I have no doubt that
75569 +they will appear later; new directory plugins will be too much fun to
75570 +miss out on.:-)
75571 +
75572 +<p>
75573 +<a name="hash_plg"></a>
75574 +<H2>Hash Plugins</H2>
75575 +
75576 +<p>Hash plugins already exist in version 3, and if you know what they
75577 +are this paragraph says nothing new.  To coexist with NFS we must be
75578 +able to hand out 64 bit "cookies" that can be used to resume a
75579 +readdir.  Cookies are implemented in most filesystems as byte offsets
75580 +within a directory (which means they cannot shrink directories), and
75581 +in reiserfs as hashes of filenames plus a generation counter.  We
75582 +order directory entries in reiserfs by their cookies.  This costs us
75583 +performance compared to ordering lexicographically (but is immensely
75584 +faster than the linear searching employed by most other Unix
75585 +filesystems), and depending on the hash and its match to the
75586 +application usage pattern there may be more or less performance
75587 +lossage.  Hash plugins will probably remain until version 5 or so,
75588 +when directory plugins and ordering function plugins will obsolete
75589 +them, and directory entries will be ordered by filenames like they
75590 +should be (and possibly stem compressed as well).
75591 +
75592 +<a name="security_plg"></a>
75593 +<H2>Security Plugins</H2>
75594 +<p>
75595 +Security plugins handle all security checks.  They are normally
75596 +invoked by file and directory plugins.
75597 +
75598 +<p>
75599 +<ul>Example of reading a file:
75600 +<li>Access the pluginid for the file.
75601 +<li>Invoke the read method for the plugin.
75602 +<li>The read method determines the security plugin for the file.
75603 +<li>That security plugin invokes its read check method for determining whether to permit the read.
75604 +<li>The read check method for the security plugin reads file/attributes containing the permissions on the file
75605 +<li>Since file/attributes are also files, this means invoking the plugin for reading the file/attribute.
75606 +<li>The pluginid for this particular file/attribute for this file happens to be inherited (saving space and centralizing control of it).
75607 +
75608 +<li>The read method for the file/attribute is coded such that it does
75609 +not check permissions when called by a sec plug method.  (Endless
75610 +recursion is thereby avoided.)
75611 +
75612 +<li>The file/attribute plugin employs a decompression algorithm
75613 +specially designed for efficient decompression of our encoding of
75614 +ACLs.
75615 +
75616 +<li>The security plugin determines that the read should be permitted.
75617 +<li>The read method continues and completes.
75618 +</ul>
75619 +
75620 +<a name="new_plg"></a>
75621 +<H2>Putting Your New Plugin To Work Will Mean Recompiling</H2>
75622 +<p>If you want to add a new plugin, we think your having to ask the sysadmin to
75623 +recompile the kernel with your new plugin added to it will be
75624 +acceptable for version 4.0.  We will initially code plugin-id lookup
75625 +as an in-kernel fixed length array lookup, methodids as function
75626 +pointers, and make no provision for post-compilation loading of
75627 +plugins.  Performance, and coding cost, motivates this.
75628 +<a name="item_plg"></a>
75629 +<H2>Item Plugins</H2>
75630 +<p>
75631 +The balancing code will be able to balance an item iff it has an item
75632 +plugin implemented for it.  The item plugin will implement each of
75633 +the methods the balancing code needs (methods such as splitting items,
75634 +estimating how large the split pieces will be, overwriting, appending
75635 +to, cutting from, or inserting into the item, etc.)
75636 +<p>
75637 +In addition to all of the balancing operations, item plugins will also
75638 +implement intra-item search plugins.
75639 +<p>
75640 +Our current code understands the structure of the items it balances.
75641 +This makes adding new types of items storing such new security
75642 +attributes as other researchers develop too expensive in coding time,
75643 +greatly inhibiting the addition of them to ReiserFS.  We anticipate
75644 +that there will be a great proliferation in the types of security
75645 +attributes in ReiserFS if and only if we are able to make it a matter
75646 +requiring not a modification of the balancing code by our most
75647 +experienced programmers, but the writing of an item handler.  This is
75648 +necessary if we are to achieve our goal of making the adding of each
75649 +new security attribute an order of magnitude or more easier to perform
75650 +than it is now.
75651 +
75652 +<a name="key_plg"></a>
75653 +<H2>Key Assignment Plugins</H2>
75654 +<p>
75655 +
75656 +When assigning the key to an item, the key assignment plugin will be
75657 +invoked, and it will have a key assignment method for each item type.
75658 +A single key assignment plugin is defined for the whole FS at FS
75659 +creation time.  We know from experience that there is no "correct" key
75660 +assignment policy, squid has very different needs from average user
75661 +home directories.  Yes, there could be value in varying it more
75662 +flexibly than just at FS creation time, but we have to draw the line
75663 +somewhere when deciding what goes into each release....
75664 +
75665 +<a name="search_plg"></a>
75666 +<H2>Node Search and Item Search Plugins</H2>
75667 +<P>
75668 +Every node layout will have a search method for that layout, and every
75669 +item that is searched through will have a search method for that item.
75670 +(When doing searches, we search through a node to find an item, and
75671 +then search within the item for those items that contain multiple
75672 +things to find.)
75673 +<a name="backup"></a>
75674 +<h2>Backup</h2>
75675 +<p>
75676 +We need to modify tar to record plugin ids.  Some plugins may require special treatment.
75677 +<a name="without_plg"></a>
75678 +<H2>Without Plugins We Will Drown</H2>
75679 +<P>
75680 +People often ask, as ReiserFS grows in features, how will we keep the
75681 +design from being drowned under the weight of the added complexity,
75682 +and from reaching the point where it is difficult to work on the code?
75683 +<p>
75684 +The infrastructure to support security attributes implemented as files
75685 +also enables lots of features not necessarily security related.  The
75686 +plugins we are choosing to implement in v4.0 are all security related
75687 +because of our funding source, but users will add other sorts of
75688 +plugins just as they took DARPA's TCP/IP and used it for non-military
75689 +computers.  Only requiring that all features be implemented in the
75690 +manner that maximizes code reuse keeps ReiserFS coding complexity down
75691 +to where we can manage it over the long term.
75692 +
75693 +<a name="steps_crt"></a>
75694 +<H2>Steps For Creating A Security Attribute</H2>
75695 +<p>
75696 +Once this infrastructure has been created, you will be able to create a new security attribute by:
75697 +<ul>
75698 +<li>defining a pluginid
75699 +<li>composing a set of methods for the plugin from ones you create or reuse from other existing plugins
75700 +<li>defining a set of items that act as the storage containers of the object, or reusing existing items from other plugins (e.g. regular files)
75701 +<li>implementing item handlers for all of the new items you create
75702 +<li>creating a key assignment algorithm for all of the new items
75703 +<li>implementing a search handler for every item you create that requires searching within it (perhaps search methods should be considered part of the item handler, we shall see while implementing it)
75704 +</ul>
75705 +
75706 +<a name="lazy"></a>
75707 +<H2>Plugins: FS Programming For The Lazy</H2>
75708 +<p>
75709 +The important feature here is that in practice most plugins will have
75710 +only a very few of these features unique to them, and the rest of the
75711 +plugin will be reused code.  This is how we will reduce adding new
75712 +security attributes to a task requiring a few weeks work: by first
75713 +creating the right tools for the job, and only then starting work on
75714 +it.  Our ambition is to have two orders of magnitude more security
75715 +features than we otherwise would in 5 years, by first making it an
75716 +order of magnitude less work to add them to reiser4, and then
75717 +attracting an order of magnitude more security attribute developers
75718 +because of that.  What DARPA is paying for here, is primarily not a
75719 +suite of security plugins from Namesys, though it is getting that, but
75720 +an architectural (not just the license) enabling of lots of outside vendors
75721 +to efficiently create lots of innovative security plugins that Namesys
75722 +would never have imagined if working by itself as a supplier.
75723 +
75724 +<a name="new_funct"></a>
75725 +<H1> New Functionality</H1>
75726 +<a name="why_scrt"></a>
75727 +<H2> Why Linux Needs To Be Secure</H2>
75728 +<p>
75729 +The world is sadly changing.  It used to be that there was no spam,
75730 +because it was not socially acceptable.  Now there is spam.  It used
75731 +to be that security attacks on civilian computers were infrequent,
75732 +because only unbalanced teenage boys had nothing better to do.  This
75733 +is changing in much the same way.
75734 +<p>
75735 +The communist government of China has attacking US information
75736 +infrastructure as part of its military doctrine.  Linux computers are
75737 +the bricks the US (and global) civilian information infrastructure is
75738 +being built from.  It is in the US (and global) interest that Linux
75739 +become SECURELY neutral, so that when large US (or elsewhere) banks
75740 +use Linux, and the US (or anyone else) experiences an attack, the
75741 +infrastructure does not go down.  Chinese crackers are known to have
75742 +compromised a computer forming a part of the California power grid....
75743 +<p>
75744 +It used to be that most casualties in wars were to combatants.  Now
75745 +they are mostly to civilians.  In future information infrastructure
75746 +attacks, who will take more damage, civilian or military
75747 +installations?  DARPA is funding us to make all Linux computers more
75748 +resistant to attack.
75749 +<p>
75750 +<a name="fine_scrt"></a>
75751 +<H2>Fine Graining Security</h2>
75752 +<p>
75753 +<a name="good_scrt"></a>
75754 +<H3>Good Security Requires Precision In Specification Of Security</H3>
75755 +<p>
75756 +Suppose you have a large file, and this file has many components.  One
75757 +of the themes of SE Linux is that Unix security is insufficiently fine
75758 +grained.  This is a general principle of security, that good security
75759 +requires precision of permissions.  When security lacks precision, it
75760 +increases the burden of being secure, and the extent to which users
75761 +adhere to security requirements in practice is a function of the
75762 +burden of adhering to it.
75763 +<p>
75764 +<a name="imprecise_scrt"></a>
75765 +<H3>Space Efficiency Concerns Motivate Imprecise Security</H3>
75766 +<p>
75767 +Many filesystems make it space-inefficient to store small components as
75768 +separate files for various reasons.  Not being separate
75769 +files means that they cannot have separate permissions.  One of the
75770 +reasons for using overly aggregated units of security is space
75771 +efficiency.  ReiserFS currently improves this by an order of magnitude
75772 +over most of the existing alternative art.  Space efficiency is the
75773 +hardest of the reasons to eliminate, and its elimination makes it that
75774 +much more enticing to attempt to eliminate the other reasons.
75775 +
75776 +<a name="scrt_def"></a>
75777 +<h3>Security Definition Units And Data Access Patterns Sometimes Inherently Don't Align</h3>
75778 +<p>
75779 +Applications sometimes want to operate on a collection of components
75780 +as a single aggregated stream.  (Note that commonly two different
75781 +applications want to operate on data with different levels of
75782 +aggregation, and the infrastructure for solving this as a security
75783 +issue will also solve that problem.)
75784 +
75785 +<a name="etc_passwd"></a>
75786 +<h3>/etc/passwd As Example</h3>
75787 +<p>
75788 +I am going to use the /etc/passwd file as an example, not because I
75789 +think that other aspects of SE Linux won't solve its problems better,
75790 +but because the implementation of it as a single flat file in the
75791 +early Unixes is a wonderful illustrative example of poorly
75792 +granularized security that the readers may share my personal
75793 +experiences with, and then I hope they will be able to imagine that
75794 +other data files less famous could have similar problems.
75795 +<p>
75796 +Have you ever tried to figure out just exactly what part of the
75797 +/etc/passwd file changed near the time of a break-in?  Have you ever
75798 +wished that you could have a modification time on each field in it?
75799 +Have you ever wished the users could change part of it, such as the
75800 +gecos field, themselves (setuid utilities have been written to allow
75801 +this, but this is a pedagogical not a practical example), but not have
75802 +the power to change it for other users?
75803 +<p>
75804 +There were good reasons why
75805 +/etc/passwd was first implemented as a single file with one single
75806 +permission governing the entire file.  If we can eliminate them one by
75807 +one, the same techniques for making finer grained security effective
75808 +will be of value to other highly secure data files.
75809 +
75810 +<a name="aggr_files"></a>
75811 +<h3>Aggregating Files Can Improve The User Interface To Them</h3>
75812 +<p>
75813 +Consider the use of emacs on a collection of a thousand small 8-32
75814 +byte files like you might have if you deconstructed /etc/passwd into
75815 +small files with separable acls for every field.  It is more
75816 +convenient in screen real estate, buffer management, and other user
75817 +interface considerations, to operate on them as an aggregation all
75818 +placed into a single buffer rather than as a thousand 8-32 byte
75819 +buffers.
75820 +
75821 +<a name="aggr_modif"></a>
75822 +<h3>How Do We Write Modifications To An Aggregation</h3>
75823 +<p>
75824 +Suppose we create a plugin that aggregates all of the files in a
75825 +directory into a single stream.  How does one handle writes to that
75826 +aggregation that change the length of the components of that
75827 +aggregation?
75828 +
75829 +<p>Richard Stallman pointed out to me that if we separate the
75830 +aggregated files with delimiters, then emacs need not be changed at
75831 +all to acquire an effective interface for large numbers of small files
75832 +accessed via an aggregation plugin.  If
75833 +/new_syntax_access_path/big_directory_of_small_files/.glued is a
75834 +plugin that aggregates every file in big_directory_of_small_files with
75835 +a delimiter separating every file within the aggregation, then one can
75836 +simply type emacs
75837 +/new_syntax_access_path/big_directory_of_small_files/.glued, and the
75838 +filesystem has done all the work emacs needs to be effective at this.
75839 +Not a line of emacs needs to be changed.
75840 +<p>
75841 +One needs to be able to choose different delimiting syntax for
75842 +different aggregation plugins so that one can, for say the passwd
75843 +file, aggregate subdirectories into lines, and files within those
75844 +subdirectories into colon separated fields within the line.  XML would
75845 +benefit from yet other delimiter construction rules.  (We have been
75846 +told by one XML company (need link to testimonial here) that ReiserFS is
75847 +higher performance than any other "database" for storing XML.)
75848 +
75849 +<a name="inheritance"></a>
75850 +<h3>Aggregation Is Best Implemented As Inheritance</h3>
75851 +
75852 +In summary, to be able to achieve precision in security we need to
75853 +have inheritance with specifiable delimiters, and we need whole file
75854 +inheritance to support ACLs.
75855 +
75856 +<a name="constr"></a>
75857 +<h3>Constraints</h3>
75858 +
75859 +<p>
75860 +Another way security may be insufficiently fine grained is in values:
75861 +it can be useful to allow persons to change data but only within
75862 +certain constraints.  For this project we will implement plugins, and
75863 +one type of plugin will be write constraints.  Write-constraints are
75864 +invoked upon write to a file, and if they return non-error then the
75865 +write is allowed.  We will implement two trivial sample
75866 +write-constraint plugins, one in the form of a kernel function
75867 +loadable as a kernel module which returns non-error (thus allowing the
75868 +write) if the file consists of the strings "secret" or "sensitive" but
75869 +not "top-secret", and another in the form of a perl program residing
75870 +in a file and executed in user space, which does exactly the same.
75871 +Use of kernel functions will have performance advantages, particularly
75872 +for small functions, but severe disadvantages in power of scripting,
75873 +flexibility, and ability to be installed by non-secure sources.  Both
75874 +types of plugins will have their place.
75875 +
75876 +<p>Note that ACLs will also embody write constraints.
75877 +
75878 +<p>
75879 +We will implement constraints that are compiled into the kernel, and
75880 +constraints that are implemented as user space processes.
75881 +Specifically, we will implement a plugin that executes an arbitrary
75882 +constraint contained in an arbitrary named file as a user space
75883 +process, passes the proposed new file contents to that process as
75884 +standard input, and iff the process exits without error allows the
75885 +write to occur.
75886 +
75887 +<p>
75888 +It can be useful to have read constraints as well as write constraints.
75889 +<p>
75890 +<a name="audit"></a>
75891 +<H3>Auditing </H3>
75892 +<p>
75893 +We will implement a plugin that notifies administrators by email when
75894 +access is made to files, e.g. read access.
75895 +
75896 +<p>With each plugin implemented, creating additional plugins becomes
75897 +easier as the available toolkit is enriched.  Auditing constitutes a
75898 +major additional security feature, yet it will be easy to implement
75899 +once the infrastructure to support it exists (and it would be
75900 +substantial work to implement it without that infrastructure).
75901 +<p>
75902 +The scope of this project is not the creation of plugins themselves,
75903 +but the creation of the infrastructure that plugin authors would find
75904 +useful.  We want to enable future contractors to the DoD (and US
75905 +financial institutions, PGP Security developers working on SE Linux,
75906 +etc.), to implement more secure systems on the Linux platform, not
75907 +implement them ourselves.  By laying a proper foundation and creating
75908 +a toolkit for them, we hope to reduce the cost of coding new security
75909 +attributes by an order of magnitude for those who follow us.
75910 +Employing a proper set of well orthogonalized primitives also changes
75911 +the addition of these attributes from being a complexity burden upon
75912 +the architecture into being an empowering extension of the
75913 +architecture, which greatly increases their acceptability for
75914 +ReiserFS.
75915 +
75916 +<a name="incr_scrt"></a>
75917 +<H3>Increasing the Allowed  Granularity of Security</H3>
75918 +<p>
75919 +Inheritance of security attributes is important to providing
75920 +flexibility in their administration.  We have spoken about making
75921 +security more fine grained, but sometimes it needs to be larger
75922 +grained.  Sometimes a large number of files are logically one unit in
75923 +regards to their security, and it is desirable to have a single point
75924 +of control over their security.  Inheritance of attributes is the
75925 +mechanism for implementing that.  Security administrators should have
75926 +the power to choose whatever units of security they desire, without
75927 +having to distort them to make them correspond to semantic units.
75928 +Inheritance of file bodies using aggregation plugins allows the units
75929 +of security to be smaller than files, inheritance of attributes allows
75930 +them to be larger than files.
75931 +
75932 +<a name="files_dirs"></a>
75933 +<h3>Files That Are Also Directories</h3>
75934 +<p>
75935 +In Reiser4 (but not ReiserFS 3) an object can be both a file and a
75936 +directory at the same time.  If you access it as a file, you obtain
75937 +the named sequence of bytes, and if you use it as a directory you can
75938 +obtain files within it, directory listings, etc.  There was a lengthy
75939 +discussion on the Linux kernel about whether this was technically
75940 +feasible to do which I won't reproduce here except to summarize that
75941 +Linus showed that it was feasible.
75942 +<p>
75943 +Allowing an object to be both a file and a directory is one of the
75944 +features necessary to compose the functionality present in streams
75945 +and attributes using files and directories.
75946 +<p>
75947 +<a name="hidden_dir"></a>
75948 +<h3>Hidden Directory Entries</h3>
75949 +<p>
75950 +A file can exist, but not be visible when using readdir in the usual
75951 +way.  WAFL does this with the .snapshots directory, and it works well
75952 +for them without disturbing users.  This is useful for adding access
75953 +to a variety of new features without disturbing the user and
75954 +applications with them when they are not relevant.  An interesting
75955 +question is whether we should have all of these hidden files have the
75956 +same name prefix (e.g. '..' at the start of the hidden name), or not.
75957 +I am still soliciting input on this.  Note that this feature should be
75958 +used for special files that one does not want to be backed up.
75959 +<p>
75960 +<a name="new_scrt"></a>
75961 +<H2>New Security Attributes and
75962 +Set Theoretic Semantic Purity</H2>
75963 +<p>
75964 +<a name="min_num"></a>
75965 +<h3>Minimizing Number Of Primitives Is Important In Abstract Constructions</h3>
75966 +<p>
75967 +To a theoretician, it is extremely important to minimize the number of
75968 +primitives with which one achieves the desired functionality in an
75969 +abstract construction.  It is a bit hard to explain why this is so,
75970 +but it is well accepted that breaking an abstract model into more
75971 +basic primitives is very important.  A not very precise explanation of
75972 +why, is to say that if you have complex primitives, and you break them
75973 +into more basic primitives, then by combining those basic primitives
75974 +differently from how they were originally combined in the complex
75975 +primitives, you can usually express new things that the complex
75976 +primitives did not express.  Let's follow this grand tradition of
75977 +theoreticians, and see what happens if we apply it to Linux files and
75978 +directories.
75979 +<a name="compose_streams"></a>
75980 +<h3>Can We Get By Using Just Files and Directories (Composing Streams And Attributes From Files And Directories)?</h3>
75981 +<p>
75982 +In Linux we have files, directories, and attributes.  In NTFS they
75983 +have streams also.  Since Samba is important to Linux, there are
75984 +frequently requests that we add streams to ReiserFS.  There are also
75985 +requests that we add more and more different kinds of attributes using
75986 +more and more different APIs.  Can we do everything that can be done
75987 +with {files, directories, attributes, streams} using just {files,
75988 +directories}?  I say yes, if we make files and directories more powerful and flexible, and I hope that by the end of reading this you
75989 +will agree.
75990 +<p>Let us have two basic objects.  A file is a sequence of bytes that has a name.  A directory is a namespace mapping names to a set of objects "within" the directory.  We connect these directory namespaces such that one can use compound names whose subcomponents are separated by a delimiter '/'.
75991 +What is missing from files and directories now that attributes and streams offer?
75992 +<p>In ReiserFS 3, there exist file attributes.  File attributes are out-of-band data describing the sequence of bytes which is the file.  For example, the permissions defining who can access a file, or the last modification time, are file attributes.
75993 +File attributes have their own API, and creating new file attributes creates new code complexity and compatibility issues galore.  ACLs are one example of new file attributes users want.
75994 +<p>
75995 +Since files can also be directories in Reiser4, then we can implement
75996 +traditional file attributes as simply files.  To access a file
75997 +attribute, one need merely name the file, followed by a '/', followed
75998 +by an attribute name.  That is, a traditional file will be implemented
75999 +to possess some of the features of a directory, it will contain files
76000 +within the directory corresponding to file attributes which you can
76001 +access by their names, and it will contain a file body which is what
76002 +you access when you name the "directory" not the file.
76003 +<p>
76004 +Unix currently has a variety of attributes that are distinct from
76005 +files (ACLs, permissions, timestamps, other mostly security related
76006 +attributes....).  This is because a variety of persons needed this
76007 +feature and that, and there was no infrastructure that would allow
76008 +implementing the features as fully orthogonal features that could be
76009 +applied to any file.  Reiser4 will create that infrastructure.
76010 +<a name="list_features"></a>
76011 +<h3>List Of Features Needed To Get Attribute And Stream Functionality From Files And Directories</h3>
76012 +<ul>
76013 +<li>api efficient for small files
76014 +<li>efficient storage for small files
76015 +<li>plugins, including plugins that can compress a file serving as an attribute into a single bit
76016 +<li>files that also act as directories when accessed as directories
76017 +<li>inheritance (includes file aggregation)
76018 +<li>constraints
76019 +<li>transactions
76020 +<li>hidden directory entries
76021 +</ul>
76022 +<p>
76023 +The reader is asked to note that each of these additional features is a feature that the filesystem would benefit by the addition of anyway.  So we add them in v4.
76024 +<a name="mnt_fs"></a>
76025 +<H3>Mounting FS Flavors</H3>
76026 +<p>
76027 +Making these attributes accessible via filenames implies a slight
76028 +deviation from Unix tradition.  If we create a way for this deviation
76029 +to not be visible to those who don't want it, it paradoxically gives
76030 +us more freedom to deviate without getting paranoid about the effects
76031 +on existing applications.
76032 +
76033 +<p>
76034 +A strict POSIX filesystem API will be implemented as a restricted
76035 +functionality namespace obtained when mounting with --POSIX-only, and
76036 +it will be possible, and even usual, to mount the filesystem both with
76037 +and without --rich-semantics simultaneously each at different mount
76038 +points.  Note that Al Viro has done work in VFS to make this more
76039 +feasible, which is nice.
76040 +<p> "reiser4" will be a distinct filesystem type from "reiserfs" in
76041 +the eyes of the mount command.  Upon the completion of reiser4, we
76042 +will evaluate the relative costs of implementing a conversion script,
76043 +or supporting mounting "reiserfs" format filesystems using "reiser4".
76044 +Under no circumstance will we make it impossible to mount an old
76045 +"reiserfs" formatted filesystem, though users may or may not be able
76046 +to mount them as type "reiser4" --- this is not yet determined or
76047 +funded.
76048 +
76049 +<a name="api"></a>
76050 +<H2>API Suitable For Accessing Files That Store Security Attributes</H2>
76051 +
76052 +<p>A new system call reiser4() will be implemented to support
76053 +applications that don't have to be fooled into thinking that they are
76054 +using POSIX, and through this entry point a richer set of semantics
76055 +will access the same files that are also accessible using POSIX calls.
76056 +reiser4() will not implement more than hierarchical names; a full set
76057 +theoretic naming system as described on our future vision page will
76058 +not be implemented before reiser5() is implemented.  reiser4() will
76059 +implement all features necessary to access ACLs as files/directories
76060 +rather than as something neither file nor directory.  This includes
76061 +opening and closing transactions, performing a sequence of I/Os in one
76062 +system call, and accessing files without use of file descriptors
76063 +(necessary for efficient small I/O).  It will do it with a syntax
76064 +suitable for evolving into reiser5() syntax with its set theoretic
76065 +naming.
76066 +
76067 +<a name="flaws"></a>
76068 +<h3>Flaws In Traditional File API When Applied To Security Attributes</h3>
76069 +Security related attributes tend to be small.  The traditional filesystem API for reading and writing files has these flaws in the context of accessing security attributes:
76070 +<ul>
76071 +<li>Creating a file descriptor is excessive overhead and not useful when accessing an 8 byte attribute.
76072 +<li>A system call for every attribute accessed is too much overhead when accessing lots of little attributes.
76073 +<li>Lacking constraints:  it is important to constrain what is written to the attribute, often in complex ways.
76074 +<li>Lacking transactional semantics: Often one needs to update multiple attributes as one action that is guaranteed to either fully succeed or fully fail.
76075 +</ul>
76076 +<a name="resolution"></a>
76077 +<h3>The Usual Resolution Of These Flaws Is A One-Off Solution</h3>
76078 +<p>
76079 +The usual response to these flaws is that persons adding security
76080 +related and other attributes create a set of methods unique to their
76081 +attributes, plus non-reusable code to implement those methods in which
76082 +their particular attributes are accessed and stored not using the
76083 +methods for files, but using their particular methods for that
76084 +attribute.  Their particular API for that attribute typically does a
76085 +one-off instantiation of a lightweight single system call write
76086 +constrained atomic access with no code being reusable by those who
76087 +want to modify file bodies.  It is very basic and crucial to system
76088 +design to decompose desired functionality into reusable orthogonal
76089 +separated components.  Persons designing security attributes are
76090 +typically doing it without the filesystem that they want to add them
76091 +to offering them a proper foundation and toolkit.  They need more help
76092 +from us the core FS developers.  Linus said that we can have a system
76093 +call to use as our experimental plaything in this, and with what I
76094 +have in mind for the API, one rather flexible system call is all we
76095 +want for creating transactional lightweight batched constrained
76096 +accesses to files, with each of those adjectives to accesses being an
76097 +orthogonal optional feature that may or may not be invoked in a
76098 +particular instance of the new system call.
76099 +
76100 +<a name="solution"></a>
76101 +<h3>One-Off Solutions Are A Lot of Work To Do A Lot Of</h3>
76102 +<P>Looking at the coin from the other side, we want to make it an
76103 +order of magnitude less work to add features to ReiserFS, so that both
76104 +users and Namesys can add at least an order of magnitude more of them.
76105 +To verify that it is truly more extensible you have to do some
76106 +extending, and our DARPA funding motivates us to instantiate most of
76107 +those extensions as new security features.
76108 +
76109 +<p>This system call's syntax enables attributes to be implemented as a
76110 +particular type of file --- it avoids uglifying the semantics with two
76111 +APIs for two supposedly but not needfully different kinds of objects.
76112 +All of its special features that are useful for accessing particular
76113 +attributes are all available for use on files also.  It has symmetry,
76114 +and its features have been fully orthogonalized.  There will be
76115 +nothing particularly interesting about this system call to a languages
76116 +specialist (its ideas are decades old except to filesystem
76117 +developers) until Reiser6, when we will further evolve it into a set theoretic
76118 +syntax that deconstructs tuple structured names into ordered set, and
76119 +unordered set, name components.  That is described at
76120 +www.namesys.com/future_vision.html
76121 +<a name="syscall"></a>
76122 +<h3>reiser4() System Call Description</h3>
76123 +<p>The reiser4() system call will contain a sequence of commands
76124 +separated by a separator (comma only for now).
76125 +
76126 +<p>
76127 +Assignment, and transaction, will be the commands supported in
76128 +reiser4(); more commands will appear in reiser5.  => and <= will be
76129 +the assignment operators.
76130 +
76131 +<ul>lhs
76132 +(assignment target) values:
76133 +
76134 +<li> /process/buffer/first_byte/last_byte/bytes_written assigns
76135 +(writes) to the buffer starting at address first_byte in the process
76136 +address space, ending at last_byte, with the number of bytes actually
76137 +written (assignment source may be smaller or larger than assignment
76138 +target) being written to address bytes_written.  Representation of
76139 +first_byte,last_byte, and bytes_written is left to the coder to
76140 +determine, as it is an issue that will be of much dispute and little
76141 +importance.  Notice how / is used to indicate that the order of the
76142 +operands matters, see www.namesys.com/future_vision.html for details
76143 +of why this is appropriate syntax design.  Notice the lack of a file
76144 +descriptor.
76145 +
76146 +<li>/filename assigns to the file named filename, wholly obliterating
76147 +its body with what is assigned.
76148 +
76149 +<li>/filename/..range/first_byte/last_byte/bytes_written writes to the
76150 +body, starting at first_byte, ending not past last_byte, recording
76151 +number of bytes written in bytes_written
76152 +
76153 +<li>/filename/..offset/offset_byte writes to the body starting at offset
76154 +
76155 +<li>/..process/..range/first_byte/last_byte/bytes_written writes to
76156 +the process address space, starting at first_byte, ending not past
76157 +last_byte, recording number of bytes actually written in bytes_written
76158 +</ul>
76159 +<ul>rhs (assignment source) values:
76160 +
76161 +<li> /process/buffer/first_byte/last_byte/bytes_read reads from the
76162 +buffer starting at address first_byte in the process address space,
76163 +ending at last_byte, with the number of bytes actually read
76164 +(assignment source may be smaller or larger than assignment target)
76165 +being written to address bytes_read.  Representation of first_byte,
76166 +last_byte, and bytes_read is left to the coder to determine, as it is
76167 +an issue that will be of much dispute and little importance.
76168 +
76169 +<li>/filename reads the entirety of the file named filename.
76170 +
76171 +<li>/filename/..range/first_byte/last_byte/bytes_read reads from the
76172 +body, starting at first_byte, ending not past last_byte, recording
76173 +number of bytes read in bytes_read
76174 +
76175 +<li>/filename/..offset/offset_byte/bytes_read reads from the body
76176 +starting at offset until the end
76177 +
76178 +<li>/filename/..stat/owner reads from the ownership field of the stat
76179 +data (stat data is that which is returned by the stat() system call (owner, permissions,
76180 +etc.)
76181 +and stored on a per file basis by the FS)
76182 +
76183 +<li>/filename/..nonbody returns a delimiter separated aggregation of
76184 +all parts of the file except the body of the file (owner, permissions,
76185 +ACLs, etc.).
76186 +
76187 +</ul>
76188 +
76189 +<a name="tr_tr"></a>
76190 +<H3>Transactions and Transcrashes:</H3>
76191 +<a name="tr_necessary"></a>
76192 +<h4>Transactions Are Necessary Safeguard Against Certain Race Condition Exploits</h4>
76193 +
76194 +(This section to be replaced with link to Josh MacDonald paper when that is complete.)
76195 +<p>
76196 +Recently, a security exploit was discovered in all versions of the MIT Kerberos secure authentication system due to unsafe handling of temporary files [Bugtraq, 3/7/2001].
76197 +http://www.linuxsecurity.net/advisories/other_advisory-1204.html
76198 +<p>
76199 +During the process of generating a new ticket, the attacker creates a symbolic link that redirects the ticket file being written to an arbitrary location.  This kind of vulnerability is quite common, unfortunately, due to inherent weaknesses of the traditional POSIX file system interface.  There is no primitive support for an operation that atomically tests for the existence of a symbolic link prior to opening that location, not without vulnerability to races. The solution posted in the Kerberos incident does not completely eliminate the vulnerability.  Instead, vulnerability is greatly reduced through programmer vigilance (provided a few assumptions).  The existing file system interface leaves open potential vulnerabilities such as this, by default, due to the fact that it is a stateless interface.  In general, lacking transactions the result of a file system read cannot be trusted for security decisions; the instant a value is returned it may be out of date.
76200 +<p>
76201 +When security is a concern and the application is sufficiently important that it can be modified to conform with more secure interfaces, there is an easy solution to these problems --- transactions.  Transactions provide the framework for strict, fine-grained locking that is used to extend the atomicity of individual operations into an atomic sequence of operations.  In the Kerberos example, the ticket-writing application would instead issue a sequence of operations to:
76202 +<ul>
76203 +<li>lookup the file to be written,
76204 +<li>make security checks on that file,
76205 +<li>open the file for writing
76206 +<li>output data.
76207 +</ul>
76208 +<p>The transaction framework provides a context for ensuring that a security check remains consistent throughout the resulting operation.
76209 +<p>
76210 +Transactions also provide critical support for extensibility (i.e.,
76211 +plugins), since the system is able to automatically recover from
76212 +partial component failures, and transactions are necessary to support
76213 +consistent operations on multiple parts of an "aggregate" file.  [For
76214 +example: you wish to perform a complex edit over /etc/passwd that
76215 +requires the addition of one user and the deletion of another (e.g.,
76216 +rename user).  To perform that operation consistently you must have
76217 +transactions to preserve the invariant.]
76218 +<p>
76219 +There is a close relationship between version control and transaction
76220 +isolation, which is why the same programmer on our team (Josh
76221 +MacDonald) does both.
76222 +
76223 +<a name="transcr"></a>
76224 +<H4>Transcrashes</H4>
76225 +<p>
76226 +There is a reason why filesystems implemented on top of databases have
76227 +not performed well.  Traditional database transactions do more work
76228 +than the filesystem needs them to do.  It is not that database
76229 +transactions are done wrong (far from it, we will take great pride in
76230 +adding database style transactions to reiser4), it is that in some
76231 +circumstances they are doing more work than is needed by traditional
76232 +filesystem usage requirements, and good performance requires making
76233 +the aspects of consistency independently selectable.  In particular,
76234 +filesystems often need to be able to guarantee that an operation will
76235 +be atomic with respect to surviving a crash, and DON'T need to
76236 +guarantee isolation with respect to other concurrent operations.  This
76237 +has profound performance import, and it affects not just buffering in
76238 +RAM, but also dramatically impacts the size of logs.
76239 +<p>
76240 +[J.N. Gray] models transactions as having 4 degrees of consistency.  I
76241 +find it more appropriate to model not degrees of consistency, which
76242 +implies that the features have ranked levels and one cannot have a
76243 +higher level feature without also getting the lower level features
76244 +with it, but aspects of consistency, each potentially fully orthogonal
76245 +to the other.
76246 +<p>
76247 +There are three aspects of consistency we will support initially, and
76248 +you'll note that they are decoupled and independently specifiable.
76249 +<ul>
76250 +<li>"Transcrashes", which guarantee that either all or none of the
76251 +transcrash will survive a crash.
76252 +<li>Branches, which guarantee isolation but not exclusion.
76253 +<li>Locks, which guarantee exclusion but not isolation.
76254 +</ul>
76255 +<p>
76256 +There is necessarily a performance cost to implementing an isolated
76257 +transaction.  This cost can be reduced for transcrashes which are not
76258 +also branched or locked.  Very frequently the application better knows
76259 +whether it needs to branch or lock, knows that its structure of
76260 +operation is such that it does not need the protection of branching
76261 +and locking, and it can depend on itself to do the right thing without
76262 +the significant unnecessary performance cost of asking the filesystem
76263 +to protect it from itself.
76264 +<p>
76265 +A "limited transcrash" has the property that it can be told to finish up
76266 +and either commit or abort within MAX_LIMITED_TRANSCRASH_DELAY time, and
76267 +it also has the property that the filesystem doesn't have to know how to
76268 +rollback if it chooses to abort but rather the user space process must
76269 +track how to do rollbacks.  Most such transcrashes will be implemented
76270 +to not ever rollback, but more simply to instead take responsibility
76271 +for ensuring that they can commit quickly enough.  If they fail to do
76272 +so, the commit will be imposed upon them before they have completed
76273 +the transcrash.  This approach is particularly useful for high
76274 +performance short running transcrashes.
76275 +<p>
76276 +For instance, suppose you want to do two simple updates to two files
76277 +as an atomic transaction, and these updates will not require longer
76278 +than MAX_TRANSCRASH_DELAY to be done, and you want to be able to do
76279 +many of these in parallel with high performance, and the application
76280 +process running in user space is able to handle worrying about
76281 +enforcing isolation through selective locking.  In that case, a common
76282 +view of the filesystem state involving many other such limited
76283 +transcrashes can be batched together and committed as one commit.
76284 +(This is necessarily higher performance.)  When memory pressure
76285 +triggers commit, all new transcrashes will hang while all outstanding
76286 +transcrashes are signalled to complete their transcrash, and given
76287 +MAX_TRANSCRASH_DELAY time in which they can be a running process if
76288 +they choose to be.  Carefully note that the delay allowed has to be
76289 +measured as time during which the process has priority if it chooses
76290 +to be runnable, not as absolute time.  (Nikita, please phrase this
76291 +more precisely for me, you know the scheduler better than I.)
76292 +<p>
76293 +A particular source of concern is high concurrency of semantically
76294 +unrelated data that has common metadata.  For instance, the super block
76295 +and the internal nodes of the tree.  Where the application can
76296 +track and self-ensure the isolation of itself from concurrent
76297 +processes rather than requiring the OS to give it its own atomically
76298 +merged and committed view, performance is very likely going to be
76299 +higher, and perhaps an order of magnitude higher.
76300 +<p>
76301 +Reiser4 will implement limited transcrashes first, and whether it will
76302 +implement branching in v4.0 or 4.1 will depend on how fast
76303 +Josh works.
76304 +<p>
76305 +Why are limited transcrashes the priority for us?  We need to ensure that
76306 +the infrastructure we create is performance efficient for what
76307 +filesystems currently do before we enable new functionality based on
76308 +strong transactions.  In other words, we have gotten addicted to being
76309 +the fastest FS in town, and we don't want to lose that.  Reiser4 needs
76310 +transactional semantics for some of its internal housekeeping
76311 +(implementing rename), and only limited transcrashes are a high enough
76312 +performance architecture for those needs.
76313 +<p>
76314 +When any grouping delimiter ([] is the only one for 4.0) is preceded
76315 +by tw/transcrash_name (e.g. tw/transcrash_33[ /home/reiser/a <=
76316 +/home/reiser/b, /home/reiser/c <= /home/reiser/d]), then it delimits a
76317 +transcrash.  We leave unspecified for now how to have multipart
76318 +specifications of a transcrash (I am getting pretty shameless in
76319 +deferring things for v4.1, yes...? ).  Transactions logically batch
76320 +not nest, except that the interpreter will error check the nesting to
76321 +make sure that it has not been passed confused garbage.
76322 +<p>
76323 +To anyone who has worked in databases or any other aspect of language
76324 +design, this design surely seems exceedingly simple and modest.  To
76325 +many filesystem and OS folks, this seems like something extraordinary,
76326 +commands that are parsed, oh no!  The complexity will be
76327 +extraordinary, oh no!  Sigh.  Namesys, determined to bring radical new
76328 +1960's technology from other areas of computer science into the file
76329 +systems field no matter how crazy our competitors think we are!  Sigh.
76330 +Reiser4 will be smaller than XFS much less VxFS....
76331 +
76332 +<a name="performance"></a>
76333 +<H1>Performance Enhancements</H1>
76334 +
76335 +<a name="dancing"></a>
76336 +<H2> Dancing Trees Are Faster Than Balanced Trees</H2>
76337 +
76338 +<p> ReiserFS V4 will also add innovations in the fundamental tree
76339 +technology.  We will employ not balanced trees, but "dancing trees".
76340 +Dancing trees merge insufficiently full nodes not with every
76341 +modification to the tree, but instead:
76342 +
76343 +<ul>
76344 +
76345 +<li>in response to memory pressure
76346 +triggering a commit.
76347 +
76348 +<li>when an insertion into an internal node presents a danger of
76349 +needing to split the internal node, and the immediate children are not
76350 +sufficiently space efficient (sufficiently being a configurable
76351 +value), and reducing the number of children could avoid the split.
76352 +</ul>
76353 +<a name="if_ram"></a>
76354 +<h3>If It Is In RAM, Dirty, and Contiguous, Then Squeeze It ALL
76355 +Together Just Before Writing</h3>
76356 +
76357 +<p>Let a slum be defined as a maximal sequence of nodes that are
76358 +contiguous in the tree order and dirty in this transaction.  A dancing tree,
76359 +when presented with memory pressure, responds to it by committing the
76360 +transaction, and the commit in turn triggers a repacking of all slums
76361 +involved in the transaction which it estimates can be squeezed into
76362 +fewer nodes than they currently occupy.
76363 +<p>
76364 +Balanced trees have an
76365 +inherent tradeoff between balancing cost and space efficiency.  If
76366 +they consider more neighboring nodes, for the purpose of merging them
76367 +to save a node, with every change to the tree, then they can pack the
76368 +tree more tightly at the cost of moving more data with every change to
76369 +the tree.
76370 +<p>
76371 +By contrast, with a dancing tree, you simply take a large slum, shove
76372 +everything in it as far to the left as it will go, and then free all
76373 +the nodes in the slum that are left with nothing remaining in them, at
76374 +the time of committing the slum's contents to disk in response to
76375 +memory pressure.  This gives you extreme space efficiency when slums
76376 +are large, at a cost in data movement that is lower than it would be
76377 +with an invariant balancing criterion because it is done less often.
76378 +By compressing at the time one flushes to disk, one compresses less
76379 +often, and that means one can afford to do it more thoroughly.
76380 +
76381 +<a name="repacker"></a>
76382 +<h2>Repacker</h2>
76383 +<p>
76384 +Another way of escaping from the balancing time vs. space efficiency
76385 +tradeoff is to use a repacker.  80% of files on the disk remain
76386 +unchanged for long periods of time.  It is efficient to pack them
76387 +perfectly, by using a repacker that runs much less often than every
76388 +write to disk.  This repacker goes through the entire tree ordering,
76389 +from left to right and then from right to left, alternating each time
76390 +it runs.  When it goes from left to right in the tree ordering, it
76391 +shoves everything as far to the left as it will go, and when it goes
76392 +from right to left it shoves everything as far to the right as it will
76393 +go.  (Left means small in key or in block number:-) ).  In the absence
76394 +of FS activity the effect of this over time is to sort by tree order
76395 +(defragment), and to pack with perfect efficiency.
76396 +<p>
76397 +Reiser4.1 will modify the repacker to insert controlled "airholes", as
76398 +it is well known that insertion efficiency is harmed by overly tight
76399 +packing.
76400 +<p>
76401 +I hypothesize that it is more efficient to periodically run a repacker
76402 +that systematically repacks using large IOs, than to perform lots of 1
76403 +block reads of neighboring nodes of the modification points, so as to
76404 +preserve a balancing invariant in the face of poorly localized
76405 +modifications to the tree.
76406 +<a name="commit"></a>
76407 +<H2>Encryption On Commit</H2>
76408 +<p>
76409 +Currently, encrypted files suffer severely in their write performance
76410 +when implemented using schemes that encrypt at every write() rather
76411 +than at every commit to disk.  We will implement encrypt on flush,
76412 +such that a file with an encryption plugin id is encrypted not at the
76413 +time of write, but at the time of commit to disk.  This is both
76414 +non-trivial to implement, and important to performance.  It requires
76415 +implementing a memory pressure manager for ReiserFS.  That memory
76416 +pressure manager would receive a request to either reduce memory
76417 +consumed, reduce dirty memory (dirty memory needs special treatment
76418 +for deadlock avoidance reasons), or verify that nothing overly old has
76419 +been kept in memory for too long.  It would respond by selecting what
76420 +to commit, and preparing it for writing to disk.  That preparation
76421 +will consist of encrypting it for those files that implement the
76422 +encryption plugin.  (It can also consist of allocating optimal block
76423 +numbers and repacking formatted nodes and compressing data, but that
76424 +is not of such concern here.)  I suspect you will want us to
76425 +coordinate with the PGP developers you are also contracting with.
76426 +
76427 +<p>Encryption is implemented as a special form of repacking, and it
76428 +occurs for any node which has its CONTAINS_ENCRYPTED_DATA state flag
76429 +set on it regardless of space usage.  With the dancing tree
76430 +infrastructure in place, it should be only a moderate amount of work
76431 +to implement encryption as a variant on repacking on commit.
76432 +<p>
76433 +<a name="wand_lgs"></a>
76434 +<h3>Wandering Logs</h3>
76435 +<a name="more_detailed"></a>
76436 +<h4>(More detailed treatment soon to be available at www.namesys.com/transactions.html by Joshua MacDonald.)</h4>
76437 +<p>
76438 +Traditional fixed location logs have a problem in that data gets
76439 +written twice, once to the log, and once to the rest of the
76440 +filesystem.
76441 +<p>
76442 +Instead of moving data out of the log, wandering logs redefine what
76443 +blocks compose the log.  There is no fixed location for where the log
76444 +is, though there are fixed locations for where the definition of what
76445 +blocks compose the log is.
76446 +<p>
76447 +This approach has two principal disadvantages:
76448 +<ul>
76449 +<li>blocks which contain data that must be retained if the transcrash
76450 +fails to complete cannot be written in the place of the block
76451 +containing the old data, and their location may not be as optimal as
76452 +that of the new data (it may also be more optimal though)
76453 +
76454 +<li>it does not support undo/redo for isolated transactions
76455 +</ul>
76456 +<p>
76457 +This means that in addition to wandering block logs, we also need
76458 +wandering logical logs.
76459 +<p>
76460 +Wandering logical logs log for every transaction enough information to
76461 +either redo or undo each isolated transaction.
76462 +<p>
76463 +They have the disadvantage that first they write the data into the log
76464 +(though it can go anywhere convenient to define as part of the log),
76465 +and then they write the data again after the transaction commits.
76466 +<p>
76467 +They have the advantage that for small updates (when not logging a 100
76468 +megabyte file) their log is smaller.  This is potentially useful for
76469 +distributed filesystems which operate by transmitting the log.
76470 +<p>
76471 +The compelling reason for supporting them is that they are needed for
76472 +supporting isolated transactions, and while isolated transactions are
76473 +expected to be only a small fraction of total disk IO, they are quite
76474 +important functionally.  (How many bytes does it take to make your
76475 +system not secure.... )
76476 +
76477 +<a name="conclusion"></a>
76478 +<h1>Conclusion</h1>
76479 +<p>
76480 +Reiser4 will offer a dramatically better infrastructure for creating
76481 +new filesystem features.  Files and directories will have all of the
76482 +features needed to make it not necessary to have file attributes be
76483 +something different from files.  The effectiveness of this new
76484 +infrastructure will be tested using a variety of new security
76485 +features.  Performance will be greatly improved by the use of dancing
76486 +trees, wandering logs, allocate on flush, a repacker, and encryption
76487 +on commit.
76488 diff -rupN linux-2.6.8-rc3/fs/reiser4/repacker.c linux-2.6.8-rc3-a/fs/reiser4/repacker.c
76489 --- linux-2.6.8-rc3/fs/reiser4/repacker.c       1970-01-01 03:00:00.000000000 +0300
76490 +++ linux-2.6.8-rc3-a/fs/reiser4/repacker.c     2004-08-05 21:20:52.967684890 +0400
76491 @@ -0,0 +1,661 @@
76492 +/* Copyright 2003 by Hans Reiser */
76493 +
76494 +/*
76495 +   The reiser4 repacker.
76496 +
76497 +   It walks the reiser4 tree and marks all nodes (reading them if
76498 +   necessary) for repacking by setting the JNODE_REPACK bit.  Every node that
76499 +   does not yet have JNODE_REPACK set is added to a transaction and marked dirty.
76500 +*/
76501 +
76502 +#include <linux/kernel.h>
76503 +#include <linux/fs.h>
76504 +#include <linux/kobject.h>
76505 +#include <linux/sched.h>
76506 +#include <linux/writeback.h>
76507 +#include <linux/suspend.h>
76508 +
76509 +#include <asm/atomic.h>
76510 +
76511 +#include "reiser4.h"
76512 +#include "kattr.h"
76513 +#include "super.h"
76514 +#include "tree.h"
76515 +#include "tree_walk.h"
76516 +#include "jnode.h"
76517 +#include "znode.h"
76518 +#include "block_alloc.h"
76519 +
76520 +#include "plugin/item/extent.h"
76521 +
76522 +#include <linux/spinlock.h>
76523 +#include "kcond.h"
76524 +
76525 +#include "repacker.h"
76526 +
76527 +/* The reiser4 repacker processes nodes in chunks; this is the default
76528 + * value of params.chunk_size. */
76529 +#define REPACKER_DEFAULT_CHUNK_SIZE 512
76530 +
76531 +enum repacker_state_bits {
76532 +       REPACKER_RUNNING       = 0x1, /* set by start_repacker(), cleared by repacker_d() on exit */
76533 +       REPACKER_STOP          = 0x2, /* set by stop_repacker(); makes check_repacker_state() return -EINTR */
76534 +       REPACKER_DESTROY       = 0x4, /* while set, start_repacker() does not start a thread */
76535 +       REPACKER_GOES_BACKWARD = 0x8  /* set: backward pass (backward_actor); clear: forward pass */
76536 +};
76537 +
76538 +/* Per super block repacker state: thread control bits plus tunable parameters. */
76539 +struct repacker {
76540 +       /* Back reference to the super block this repacker belongs to. */
76541 +       struct super_block * super;
76542 +       /* Repacker thread state: a mask of enum repacker_state_bits values. */
76543 +       enum repacker_state_bits  state;
76544 +       /* A spin lock protecting ->state; the sysfs handlers also take it around ->params accesses. */
76545 +       spinlock_t guard;
76546 +       /* A condition variable, broadcast by repacker_d() after it clears REPACKER_RUNNING. */
76547 +       kcond_t    cond;
76548 +#if REISER4_USE_SYSFS
76549 +       /* An object (kobject), externally visible through SysFS. */
76550 +       struct kobject kobj;
76551 +#endif
76552 +       struct { /* tunables; exported as sysfs attributes when REISER4_USE_SYSFS is set */
76553 +               reiser4_key start_key; /* NOTE(review): neither read nor written by the code visible in this file */
76554 +               reiser4_block_nr chunk_size; /* per-session budget, copied into cursor->count */
76555 +               reiser4_block_nr count; /* NOTE(review): only touched by the "count" sysfs attribute here */
76556 +       } params;
76557 +};
76558 +
76559 +/* Test, under repacker->guard, whether any bit of @bits is set in the state. */
76560 +static inline int check_repacker_state_bit(struct repacker *repacker, enum repacker_state_bits bits)
76561 +{
76562 +       int is_set;
76563 +
76564 +       spin_lock(&repacker->guard);
76565 +       is_set = (repacker->state & bits) != 0;
76566 +       spin_unlock(&repacker->guard);
76567 +
76568 +       return is_set;
76569 +}
76570 +
76571 +/* May the walk go on?  Returns 0, -EINTR (stop requested) or -E_REPEAT (freeze/commit pending). */
76572 +static int check_repacker_state (struct repacker * repacker)
76573 +{
76574 +       if (check_repacker_state_bit(repacker, REPACKER_STOP)) /* honour @repacker instead of re-fetching it */
76575 +               return -EINTR;
76576 +       if (current->flags & PF_FREEZE) /* refrigerator() itself is called in prepare_repacking_session() */
76577 +               return -E_REPEAT;
76578 +       if (current_atom_should_commit())
76579 +               return -E_REPEAT;
76580 +       return 0;
76581 +}
76582 +
76583 +/* Zero @cursor and aim its allocation hint at the disk end matching the pass direction. */
76584 +static void repacker_cursor_init (struct repacker_cursor * cursor, struct repacker * repacker)
76585 +{
76586 +       int going_back;
76587 +
76588 +       going_back = check_repacker_state_bit(repacker, REPACKER_GOES_BACKWARD);
76589 +       xmemset(cursor, 0, sizeof (struct repacker_cursor));
76590 +
76591 +       blocknr_hint_init(&cursor->hint);
76592 +       cursor->hint.backward = going_back;
76593 +
76594 +       /* a backward pass starts from the last block, a forward pass from block 0 */
76595 +       cursor->hint.blk = going_back ? get_current_super_private()->block_count - 1 : 0;
76596 +}
76597 +
76598 +static void repacker_cursor_done (struct repacker_cursor * cursor)
76599 +{
76600 +       blocknr_hint_done(&cursor->hint); /* pairs with blocknr_hint_init() in repacker_cursor_init() */
76601 +}
76602 +
76603 +/* Routines for closing the current transaction and beginning a new one. */
76604 +
76605 +static int end_work (void) /* calls txn_end() on the current context; always returns 0 */
76606 +{
76607 +       reiser4_context * ctx = get_current_context();
76608 +
76609 +       txn_end(ctx); /* NOTE(review): txn_end()'s result is discarded -- confirm this is intended */
76610 +       return 0;
76611 +}
76612 +static void begin_work (void) /* calls txn_begin() on the current context */
76613 +{
76614 +       reiser4_context * ctx = get_current_context();
76615 +       preempt_point(); /* NOTE(review): presumably a voluntary reschedule point -- confirm */
76616 +       txn_begin(ctx);
76617 +}
76618 +
76619 +/* Forward-pass handler for a formatted node: dirty it, flag it JNODE_REPACK, charge cursor->count. */
76620 +static int process_znode_forward (tap_t * tap, void * arg)
76621 +{
76622 +       struct repacker_cursor * cursor = arg; /* @arg is the cursor that reiser4_repacker() passes to tree_walk() */
76623 +       znode * node = tap->lh->node;
76624 +       int ret;
76625 +
76626 +       assert("zam-954", cursor->count > 0);
76627 +
76628 +       ret = check_repacker_state(get_current_super_private()->repacker);
76629 +       if (ret)
76630 +               return ret;
76631 +
76632 +       if (ZF_ISSET(node, JNODE_REPACK)) /* already marked: nothing to do, budget untouched */
76633 +               return 0;
76634 +
76635 +       if (current_atom_should_commit()) /* the same test was just made inside check_repacker_state() */
76636 +               return -E_REPEAT;
76637 +
76638 +       znode_make_dirty(node);
76639 +       ZF_SET(node, JNODE_REPACK);
76640 +
76641 +       cursor->stats.znodes_dirtied ++;
76642 +
76643 +       if (-- cursor->count <= 0) /* budget spent; NOTE(review): -E_REPEAT presumably restarts the session -- confirm in tree_walk() */
76644 +               return -E_REPEAT;
76645 +       return 0;
76646 +}
76647 +
76648 +/* Forward-pass handler for the unformatted nodes of one extent unit.  Returns 0,
76649 + * -E_REPEAT once cursor->count is used up, or a non-positive helper result. */
76650 +static int process_extent_forward (tap_t *tap, void * arg)
76651 +{
76652 +       int ret;
76653 +       struct repacker_cursor * cursor = arg;
76654 +
76655 +       ret = check_repacker_state(get_current_super_private()->repacker);
76656 +       if (ret)
76657 +               return ret;
76658 +
76659 +       ret = mark_extent_for_repacking(tap, cursor->count); /* cursor->count is passed as the limit */
76660 +       if (ret > 0) { /* a positive result is treated as the number of nodes dirtied */
76661 +               cursor->stats.jnodes_dirtied += ret;
76662 +               cursor->count -= ret;
76663 +               if (cursor->count <= 0)
76664 +                            return -E_REPEAT;
76665 +               return 0;
76666 +       }
76667 +
76668 +       return ret; /* zero or negative: handed back to the tree walker unchanged */
76669 +}
76670 +
76671 +
76672 +/* Called by the tree walker before it takes any locks: ends the old transaction, begins a new one, refills the budget. */
76673 +static int prepare_repacking_session (void * arg)
76674 +{
76675 +       struct repacker_cursor * cursor = arg;
76676 +       int ret;
76677 +
76678 +       assert("zam-951", schedulable());
76679 +
76680 +       all_grabbed2free(); /* NOTE(review): presumably returns blocks still grabbed by the previous session -- confirm */
76681 +       ret = end_work(); /* end_work() always returns 0, so the check below never fires today */
76682 +       if (ret)
76683 +               return ret;
76684 +
76685 +       if (current->flags & PF_FREEZE) /* no transaction is open at this point */
76686 +               refrigerator(PF_FREEZE);
76687 +
76688 +       balance_dirty_pages_ratelimited(get_current_super_private()->fake->i_mapping);
76689 +       begin_work();
76690 +       cursor->count = get_current_super_private()->repacker->params.chunk_size; /* read without ->guard */
76691 +       return  reiser4_grab_space((__u64)cursor->count,
76692 +                                  BA_CAN_COMMIT | BA_FORCE);
76693 +}
76694 +
76695 +/* Backward-pass handler for a formatted node (rightmost key towards leftmost):
76696 + * dirty it as in the forward pass, then relocate it towards the end of disk and
76697 + * repoint its parent (or tree->root_block).  Unlike flush, no node squeezing is
76698 + * done.  Returns 0 on success, otherwise the first non-zero helper result. */
76699 +static int process_znode_backward (tap_t * tap, void * arg)
76700 +{
76701 +       lock_handle parent_lock;
76702 +       load_count parent_load;
76703 +       znode * child = tap->lh->node;
76704 +       struct repacker_cursor * cursor = arg;
76705 +       __u64 new_blocknr;
76706 +       int ret;
76707 +
76708 +       assert("zam-977", (unsigned)(cursor->count) <= get_current_context()->grabbed_blocks);
76709 +
76710 +       /* Add node to current transaction like in processing forward. */
76711 +       ret = process_znode_forward(tap, arg);
76712 +       if (ret)
76713 +               return ret;
76714 +
76715 +       init_lh(&parent_lock);
76716 +       init_load_count(&parent_load); /* before the first "goto out": out: always runs done_load_count() */
76717 +       ret = reiser4_get_parent(&parent_lock, child, ZNODE_WRITE_LOCK, 0);
76718 +       if (ret)
76719 +               goto out;
76720 +
76721 +       /* ret is 0 here, so skipping the relocation below reports success. */
76722 +       /* Do not relocate nodes which were processed by flush already. */
76723 +       if (ZF_ISSET(child, JNODE_RELOC) || ZF_ISSET(child, JNODE_OVRWR))
76724 +               goto out;
76725 +
76726 +       if (ZF_ISSET(child, JNODE_CREATED)) {
76727 +               assert("zam-962", blocknr_is_fake(znode_get_block(child)));
76728 +               cursor->hint.block_stage = BLOCK_UNALLOCATED;
76729 +       } else {
76730 +               if (znode_get_level(child) == LEAF_LEVEL)
76731 +                       cursor->hint.block_stage = BLOCK_FLUSH_RESERVED;
76732 +               else {
76733 +                       ret = reiser4_grab_space((__u64)1,
76734 +                                                BA_FORCE |
76735 +                                                BA_RESERVED |
76736 +                                                BA_PERMANENT |
76737 +                                                BA_FORMATTED);
76738 +                       if (ret)
76739 +                               goto out;
76740 +
76741 +                       cursor->hint.block_stage = BLOCK_GRABBED;
76742 +               }
76743 +       }
76744 +
76745 +       {
76746 +               __u64 len = 1UL; /* exactly one block is requested */
76747 +
76748 +               ret = reiser4_alloc_blocks(&cursor->hint, &new_blocknr, &len,
76749 +                                          BA_PERMANENT | BA_FORMATTED);
76750 +               if (ret)
76751 +                       goto out;
76752 +
76753 +               cursor->hint.blk = new_blocknr; /* the next allocation continues from here */
76754 +       }
76755 +
76756 +       if (!ZF_ISSET(child, JNODE_CREATED)) { /* a created node still has a fake block number (zam-962) */
76757 +               ret = reiser4_dealloc_block(znode_get_block(child), 0,
76758 +                                   BA_DEFER | BA_PERMANENT | BA_FORMATTED);
76759 +               if (ret)
76760 +                       goto out;
76761 +       }
76762 +
76763 +       /* Flush doesn't process nodes twice, it will not discard this block
76764 +        * relocation. */
76765 +       ZF_SET(child, JNODE_RELOC);
76766 +
76767 +       /* Update parent reference. */
76768 +       if (unlikely(znode_above_root(parent_lock.node))) {
76769 +               reiser4_tree * tree = current_tree;
76770 +               UNDER_RW_VOID(tree, tree, write, tree->root_block = new_blocknr);
76771 +       } else {
76772 +               coord_t parent_coord;
76773 +               item_plugin *iplug;
76774 +
76775 +               ret = incr_load_count_znode(&parent_load, parent_lock.node);
76776 +               if (ret)
76777 +                       goto out;
76778 +
76779 +               ret = find_child_ptr(parent_lock.node, child, &parent_coord);
76780 +               if (ret)
76781 +                       goto out;
76782 +
76783 +               assert ("zam-960", item_is_internal(&parent_coord));
76784 +               assert ("zam-961", znode_is_loaded(child));
76785 +               iplug = item_plugin_by_coord(&parent_coord);
76786 +               assert("zam-964", iplug->f.update != NULL);
76787 +               iplug->f.update(&parent_coord, &new_blocknr);
76788 +       }
76789 +
76790 +       znode_make_dirty(parent_lock.node);
76791 +       ret = znode_rehash(child, &new_blocknr);
76792 +
76793 + out:
76794 +       done_load_count(&parent_load);
76795 +       done_lh(&parent_lock);
76796 +       assert("zam-982", (unsigned)(cursor->count) <= get_current_context()->grabbed_blocks);
76797 +       return ret;
76798 +}
76799 +
76800 +/* Backward-pass handler for unformatted nodes; returns -E_REPEAT once cursor->count is no longer positive. */
76801 +static int process_extent_backward (tap_t * tap, void * arg)
76802 +{
76803 +       struct repacker_cursor * cursor = arg;
76804 +       int ret;
76805 +
76806 +       assert("zam-978", (unsigned)(cursor->count) <= get_current_context()->grabbed_blocks);
76807 +
76808 +       ret = check_repacker_state(get_current_super_private()->repacker);
76809 +       if (ret)
76810 +               return ret;
76811 +
76812 +       ret = process_extent_backward_for_repacking(tap, cursor); /* NOTE(review): presumably decrements cursor->count -- confirm */
76813 +       if (ret)
76814 +               return ret;
76815 +       if (cursor->count <= 0)
76816 +               return -E_REPEAT;
76817 +
76818 +       return 0;
76819 +}
76820 +/* Callbacks handed to tree_walk() by reiser4_repacker() for a forward pass. */
76821 +static struct tree_walk_actor forward_actor = {
76822 +       .process_znode  = process_znode_forward,
76823 +       .process_extent = process_extent_forward,
76824 +       .before         = prepare_repacking_session /* meant to run before the walker takes any locks */
76825 +};
76826 +
76827 +/* Callbacks handed to tree_walk() by reiser4_repacker() for a backward pass. */
76828 +static struct tree_walk_actor backward_actor = {
76829 +       .process_znode  = process_znode_backward,
76830 +       .process_extent = process_extent_backward,
76831 +       .before         = prepare_repacking_session /* same session setup as the forward pass */
76832 +};
76833 +
76834 +
76835 +reiser4_internal int reiser4_repacker (struct repacker * repacker) /* one full pass; returns tree_walk()'s result */
76836 +{
76837 +       struct repacker_cursor cursor;
76838 +       int backward;
76839 +       struct tree_walk_actor * actor;
76840 +       int ret;
76841 +
76842 +       repacker_cursor_init(&cursor, repacker);
76843 +
76844 +       backward = check_repacker_state_bit(repacker, REPACKER_GOES_BACKWARD); /* direction_attr_store() leaves this bit alone while RUNNING */
76845 +       actor = backward ? &backward_actor : &forward_actor;
76846 +       ret = tree_walk(NULL, backward, actor, &cursor);
76847 +       printk(KERN_INFO "reiser4 repacker: "
76848 +              "%lu formatted node(s) processed, %lu unformatted node(s) processed, ret = %d\n",
76849 +              cursor.stats.znodes_dirtied, cursor.stats.jnodes_dirtied, ret); /* NOTE(review): %lu assumes unsigned long counters -- confirm in repacker.h */
76850 +
76851 +       repacker_cursor_done(&cursor);
76852 +       return ret;
76853 +}
76854 +
76855 +/* The repacker kernel thread: runs one reiser4_repacker() pass, then clears REPACKER_RUNNING and wakes waiters. */
76856 +reiser4_internal int repacker_d(void *arg)
76857 +{
76858 +       struct repacker * repacker = arg; /* start_repacker() passes the struct repacker as @arg */
76859 +       struct task_struct * me = current;
76860 +       int ret;
76861 +
76862 +       reiser4_context ctx;
76863 +
76864 +       daemonize("k_reiser4_repacker_d");
76865 +
76866 +       /* block all signals */
76867 +       spin_lock_irq(&me->sighand->siglock);
76868 +       siginitsetinv(&me->blocked, 0);
76869 +       recalc_sigpending();
76870 +       spin_unlock_irq(&me->sighand->siglock);
76871 +
76872 +       /* zeroing the fs_context copied from the parent process' task struct. */
76873 +       me->journal_info = NULL;
76874 +
76875 +       printk(KERN_INFO "Repacker: I am alive, pid = %u\n", me->pid);
76876 +       ret = init_context(&ctx, repacker->super);
76877 +       if (!ret) { /* the pass only runs if a reiser4 context could be set up */
76878 +               ret = reiser4_repacker(repacker);
76879 +               reiser4_exit_context(&ctx);
76880 +       }
76881 +
76882 +       spin_lock(&repacker->guard);
76883 +       repacker->state &= ~REPACKER_RUNNING;
76884 +       kcond_broadcast(&repacker->cond); /* wakes wait_repacker_completion() */
76885 +       spin_unlock(&repacker->guard);
76886 +
76887 +       return ret;
76888 +}
76889 +/* Block until the repacker thread has cleared REPACKER_RUNNING. */
76890 +static void wait_repacker_completion(struct repacker * repacker)
76891 +{
76892 +       /* Re-test after every wakeup.  NOTE(review): caller presumably holds ->guard -- confirm. */
76893 +       while (repacker->state & REPACKER_RUNNING)
76894 +               kcond_wait(&repacker->cond, &repacker->guard, 0);
76895 +       assert("zam-956", !(repacker->state & REPACKER_RUNNING));
76896 +}
76897 +
76898 +static int start_repacker(struct repacker * repacker) /* spawns repacker_d() unless DESTROY or RUNNING is set; always returns 0 */
76899 +{
76900 +       spin_lock(&repacker->guard);
76901 +       if (!(repacker->state & REPACKER_DESTROY)) {
76902 +               repacker->state &= ~REPACKER_STOP; /* withdraw any pending stop request, even if already running */
76903 +               if (!(repacker->state & REPACKER_RUNNING)) {
76904 +                       repacker->state |= REPACKER_RUNNING; /* set under the lock, before the thread exists */
76905 +                       spin_unlock(&repacker->guard);
76906 +                       kernel_thread(repacker_d, repacker, CLONE_VM | CLONE_FS | CLONE_FILES); /* NOTE(review): result unchecked; if this fails RUNNING stays set and nothing clears it */
76907 +                       return 0;
76908 +               }
76909 +       }
76910 +       spin_unlock(&repacker->guard);
76911 +       return 0;
76912 +}
76913 +
76914 +static void stop_repacker(struct repacker * repacker) /* only raises the flag; does not wait for the thread */
76915 +{
76916 +       spin_lock(&repacker->guard);
76917 +       repacker->state |= REPACKER_STOP; /* check_repacker_state() turns this into -EINTR */
76918 +       spin_unlock(&repacker->guard);
76919 +}
76920 +
76921 +#if REISER4_USE_SYSFS
76922 +
76923 +struct repacker_attr { /* a sysfs attribute plus typed handlers taking the owning struct repacker */
76924 +       struct attribute attr; /* embedded so container_of() can recover the repacker_attr */
76925 +       ssize_t (*show)(struct repacker *, char * buf);
76926 +       ssize_t (*store)(struct repacker *, const char * buf, size_t size);
76927 +};
76928 +
76929 +static ssize_t start_attr_show (struct repacker * repacker, char * buf) /* prints 1 while REPACKER_RUNNING is set, else 0 */
76930 +{
76931 +       return snprintf(buf, PAGE_SIZE , "%d", check_repacker_state_bit(repacker, REPACKER_RUNNING));
76932 +}
76933 +
76934 +static ssize_t start_attr_store (struct repacker * repacker,  const char *buf, size_t size) /* non-zero starts, zero stops */
76935 +{
76936 +       int start_stop = 0; /* input that does not parse therefore acts as "stop" */
76937 +
76938 +       sscanf(buf, "%d", &start_stop);
76939 +       if (start_stop)
76940 +               start_repacker(repacker);
76941 +       else
76942 +               stop_repacker(repacker);
76943 +
76944 +       return size; /* the whole write is always reported as consumed */
76945 +}
76946 +
76947 +static ssize_t direction_attr_show (struct repacker * repacker, char * buf) /* prints 1 for backward, 0 for forward */
76948 +{
76949 +       return snprintf(buf, PAGE_SIZE , "%d", check_repacker_state_bit(repacker, REPACKER_GOES_BACKWARD));
76950 +}
76951 +
76952 +static ssize_t direction_attr_store (struct repacker * repacker,  const char *buf, size_t size) /* non-zero selects the backward pass */
76953 +{
76954 +       int go_left = 0; /* input that does not parse therefore selects forward */
76955 +
76956 +       sscanf(buf, "%d", &go_left);
76957 +
76958 +       spin_lock(&repacker->guard);
76959 +       if (!(repacker->state & REPACKER_RUNNING)) { /* a write made while the thread runs is silently ignored */
76960 +               if (go_left)
76961 +                       repacker->state |= REPACKER_GOES_BACKWARD;
76962 +               else
76963 +                       repacker->state &= ~REPACKER_GOES_BACKWARD;
76964 +       }
76965 +       spin_unlock(&repacker->guard);
76966 +       return size;
76967 +}
76968 +
76969 +static ssize_t start_key_attr_show (struct repacker * repacker, char * buf) /* stub: writes nothing to @buf */
76970 +{
76971 +       spin_lock(&repacker->guard);
76972 +       spin_unlock(&repacker->guard); /* NOTE(review): empty critical section; params.start_key is never formatted */
76973 +
76974 +       return 0;
76975 +}
76976 +
76977 +static ssize_t start_key_attr_store (struct repacker * repacker,  const char *buf, size_t size) /* stub: @buf is ignored */
76978 +{
76979 +       spin_lock(&repacker->guard);
76980 +       spin_unlock(&repacker->guard); /* NOTE(review): empty critical section; params.start_key is never updated */
76981 +
76982 +       return (ssize_t)size;
76983 +}
76984 +
76985 +static ssize_t count_attr_show (struct repacker * repacker, char * buf) /* prints params.count in decimal */
76986 +{
76987 +       __u64 count;
76988 +
76989 +       spin_lock(&repacker->guard);
76990 +       count = repacker->params.count; /* snapshot under the lock, format outside it */
76991 +       spin_unlock(&repacker->guard);
76992 +
76993 +       return snprintf(buf, PAGE_SIZE, "%llu", (unsigned long long)count);
76994 +}
76995 +/* Parse a decimal value from @buf into params.count; -EINVAL if nothing parses. */
76996 +static ssize_t count_attr_store (struct repacker * repacker,  const char *buf, size_t size)
76997 +{
76998 +       unsigned long long count;
76999 +
77000 +       if (sscanf(buf, "%Lu", &count) != 1) /* otherwise an uninitialized value would be stored */
77001 +               return -EINVAL;
77002 +       spin_lock(&repacker->guard);
77003 +       repacker->params.count = (__u64)count;
77004 +       spin_unlock(&repacker->guard);
77005 +
77006 +       return (ssize_t)size;
77007 +}
77008 +
77009 +static ssize_t chunk_size_attr_show (struct repacker * repacker, char * buf) /* prints params.chunk_size in decimal */
77010 +{
77011 +       __u64 chunk_size;
77012 +
77013 +       spin_lock(&repacker->guard);
77014 +       chunk_size = repacker->params.chunk_size; /* snapshot under the lock, format outside it */
77015 +       spin_unlock(&repacker->guard);
77016 +
77017 +       return snprintf(buf, PAGE_SIZE, "%Lu", (unsigned long long)chunk_size);
77018 +}
77019 +/* Parse a decimal value from @buf into params.chunk_size; -EINVAL if nothing parses or it is 0. */
77020 +static ssize_t chunk_size_attr_store (struct repacker * repacker,  const char *buf, size_t size)
77021 +{
77022 +       unsigned long long chunk_size;
77023 +
77024 +       if (sscanf(buf, "%Lu", &chunk_size) != 1 || chunk_size == 0) /* 0 would break the cursor->count > 0 assertion (zam-954) */
77025 +               return -EINVAL;
77026 +       spin_lock(&repacker->guard);
77027 +       repacker->params.chunk_size = (__u64)chunk_size;
77028 +       spin_unlock(&repacker->guard);
77029 +
77030 +       return (ssize_t)size;
77031 +}
77032 +/* Define <attr_name>_attr, wired to <attr_name>_attr_show and <attr_name>_attr_store. */
77033 +#define REPACKER_ATTR(attr_name, perm)                 \
77034 +static struct repacker_attr attr_name ## _attr = {     \
77035 +       .attr = {                                       \
77036 +               .name = # attr_name,                    \
77037 +               .mode = perm                            \
77038 +       },                                              \
77039 +       .show = attr_name ## _attr_show,                \
77040 +       .store = attr_name ## _attr_store,              \
77041 +}
77042 +
77043 +REPACKER_ATTR(start, 0644); /* read: running flag; write: non-zero starts, zero stops */
77044 +REPACKER_ATTR(direction, 0644); /* read/write: 1 = backward pass, 0 = forward pass */
77045 +REPACKER_ATTR(start_key, 0644); /* handlers are stubs */
77046 +REPACKER_ATTR(count, 0644); /* read/write: params.count */
77047 +REPACKER_ATTR(chunk_size, 0644); /* read/write: params.chunk_size */
77048 +
77049 +static struct attribute * repacker_def_attrs[] = { /* default attributes referenced by repacker_ktype */
77050 +       &start_attr.attr,
77051 +       &direction_attr.attr,
77052 +       &start_key_attr.attr,
77053 +       &count_attr.attr,
77054 +       &chunk_size_attr.attr,
77055 +       NULL /* terminator */
77056 +};
77057 +
77058 +static ssize_t repacker_attr_show (struct kobject *kobj, struct attribute *attr,  char *buf) /* generic sysfs show -> typed ->show() */
77059 +{
77060 +       struct repacker_attr * r_attr = container_of(attr, struct repacker_attr, attr);
77061 +       struct repacker * repacker = container_of(kobj, struct repacker, kobj); /* @kobj is the one embedded in struct repacker */
77062 +
77063 +       return r_attr->show(repacker, buf);
77064 +}
77065 +
77066 +static ssize_t repacker_attr_store (struct kobject *kobj, struct attribute *attr, const char *buf, size_t size) /* generic sysfs store -> typed ->store() */
77067 +{
77068 +       struct repacker_attr * r_attr = container_of(attr, struct repacker_attr, attr);
77069 +       struct repacker * repacker = container_of(kobj, struct repacker, kobj); /* @kobj is the one embedded in struct repacker */
77070 +
77071 +       return r_attr->store(repacker, buf, size);
77072 +}
77073 +
77074 +static struct sysfs_ops repacker_sysfs_ops = { /* dispatchers shared by every repacker attribute */
77075 +       .show  = repacker_attr_show,
77076 +       .store = repacker_attr_store
77077 +};
77078 +
77079 +static struct kobj_type repacker_ktype = { /* ktype assigned to repacker->kobj */
77080 +       .sysfs_ops     = &repacker_sysfs_ops,
77081 +       .default_attrs = repacker_def_attrs,
77082 +       .release       = NULL /* the repacker is kfree()d by done_reiser4_repacker() */
77083 +};
77084 +
77085 +static int init_repacker_sysfs_interface (struct super_block * s)
77086 +{
77087 +       reiser4_super_info_data * sbinfo = get_super_private(s);
77088 +       struct repacker * rp = sbinfo->repacker;
77089 +       struct kobject * parent = &sbinfo->kobj.kobj;
77090 +
77091 +       assert("zam-947", rp != NULL);
77092 +
77093 +       /* "repacker" kobject is parented to the kobject of the super block info */
77094 +       snprintf(rp->kobj.name, KOBJ_NAME_LEN, "repacker");
77095 +       rp->kobj.parent = kobject_get(parent);
77096 +       rp->kobj.ktype = &repacker_ktype;
77097 +
77098 +       /* registration result is handed back to the caller unchanged */
77099 +       return kobject_register(&rp->kobj);
77100 +}
77101 +
77102 +static void done_repacker_sysfs_interface (struct super_block * s)
77103 +{
77104 +       struct repacker * rp = get_super_private(s)->repacker;
77105 +
77106 +       kobject_unregister(&rp->kobj); /* counterpart of kobject_register() */
77107 +}
77108 +
77109 +#else  /* REISER4_USE_SYSFS */
77110 +
77111 +#define init_repacker_sysfs_interface(s) (0) /* no sysfs: nothing to set up, report success */
77112 +#define done_repacker_sysfs_interface(s) do{}while(0) /* no sysfs: nothing to tear down */
77113 +
77114 +#endif /* REISER4_USE_SYSFS */
77115 +
77116 +reiser4_internal int init_reiser4_repacker (struct super_block *super)
77117 +{
77118 +       reiser4_super_info_data * sinfo = get_super_private(super);
77119 +       /* allocate and zero the repacker state attached to the super block info */
77120 +       assert ("zam-946", sinfo->repacker == NULL);
77121 +       sinfo->repacker = kmalloc(sizeof (struct repacker), GFP_KERNEL);
77122 +       if (sinfo->repacker == NULL)
77123 +               return -ENOMEM;
77124 +       xmemset(sinfo->repacker, 0, sizeof(struct repacker));
77125 +       sinfo->repacker->super = super;
77126 +
77127 +       /* set repacker parameters to default values */
77128 +       sinfo->repacker->params.chunk_size = REPACKER_DEFAULT_CHUNK_SIZE;
77129 +
77130 +       spin_lock_init(&sinfo->repacker->guard);
77131 +       kcond_init(&sinfo->repacker->cond);
77132 +       /* NOTE(review): if the call below fails, sinfo->repacker stays allocated and set -- confirm the caller copes */
77133 +       return init_repacker_sysfs_interface(super);
77134 +}
77135 +
77136 +reiser4_internal void done_reiser4_repacker (struct super_block *super)
77137 +{
77138 +       reiser4_super_info_data * sbinfo = get_super_private(super);
77139 +       struct repacker * rp = sbinfo->repacker;
77140 +
77141 +       assert("zam-945", rp != NULL);
77142 +       /* take the sysfs interface away first, then ask the repacker to stop */
77143 +       done_repacker_sysfs_interface(super);
77144 +
77145 +       spin_lock(&rp->guard);
77146 +       rp->state |= (REPACKER_STOP | REPACKER_DESTROY);
77147 +       wait_repacker_completion(rp);
77148 +       spin_unlock(&rp->guard);
77149 +
77150 +       kfree(rp);
77151 +       sbinfo->repacker = NULL;
77152 +}
77153 diff -rupN linux-2.6.8-rc3/fs/reiser4/repacker.h linux-2.6.8-rc3-a/fs/reiser4/repacker.h
77154 --- linux-2.6.8-rc3/fs/reiser4/repacker.h       1970-01-01 03:00:00.000000000 +0300
77155 +++ linux-2.6.8-rc3-a/fs/reiser4/repacker.h     2004-08-05 21:20:52.874704503 +0400
77156 @@ -0,0 +1,22 @@
77157 +/* Copyright 2003 by Hans Reiser */
77158 +
77159 +#ifndef __FS_REISER4_REPACKER_H__
77160 +#define __FS_REISER4_REPACKER_H__
77161 +
77162 +/* Repacker per-thread state and statistics. */
77163 +struct repacker_cursor {
77164 +       reiser4_blocknr_hint hint;
77165 +       int count;
77166 +       struct  {
77167 +               long znodes_dirtied;
77168 +               long jnodes_dirtied;
77169 +       } stats;
77170 +};
77171 +
77172 +extern int  init_reiser4_repacker(struct super_block *);
77173 +extern void done_reiser4_repacker(struct super_block *);
77174 +
77175 +extern int reiser4_repacker (struct repacker * repacker);
77176 +extern int repacker_d(void *arg);
77177 +
77178 +#endif /* __FS_REISER4_REPACKER_H__ */
77179 diff -rupN linux-2.6.8-rc3/fs/reiser4/safe_link.c linux-2.6.8-rc3-a/fs/reiser4/safe_link.c
77180 --- linux-2.6.8-rc3/fs/reiser4/safe_link.c      1970-01-01 03:00:00.000000000 +0300
77181 +++ linux-2.6.8-rc3-a/fs/reiser4/safe_link.c    2004-08-05 21:20:53.293616144 +0400
77182 @@ -0,0 +1,347 @@
77183 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
77184 + * reiser4/README */
77185 +
77186 +/* Safe-links. */
77187 +
77188 +/*
77189 + * Safe-links are used to maintain file system consistency during operations
77190 + * that span multiple transactions. For example:
77191 + *
77192 + *     1. Unlink. UNIX supports "open-but-unlinked" files, that is files
77193 + *     without user-visible names in the file system, but still opened by some
77194 + *     active process. What happens here is that unlink proper (i.e., removal
77195 + *     of the last file name) and file deletion (truncate of file body to zero
77196 + *     and deletion of stat-data, that happens when last file descriptor is
77197 + *     closed), may belong to different transactions T1 and T2. If a crash
77198 + *     happens after T1 commit, but before T2 commit, on-disk file system has
77199 + *     a file without name, that is, disk space leak.
77200 + *
77201 + *     2. Truncate. Truncate of large file may span multiple transactions. If
77202 + *     system crashes while truncate was in-progress, file is left partially
77203 + *     truncated, which violates "atomicity guarantees" of reiser4, viz. that
77204 + *     every system call is atomic.
77205 + *
77206 + * Safe-links address both above cases. Basically, safe-link is a way to post
77207 + * some operation to be executed during commit of some other transaction than
77208 + * current one. (Another way to look at the safe-link is to interpret it as a
77209 + * logical logging.)
77210 + *
77211 + * Specifically, at the beginning of unlink safe-link is inserted in the
77212 + * tree. This safe-link is normally removed by file deletion code (during
77213 + * transaction T2 in the above terms). Truncate also inserts safe-link that is
77214 + * normally removed when truncate operation is finished.
77215 + *
77216 + * This means, that in the case of "clean umount" there are no safe-links in
77217 + * the tree. If safe-links are observed during mount, it means that (a) system
77218 + * was terminated abnormally, and (b) safe-links correspond to the "pending"
77219 + * (i.e., not finished) operations that were in-progress during system
77220 + * termination. Each safe-link records enough information to complete
77221 + * corresponding operation, and mount simply "replays" them (hence, the
77222 + * analogy with the logical logging).
77223 + *
77224 + * Safe-links are implemented as blackbox items (see
77225 + * plugin/item/blackbox.[ch]).
77226 + *
77227 + * For the reference: ext3 also has similar mechanism, it's called "an orphan
77228 + * list" there.
77229 + */
77230 +
77231 +#include "safe_link.h"
77232 +#include "debug.h"
77233 +#include "inode.h"
77234 +
77235 +#include "plugin/item/blackbox.h"
77236 +
77237 +#include <linux/fs.h>
77238 +
77239 +/*
77240 + * On-disk format of safe-link item body (written by safe_link_add()).
77241 + */
77242 +typedef struct safelink {
77243 +       reiser4_key sdkey; /* key of stat-data for the file safe-link is
77244 +                           * for */
77245 +       d64 size;          /* final file size; stored for SAFE_TRUNCATE only */
77246 +} safelink_t;
77247 +
77248 +/*
77249 + * locality under which all safe-link items are stored: the oid of the root
77250 + * directory plus one.
77251 + */
77252 +static oid_t
77253 +safe_link_locality(reiser4_tree *tree)
77254 +{
77255 +       return 1 + get_inode_oid(tree->super->s_root->d_inode);
77256 +}
77257 +
77258 +/*
77259 +  Construct a key for the safe-link. Key has the following format:
77260 +
77261 +|        60     | 4 |        64        | 4 |      60       |         64       |
77262 ++---------------+---+------------------+---+---------------+------------------+
77263 +|   locality    | 0 |        0         | 0 |   objectid    |     link type    |
77264 ++---------------+---+------------------+---+---------------+------------------+
77265 +|                   |                  |                   |                  |
77266 +|     8 bytes       |     8 bytes      |      8 bytes      |      8 bytes     |
77267 +
77268 +   This is in large keys format. In small keys format second 8 byte chunk is
77269 +   out. Locality is a constant returned by safe_link_locality(). objectid is
77270 +   an oid of a file on which operation protected by this safe-link is
77271 +   performed. link-type is used to distinguish safe-links for different
77272 +   operations.
77273 +
77274 + */
77275 +static reiser4_key *
77276 +build_link_key(struct inode *inode, reiser4_safe_link_t link, reiser4_key *key)
77277 +{
77278 +       key_init(key);
77279 +       set_key_locality(key, safe_link_locality(tree_by_inode(inode)));
77280 +       set_key_objectid(key, get_inode_oid(inode)); /* object the link is for */
77281 +       set_key_offset(key, link); /* link type goes into the offset field */
77282 +       return key; /* @key itself, so the call can be used as an argument */
77283 +}
77284 +
77285 +/*
77286 + * how much disk space is necessary to insert and remove (in the
77287 + * error-handling path) a safe-link.
77288 + */
77289 +reiser4_internal __u64 safe_link_tograb(reiser4_tree *tree)
77290 +{
77291 +       return
77292 +               /* insert safe link */
77293 +               estimate_one_insert_item(tree) +
77294 +               /* remove safe link */
77295 +               estimate_one_item_removal(tree) +
77296 +               /* drill to the leaf level during insertion */
77297 +               1 + estimate_one_insert_item(tree) +
77298 +               /*
77299 +                * possible update of existing safe-link. Actually, if
77300 +                * safe-link existed already (we failed to remove it), then no
77301 +                * insertion is necessary, so this term is already "covered",
77302 +                * but for simplicity let's leave it.
77303 +                */
77304 +               1;
77305 +}
77306 +
77307 +/*
77308 + * grab enough disk space to insert and remove (in the error-handling path)
77309 + * safe-link. Returns whatever reiser4_grab_reserved() returns.
77310 + */
77311 +reiser4_internal int safe_link_grab(reiser4_tree *tree, reiser4_ba_flags_t flags)
77312 +{
77313 +       int   result;
77314 +
77315 +       grab_space_enable();
77316 +       /* The sbinfo->delete semaphore can be taken here.
77317 +        * safe_link_release() should be called before leaving reiser4
77318 +        * context. */
77319 +       result = reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
77320 +       grab_space_enable(); /* NOTE(review): enabled again after the grab; reason not visible here */
77321 +       return result;
77322 +}
77323 +
77324 +/*
77325 + * release, via reiser4_release_reserved(), space reserved by safe_link_grab().
77326 + */
77327 +reiser4_internal void safe_link_release(reiser4_tree * tree)
77328 +{
77329 +       reiser4_release_reserved(tree->super);
77330 +}
77331 +
77332 +/*
77333 + * store safe-link for operation @link on @inode in the tree; if an item with
77334 + * that key is already there, update it instead.
77335 + */
77336 +reiser4_internal int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
77337 +{
77338 +       reiser4_tree *tree = tree_by_inode(inode);
77339 +       reiser4_key link_key;
77340 +       safelink_t body;
77341 +       int body_len = sizeof body.sdkey;
77342 +       int ret;
77343 +
77344 +       /* every safe-link carries the stat-data key of its object */
77345 +       build_sd_key(inode, &body.sdkey);
77346 +
77347 +       if (link == SAFE_TRUNCATE) {
77348 +               /*
77349 +                * truncate links additionally record the final file
77350 +                * length, so the item body is longer.
77351 +                */
77352 +               cputod64(inode->i_size, &body.size);
77353 +               body_len += sizeof(body.size);
77354 +       }
77355 +       build_link_key(inode, link, &link_key);
77356 +
77357 +       ret = store_black_box(tree, &link_key, &body, body_len);
77358 +       if (ret != -EEXIST)
77359 +               return ret;
77360 +       return update_black_box(tree, &link_key, &body, body_len);
77361 +}
77362 +
77363 +/*
77364 + * remove from the tree the safe-link that was stored for operation @link on
77365 + * inode @inode.
77366 + */
77367 +reiser4_internal int safe_link_del(struct inode *inode, reiser4_safe_link_t link)
77368 +{
77369 +       reiser4_key key;
77370 +
77371 +       build_link_key(inode, link, &key);
77372 +       return kill_black_box(tree_by_inode(inode), &key);
77373 +}
77374 +
77375 +/*
77376 + * in-memory structure to keep information extracted from safe-link. This is
77377 + * used to iterate over all safe-links.
77378 + */
77379 +typedef struct {
77380 +       reiser4_tree       *tree;   /* internal tree */
77381 +       reiser4_key         key;    /* safe-link key (iteration position) */
77382 +       reiser4_key         sdkey;  /* key of object stat-data */
77383 +       reiser4_safe_link_t link;   /* safe-link type, from key offset */
77384 +       oid_t               oid;    /* object oid, from key objectid */
77385 +       __u64               size;   /* final size; set for SAFE_TRUNCATE */
77386 +} safe_link_context;
77387 +
77388 +/*
77389 + * start iteration: ctx->key = (safe-link locality, max objectid, max offset).
77390 + */
77391 +static void safe_link_iter_begin(reiser4_tree *tree, safe_link_context *ctx)
77392 +{
77393 +       ctx->tree = tree;
77394 +       key_init(&ctx->key);
77395 +       set_key_locality(&ctx->key, safe_link_locality(tree));
77396 +       set_key_objectid(&ctx->key, get_key_objectid(max_key()));
77397 +       set_key_offset(&ctx->key, get_key_offset(max_key()));
77398 +}
77399 +
77400 +/*
77401 + * load next safe-link into @ctx. ctx->size is 0 unless link is SAFE_TRUNCATE.
77402 + */
77403 +static int safe_link_iter_next(safe_link_context *ctx)
77404 +{
77405 +       int result;
77406 +       safelink_t sl;
77407 +
77408 +       result = load_black_box(ctx->tree,
77409 +                               &ctx->key, &sl, sizeof sl, 0);
77410 +       if (result == 0) {
77411 +               ctx->oid = get_key_objectid(&ctx->key);
77412 +               ctx->link = get_key_offset(&ctx->key);
77413 +               ctx->sdkey = sl.sdkey;
77414 +               ctx->size = (ctx->link == SAFE_TRUNCATE) ?
77415 +                       d64tocpu(&sl.size) : 0;
77416 +       }
77417 +       return result;
77418 +}
77419 +
77420 +/*
77421 + * check whether iteration has left the safe-link locality, i.e. is over.
77422 + */
77423 +static int safe_link_iter_finished(safe_link_context *ctx)
77424 +{
77425 +       return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
77426 +}
77427 +
77428 +
77429 +/*
77430 + * finish safe-link iteration. Counterpart of safe_link_iter_begin().
77431 + */
77432 +static void safe_link_iter_end(safe_link_context *ctx)
77433 +{
77434 +       /* nothing special: safe_link_iter_begin() only fills in fields of @ctx */
77435 +}
77436 +
77437 +/*
77438 + * process single safe-link: iget the object, let its ->safelink() method do
77439 + * the actual work, then delete the safe-link item on success.
77440 + */
77441 +static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
77442 +                           reiser4_key *sdkey, oid_t oid, __u64 size)
77443 +{
77444 +       struct inode *inode;
77445 +       file_plugin *fplug;
77446 +       reiser4_tree *tree;
77447 +       reiser4_key key;
77448 +       int result;
77449 +
77450 +       inode = reiser4_iget(super, sdkey, 1);
77451 +       if (IS_ERR(inode))
77452 +               return PTR_ERR(inode);
77453 +       fplug = inode_file_plugin(inode);
77454 +       assert("nikita-3428", fplug != NULL);
77455 +       if (fplug->safelink != NULL)
77456 +               result = fplug->safelink(inode, link, size);
77457 +       else {
77458 +               warning("nikita-3430",
77459 +                       "Cannot handle safelink for %lli", oid);
77460 +               print_key("key", sdkey);
77461 +               print_inode("inode", inode);
77462 +               result = 0;
77463 +       }
77464 +       if (result != 0) {
77465 +               warning("nikita-3431",
77466 +                       "Error processing safelink for %lli: %i",
77467 +                       oid, result);
77468 +       }
77469 +       /*
77470 +        * iput() may drop the last reference to @inode, so it must not be
77471 +        * used afterwards: sample the tree and the safe-link key first.
77472 +        */
77473 +       tree = tree_by_inode(inode);
77474 +       build_link_key(inode, link, &key);
77475 +       reiser4_iget_complete(inode);
77476 +       iput(inode);
77477 +       if (result == 0) {
77478 +               result = safe_link_grab(tree, BA_CAN_COMMIT);
77479 +               if (result == 0)
77480 +                       result = kill_black_box(tree, &key);
77481 +               safe_link_release(tree);
77482 +               /*
77483 +                * restart transaction: if there was large number of
77484 +                * safe-links, their processing may fail to fit into
77485 +                * single transaction.
77486 +                */
77487 +               if (result == 0)
77488 +                       txn_restart_current();
77489 +       }
77490 +       return result;
77491 +}
77492 +
77493 +/*
77494 + * walk all safe-links found in the tree and process them one by one.
77495 + */
77496 +reiser4_internal int process_safelinks(struct super_block *super)
77497 +{
77498 +       safe_link_context iter;
77499 +       int ret;
77500 +
77501 +       /* do nothing on the read-only file system */
77502 +       if (rofs_super(super))
77503 +               return 0;
77504 +
77505 +       safe_link_iter_begin(&get_super_private(super)->tree, &iter);
77506 +       do {
77507 +               ret = safe_link_iter_next(&iter);
77508 +               if (safe_link_iter_finished(&iter) || ret == -ENOENT) {
77509 +                       ret = 0; /* no more safe-links: not an error */
77510 +                       break;
77511 +               }
77512 +               if (ret == 0)
77513 +                       ret = process_safelink(super, iter.link,
77514 +                                              &iter.sdkey, iter.oid, iter.size);
77515 +       } while (ret == 0);
77516 +       safe_link_iter_end(&iter);
77517 +       return ret;
77518 +}
77519 +
77520 +/* Make Linus happy.
77521 +   Local variables:
77522 +   c-indentation-style: "K&R"
77523 +   mode-name: "LC"
77524 +   c-basic-offset: 8
77525 +   tab-width: 8
77526 +   fill-column: 120
77527 +   scroll-step: 1
77528 +   End:
77529 +*/
77530 diff -rupN linux-2.6.8-rc3/fs/reiser4/safe_link.h linux-2.6.8-rc3-a/fs/reiser4/safe_link.h
77531 --- linux-2.6.8-rc3/fs/reiser4/safe_link.h      1970-01-01 03:00:00.000000000 +0300
77532 +++ linux-2.6.8-rc3-a/fs/reiser4/safe_link.h    2004-08-05 21:20:53.281618674 +0400
77533 @@ -0,0 +1,33 @@
77534 +/* Copyright 2003 by Hans Reiser, licensing governed by
77535 + * reiser4/README */
77536 +
77537 +/* Safe-links. See safe_link.c for details. */
77538 +
77539 +#if !defined( __FS_SAFE_LINK_H__ )
77540 +#define __FS_SAFE_LINK_H__
77541 +
77542 +#include "tree.h"
77543 +#include "tap.h"
77544 +
77545 +struct inode;
77546 +
77547 +__u64 safe_link_tograb(reiser4_tree *tree);
77548 +int safe_link_grab(reiser4_tree *tree, reiser4_ba_flags_t flags);
77549 +void safe_link_release(reiser4_tree *tree);
77550 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
77551 +int safe_link_del(struct inode *inode, reiser4_safe_link_t link);
77552 +
77553 +int process_safelinks(struct super_block *super);
77554 +
77555 +/* __FS_SAFE_LINK_H__ */
77556 +#endif
77557 +
77558 +/* Make Linus happy.
77559 +   Local variables:
77560 +   c-indentation-style: "K&R"
77561 +   mode-name: "LC"
77562 +   c-basic-offset: 8
77563 +   tab-width: 8
77564 +   fill-column: 120
77565 +   End:
77566 +*/
77567 diff -rupN linux-2.6.8-rc3/fs/reiser4/seal.c linux-2.6.8-rc3-a/fs/reiser4/seal.c
77568 --- linux-2.6.8-rc3/fs/reiser4/seal.c   1970-01-01 03:00:00.000000000 +0300
77569 +++ linux-2.6.8-rc3-a/fs/reiser4/seal.c 2004-08-05 21:20:53.382597375 +0400
77570 @@ -0,0 +1,234 @@
77571 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
77572 +/* Seals implementation. */
77573 +/* Seals are "weak" tree pointers. They are analogous to tree coords in
77574 +   allowing to bypass tree traversal. But normal usage of coords implies that
77575 +   node pointed to by coord is locked, whereas seals don't keep a lock (or
77576 +   even a reference) to znode. Instead, each znode contains a version number,
77577 +   increased on each znode modification. This version number is copied into a
77578 +   seal when seal is created. Later, one can "validate" seal by calling
77579 +   seal_validate(). If znode is in cache and its version number is still the
77580 +   same, seal is "pristine" and coord associated with it can be re-used
77581 +   immediately.
77582 +
77583 +   If, on the other hand, znode is out of cache, or it is obviously different
77584 +   one from the znode seal was initially attached to (for example, it is on
77585 +   the different level, or is being removed from the tree), seal is
77586 +   irreparably invalid ("burned") and tree traversal has to be repeated.
77587 +
77588 +   Otherwise, there is some hope, that while znode was modified (and seal was
77589 +   "broken" as a result), key attached to the seal is still in the node. This
77590 +   is checked by first comparing this key with delimiting keys of node and, if
77591 +   key is ok, doing intra-node lookup.
77592 +
77593 +   Znode version is maintained in the following way:
77594 +
77595 +   there is reiser4_tree.znode_epoch counter. Whenever new znode is created,
77596 +   znode_epoch is incremented and its new value is stored in ->version field
77597 +   of new znode. Whenever znode is dirtied (which means it was probably
77598 +   modified), znode_epoch is also incremented and its new value is stored in
77599 +   znode->version. This is done so, because just incrementing znode->version
77600 +   on each update is not enough: it may so happen, that znode get deleted, new
77601 +   znode is allocated for the same disk block and gets the same version
77602 +   counter, tricking seal code into false positive.
77603 +*/
77604 +
77605 +#include "forward.h"
77606 +#include "debug.h"
77607 +#include "key.h"
77608 +#include "coord.h"
77609 +#include "seal.h"
77610 +#include "plugin/item/item.h"
77611 +#include "plugin/node/node.h"
77612 +#include "jnode.h"
77613 +#include "znode.h"
77614 +#include "super.h"
77615 +
77616 +static znode *seal_node(const seal_t * seal);
77617 +static int seal_matches(const seal_t * seal, znode * node);
77618 +
77619 +/* (re-)initialise @seal. May be called repeatedly on the same seal. With a
77620 +   NULL @coord the seal is just zeroed, i.e. left unset. @key can be NULL. */
77621 +reiser4_internal void
77622 +seal_init(seal_t * seal /* seal to initialise */ ,
77623 +         const coord_t * coord /* coord @seal will be attached to */ ,
77624 +         const reiser4_key * key UNUSED_ARG    /* key @seal will be
77625 +                                                * attached to */ )
77626 +{
77627 +       znode *node;
77628 +
77629 +       assert("nikita-1886", seal != NULL);
77630 +       xmemset(seal, 0, sizeof *seal);
77631 +       if (coord == NULL)
77632 +               return;
77633 +       node = coord->node;
77634 +       assert("nikita-1987", node != NULL);
77635 +       spin_lock_znode(node);
77636 +       seal->version = node->version;
77637 +       assert("nikita-1988", seal->version != 0);
77638 +       seal->block = *znode_get_block(node);
77639 +#if REISER4_DEBUG
77640 +       seal->coord = *coord;
77641 +       if (key != NULL)
77642 +               seal->key = *key;
77643 +#endif
77644 +       spin_unlock_znode(node);
77645 +}
77646 +
77647 +/* finish with seal: a zero version marks it as not set (see seal_is_set()) */
77648 +reiser4_internal void
77649 +seal_done(seal_t * seal /* seal to clear */)
77650 +{
77651 +       assert("nikita-1887", seal != NULL);
77652 +       seal->version = 0;
77653 +}
77654 +
77655 +/* true if seal was initialised, i.e. its version is non-zero */
77656 +reiser4_internal int
77657 +seal_is_set(const seal_t * seal /* seal to query */ )
77658 +{
77659 +       assert("nikita-1890", seal != NULL);
77660 +       return seal->version != 0;
77661 +}
77662 +
77663 +#if REISER4_DEBUG
77664 +/* helper function for seal_validate(). It checks that item at @coord has
77665 + * expected key. This is to detect cases where node was modified but wasn't
77666 + * marked dirty. */
77667 +static inline int
77668 +check_seal_match(const coord_t * coord /* coord to check */,
77669 +                const reiser4_key * k /* expected key */)
77670 +{
77671 +       reiser4_key ukey;
77672 +       /* coords that are not AT_UNIT, and extent items, are accepted unchecked */
77673 +       return (coord->between != AT_UNIT) ||
77674 +           /* FIXME-VS: we can only compare keys for items whose units
77675 +              represent exactly one key */
77676 +           (coord_is_existing_unit(coord) && (item_is_extent(coord) || keyeq(k, unit_key_by_coord(coord, &ukey))));
77677 +}
77678 +#endif
77679 +
77680 +
77681 +/* this is used by seal_validate. It accepts return value of
77682 + * longterm_lock_znode and returns 1 if it can be interpreted as seal
77683 + * validation failure. For instance, when longterm_lock_znode returns -EINVAL,
77684 + * seal_validate returns -E_REPEAT and caller will call tree search. We cannot
77685 + * do this in longterm_lock_znode(), because sometimes we want to distinguish
77686 + * between -EINVAL and -E_REPEAT. */
77687 +static int
77688 +should_repeat(int return_code)
77689 +{
77690 +       return return_code == -EINVAL;
77691 +}
77692 +
77693 +/* (re-)validate seal.
77694 +
77695 +   Checks whether seal is pristine, and tries to revalidate it if possible.
77696 +
77697 +   If seal was burned, or broken irreparably, return -E_REPEAT.
77698 +
77699 +   NOTE-NIKITA currently seal_validate() returns -E_REPEAT if key we are
77700 +   looking for is in range of keys covered by the sealed node, but item wasn't
77701 +   found by node ->lookup() method. Alternative is to return -ENOENT in this
77702 +   case, but this would complicate callers logic.
77703 +   On success (0) @lh keeps the long term lock taken on the sealed node.
77704 +*/
77705 +reiser4_internal int
77706 +seal_validate(seal_t * seal /* seal to validate */ ,
77707 +             coord_t * coord /* coord to validate against */ ,
77708 +             const reiser4_key * key /* key to validate against */ ,
77709 +             tree_level level /* level of node (not used in the body) */ ,
77710 +             lock_handle * lh /* resulting lock handle */ ,
77711 +             lookup_bias bias /* search bias (not used in the body) */ ,
77712 +             znode_lock_mode mode /* lock mode */ ,
77713 +             znode_lock_request request /* locking priority */ )
77714 +{
77715 +       znode *node;
77716 +       int result;
77717 +
77718 +       assert("nikita-1889", seal != NULL);
77719 +       assert("nikita-1881", seal_is_set(seal));
77720 +       assert("nikita-1882", key != NULL);
77721 +       assert("nikita-1883", coord != NULL);
77722 +       assert("nikita-1884", lh != NULL);
77723 +       assert("nikita-1885", keyeq(&seal->key, key));
77724 +       assert("nikita-1989", coords_equal(&seal->coord, coord));
77725 +
77726 +       /* obtain znode by block number */
77727 +       node = seal_node(seal);
77728 +       if (node != NULL) {
77729 +               /* znode was in cache, lock it */
77730 +               result = longterm_lock_znode(lh, node, mode, request);
77731 +               zput(node); /* NOTE(review): node is still used below; presumably pinned by the lock -- confirm */
77732 +               if (result == 0) {
77733 +                       if (seal_matches(seal, node)) {
77734 +                               /* if seal version and znode version
77735 +                                  coincide */
77736 +                               ON_DEBUG(coord_update_v(coord));
77737 +                               assert("nikita-1990", node == seal->coord.node);
77738 +                               assert("nikita-1898", WITH_DATA_RET(coord->node, 1, check_seal_match(coord, key)));
77739 +                               reiser4_stat_inc(seal.perfect_match);
77740 +                       } else
77741 +                               result = RETERR(-E_REPEAT);
77742 +               }
77743 +               if (result != 0) {
77744 +                       if (should_repeat(result))
77745 +                               result = RETERR(-E_REPEAT);
77746 +                       /* unlock node on failure */
77747 +                       done_lh(lh);
77748 +               }
77749 +       } else {
77750 +               /* znode wasn't in cache */
77751 +               reiser4_stat_inc(seal.out_of_cache);
77752 +               result = RETERR(-E_REPEAT);
77753 +       }
77754 +       return result;
77755 +}
77756 +
77757 +/* helper functions */
77758 +
77759 +/* look up, by seal's block number in current_tree, the znode seal points to */
77760 +static znode *
77761 +seal_node(const seal_t * seal /* seal to query */ )
77762 +{
77763 +       assert("nikita-1891", seal != NULL);
77764 +       return zlook(current_tree, &seal->block); /* seal_validate() treats NULL as "not in cache" */
77765 +}
77766 +
77767 +/* true if @seal version and @node version coincide */
77768 +static int
77769 +seal_matches(const seal_t * seal /* seal to check */ ,
77770 +            znode * node /* node to check */ )
77771 +{
77772 +       assert("nikita-1991", seal != NULL);
77773 +       assert("nikita-1993", node != NULL);
77774 +       /* presumably UNDER_SPIN evaluates the comparison with the jnode spin lock held -- confirm */
77775 +       return UNDER_SPIN(jnode, ZJNODE(node), (seal->version == node->version));
77776 +}
77777 +
77778 +#if REISER4_DEBUG_OUTPUT
77779 +/* debugging function: print @seal. __u64 fields are cast to match %llu. */
77780 +reiser4_internal void
77781 +print_seal(const char *prefix, const seal_t * seal)
77782 +{
77783 +       if (seal == NULL) {
77784 +               printk("%s: null seal\n", prefix);
77785 +       } else {
77786 +               printk("%s: version: %llu, block: %llu\n", prefix, (unsigned long long)seal->version, (unsigned long long)seal->block);
77787 +#if REISER4_DEBUG
77788 +               print_key("seal key", &seal->key);
77789 +               print_coord("seal coord", &seal->coord, 0);
77790 +#endif
77791 +       }
77792 +}
77793 +#endif
77794 +
77795 +/* Make Linus happy.
77796 +   Local variables:
77797 +   c-indentation-style: "K&R"
77798 +   mode-name: "LC"
77799 +   c-basic-offset: 8
77800 +   tab-width: 8
77801 +   fill-column: 120
77802 +   scroll-step: 1
77803 +   End:
77804 +*/
77805 diff -rupN linux-2.6.8-rc3/fs/reiser4/seal.h linux-2.6.8-rc3-a/fs/reiser4/seal.h
77806 --- linux-2.6.8-rc3/fs/reiser4/seal.h   1970-01-01 03:00:00.000000000 +0300
77807 +++ linux-2.6.8-rc3-a/fs/reiser4/seal.h 2004-08-05 21:20:53.029671816 +0400
77808 @@ -0,0 +1,59 @@
77809 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
77810 +
77811 +/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
77812 +
77813 +#ifndef __SEAL_H__
77814 +#define __SEAL_H__
77815 +
77816 +#include "forward.h"
77817 +#include "debug.h"
77818 +#include "dformat.h"
77819 +#include "key.h"
77820 +#include "coord.h"
77821 +
77822 +/* for __u?? types */
77823 +#include <linux/types.h>
77824 +
77825 +/* seal. See comment at the top of seal.c */
77826 +typedef struct seal_s {
77827 +       /* version of znode recorded at the time of seal creation */
77828 +       __u64 version;
77829 +       /* block number of znode attached to this seal */
77830 +       reiser4_block_nr block;
77831 +#if REISER4_DEBUG
77832 +       /* coord this seal is attached to. For debugging. */
77833 +       coord_t coord;
77834 +       /* key this seal is attached to. For debugging. */
77835 +       reiser4_key key;
77836 +#endif
77837 +} seal_t;
77838 +
77839 +extern void seal_init(seal_t * seal, const coord_t * coord, const reiser4_key * key);
77840 +extern void seal_done(seal_t * seal);
77841 +
77842 +extern int seal_is_set(const seal_t * seal);
77843 +
77844 +extern int seal_validate(seal_t * seal,
77845 +                        coord_t * coord,
77846 +                        const reiser4_key * key,
77847 +                        tree_level level,
77848 +                        lock_handle * lh, lookup_bias bias, znode_lock_mode mode, znode_lock_request request);
77849 +
77850 +#if REISER4_DEBUG_OUTPUT
77851 +extern void print_seal(const char *prefix, const seal_t * seal);
77852 +#else
77853 +#define print_seal( prefix, seal ) noop
77854 +#endif
77855 +
77856 +/* __SEAL_H__ */
77857 +#endif
77858 +
77859 +/* Make Linus happy.
77860 +   Local variables:
77861 +   c-indentation-style: "K&R"
77862 +   mode-name: "LC"
77863 +   c-basic-offset: 8
77864 +   tab-width: 8
77865 +   fill-column: 120
77866 +   End:
77867 +*/
77868 diff -rupN linux-2.6.8-rc3/fs/reiser4/search.c linux-2.6.8-rc3-a/fs/reiser4/search.c
77869 --- linux-2.6.8-rc3/fs/reiser4/search.c 1970-01-01 03:00:00.000000000 +0300
77870 +++ linux-2.6.8-rc3-a/fs/reiser4/search.c       2004-08-05 21:20:53.328608763 +0400
77871 @@ -0,0 +1,1689 @@
77872 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77873 + * reiser4/README */
77874 +
77875 +#include "forward.h"
77876 +#include "debug.h"
77877 +#include "dformat.h"
77878 +#include "key.h"
77879 +#include "coord.h"
77880 +#include "seal.h"
77881 +#include "plugin/item/item.h"
77882 +#include "plugin/node/node.h"
77883 +#include "plugin/plugin.h"
77884 +#include "jnode.h"
77885 +#include "znode.h"
77886 +#include "block_alloc.h"
77887 +#include "tree_walk.h"
77888 +#include "tree.h"
77889 +#include "log.h"
77890 +#include "reiser4.h"
77891 +#include "super.h"
77892 +#include "prof.h"
77893 +#include "inode.h"
77894 +
77895 +#include <linux/slab.h>
77896 +
77897 +/* tree searching algorithm, intranode searching algorithms are in
77898 +   plugin/node/ */
77899 +
77900 +/* tree lookup cache
77901 + *
77902 + * The coord by key cache consists of small list of recently accessed nodes
77903 + * maintained according to the LRU discipline. Before doing real top-to-down
77904 + * tree traversal this cache is scanned for nodes that can contain key
77905 + * requested.
77906 + *
77907 + * The efficiency of coord cache depends heavily on locality of reference for
77908 + * tree accesses. Our user level simulations show reasonably good hit ratios
77909 + * for coord cache under most loads so far.
77910 + */
77911 +
77912 +/* Initialise coord cache slot @slot: clean its list linkage and mark it as holding no node */
77913 +static void
77914 +cbk_cache_init_slot(cbk_cache_slot * slot)
77915 +{
77916 +       assert("nikita-345", slot != NULL);
77917 +
77918 +       cbk_cache_list_clean(slot);
77919 +       slot->node = NULL;      /* NULL ->node is how an unused slot is recognised */
77920 +}
77921 +
77922 +/* Initialise coord cache. Returns 0 on success, RETERR(-ENOMEM) if the slot array cannot be allocated. */
77923 +reiser4_internal int
77924 +cbk_cache_init(cbk_cache * cache /* cache to init; ->nr_slots is read here, so it must already be set */ )
77925 +{
77926 +       int i;
77927 +
77928 +       assert("nikita-346", cache != NULL);
77929 +
77930 +       cache->slot = kmalloc(sizeof (cbk_cache_slot) * cache->nr_slots, GFP_KERNEL);
77931 +       if (cache->slot == NULL)
77932 +               return RETERR(-ENOMEM);
77933 +
77934 +       cbk_cache_list_init(&cache->lru);
77935 +       for (i = 0; i < cache->nr_slots; ++i) {
77936 +               cbk_cache_init_slot(cache->slot + i);   /* every slot starts out unused ... */
77937 +               cbk_cache_list_push_back(&cache->lru, cache->slot + i); /* ... and linked into ->lru in array order */
77938 +       }
77939 +       rw_cbk_cache_init(cache);
77940 +       return 0;
77941 +}
77942 +
77943 +/* free cbk cache data: release the slot array and reset ->slot, so that a repeated call does nothing */
77944 +reiser4_internal void
77945 +cbk_cache_done(cbk_cache * cache /* cache to release */ )
77946 +{
77947 +       assert("nikita-2493", cache != NULL);
77948 +       if (cache->slot != NULL) {
77949 +               kfree(cache->slot);
77950 +               cache->slot = NULL;     /* do not leave a dangling pointer behind */
77951 +       }
77952 +}
77953 +
77954 +/* macro to iterate over all cbk cache slots, walking @cache->lru from its front to its end; @slot is the cursor */
77955 +#define for_all_slots( cache, slot )                                   \
77956 +       for( ( slot ) = cbk_cache_list_front( &( cache ) -> lru ) ;     \
77957 +            !cbk_cache_list_end( &( cache ) -> lru, ( slot ) ) ;       \
77958 +            ( slot ) = cbk_cache_list_next( slot ) )
77959 +
77960 +#if REISER4_DEBUG_OUTPUT
77961 +/* Debugging aid: print human readable information about @slot (the znode it holds, or a note that it is NULL) */
77962 +reiser4_internal void
77963 +print_cbk_slot(const char *prefix /* prefix to print; used only in the null-slot message */ ,
77964 +              const cbk_cache_slot * slot /* slot to print, may be NULL */ )
77965 +{
77966 +       if (slot == NULL)
77967 +               printk("%s: null slot\n", prefix);
77968 +       else
77969 +               print_znode("node", slot->node);        /* note: fixed "node" label, @prefix is not used here */
77970 +}
77971 +
77972 +/* Debugging aid: print human readable information about @cache: its address, then each slot in LRU list order */
77973 +reiser4_internal void
77974 +print_cbk_cache(const char *prefix /* prefix to print */ ,
77975 +               const cbk_cache * cache /* cache to print, may be NULL */ )
77976 +{
77977 +       if (cache == NULL)
77978 +               printk("%s: null cache\n", prefix);
77979 +       else {
77980 +               cbk_cache_slot *scan;
77981 +
77982 +               printk("%s: cache: %p\n", prefix, cache);
77983 +               for_all_slots(cache, scan)      /* note: the cbk cache lock is not taken here */
77984 +                   print_cbk_slot("slot", scan);
77985 +       }
77986 +}
77987 +#endif
77988 +
77989 +#if REISER4_DEBUG
77990 +/* check [cbk-cache-invariant]: in LRU used slots precede unused ones and no node is cached twice. Returns 1 if it holds */
77991 +static int
77992 +cbk_cache_invariant(const cbk_cache * cache)
77993 +{
77994 +       cbk_cache_slot *slot;
77995 +       int result;
77996 +       int unused;
77997 +
77998 +       assert("nikita-2469", cache != NULL);   /* must precede the first dereference of @cache */
77999 +
78000 +       if (cache->nr_slots == 0)
78001 +               return 1;
78002 +       unused = 0;
78003 +       result = 1;
78004 +       read_lock_cbk_cache((cbk_cache *) cache);
78005 +       for_all_slots(cache, slot) {
78006 +               /* in LRU first go all `used' slots followed by `unused' */
78007 +               if (unused && (slot->node != NULL))
78008 +                       result = 0;
78009 +               if (slot->node == NULL)
78010 +                       unused = 1;
78011 +               else {
78012 +                       cbk_cache_slot *scan;
78013 +
78014 +                       /* all cached nodes are different: compare with every slot further down the LRU */
78015 +                       scan = slot;
78016 +                       while (result) {
78017 +                               scan = cbk_cache_list_next(scan);
78018 +                               if (cbk_cache_list_end(&cache->lru, scan))
78019 +                                       break;
78020 +                               if (slot->node == scan->node)
78021 +                                       result = 0;
78022 +                       }
78023 +               }
78024 +               if (!result)
78025 +                       break;
78026 +       }
78027 +       read_unlock_cbk_cache((cbk_cache *) cache);
78028 +       return result;
78029 +}
78030 +
78031 +#endif
78032 +
78033 +/* Remove references, if any, to @node from coord cache: its slot becomes unused and moves to the LRU tail */
78034 +reiser4_internal void
78035 +cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
78036 +                    reiser4_tree * tree /* tree to remove node from */ )
78037 +{
78038 +       cbk_cache_slot *slot;
78039 +       cbk_cache *cache;
78040 +       int i;
78041 +
78042 +       assert("nikita-350", node != NULL);
78043 +       assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));    /* presumably: caller holds the tree rw-lock */
78044 +
78045 +       cache = &tree->cbk_cache;
78046 +       assert("nikita-2470", cbk_cache_invariant(cache));
78047 +
78048 +       write_lock_cbk_cache(cache);
78049 +       for (i = 0, slot = cache->slot; i < cache->nr_slots; ++ i, ++ slot) {
78050 +               if (slot->node == node) {
78051 +                       cbk_cache_list_remove(slot);
78052 +                       cbk_cache_list_push_back(&cache->lru, slot);    /* unused slots are kept at the LRU tail */
78053 +                       slot->node = NULL;
78054 +                       break;  /* the invariant guarantees a node occupies at most one slot */
78055 +               }
78056 +       }
78057 +       write_unlock_cbk_cache(cache);
78058 +       assert("nikita-2471", cbk_cache_invariant(cache));
78059 +}
78060 +
78061 +/* add to the cbk-cache of the tree @node belongs to information about @node. This can actually be
78062 +    an update of an existing slot in the cache. Either way the slot ends up at the front of the LRU. */
78063 +reiser4_internal void
78064 +cbk_cache_add(const znode * node /* node to add to the cache */ )
78065 +{
78066 +       cbk_cache *cache;
78067 +       cbk_cache_slot *slot;
78068 +       int i;
78069 +
78070 +       assert("nikita-352", node != NULL);
78071 +
78072 +       cache = &znode_get_tree(node)->cbk_cache;
78073 +       assert("nikita-2472", cbk_cache_invariant(cache));
78074 +
78075 +       if (cache->nr_slots == 0)       /* cache has no slots: nothing to add to */
78076 +               return;
78077 +
78078 +       write_lock_cbk_cache(cache);
78079 +       /* find slot to update/add */
78080 +       for (i = 0, slot = cache->slot; i < cache->nr_slots; ++ i, ++ slot) {
78081 +               /* this node is already in the cache: just refresh its LRU position below */
78082 +               if (slot->node == node)
78083 +                       break;
78084 +       }
78085 +       /* @node is not cached yet: take over the slot at the LRU tail (an unused slot, or the least recently used one) */
78086 +       if (i == cache->nr_slots) {
78087 +               slot = cbk_cache_list_back(&cache->lru);
78088 +               slot->node = (znode *) node;
78089 +       }
78090 +       cbk_cache_list_remove(slot);
78091 +       cbk_cache_list_push_front(&cache->lru, slot);
78092 +       write_unlock_cbk_cache(cache);
78093 +       assert("nikita-2473", cbk_cache_invariant(cache));
78094 +}
78095 +
78096 +static int setup_delimiting_keys(cbk_handle * h);
78097 +static lookup_result coord_by_handle(cbk_handle * handle);
78098 +static lookup_result traverse_tree(cbk_handle * h);
78099 +static int cbk_cache_search(cbk_handle * h);
78100 +
78101 +static level_lookup_result cbk_level_lookup(cbk_handle * h);
78102 +static level_lookup_result cbk_node_lookup(cbk_handle * h);
78103 +
78104 +/* helper functions */
78105 +
78106 +static void update_stale_dk(reiser4_tree *tree, znode *node);
78107 +
78108 +/* release parent node during traversal */
78109 +static void put_parent(cbk_handle * h);
78110 +/* check consistency of fields */
78111 +static int sanity_check(cbk_handle * h);
78112 +/* release resources in handle */
78113 +static void hput(cbk_handle * h);
78114 +
78115 +static level_lookup_result search_to_left(cbk_handle * h);
78116 +
78117 +/* pack numerous (numberous I should say) arguments of coord_by_key() into
78118 + * @handle. @handle is cleared first, so fields not set below are zero. Returns @handle. */
78119 +reiser4_internal cbk_handle *cbk_pack(cbk_handle *handle,
78120 +                    reiser4_tree * tree,
78121 +                    const reiser4_key * key,
78122 +                    coord_t * coord,
78123 +                    lock_handle * active_lh,
78124 +                    lock_handle * parent_lh,
78125 +                    znode_lock_mode lock_mode,
78126 +                    lookup_bias bias,
78127 +                    tree_level lock_level,
78128 +                    tree_level stop_level,
78129 +                    __u32 flags,
78130 +                    ra_info_t *info)
78131 +{
78132 +       xmemset(handle, 0, sizeof *handle);
78133 +
78134 +       handle->tree = tree;
78135 +       handle->key = key;
78136 +       handle->lock_mode = lock_mode;
78137 +       handle->bias = bias;
78138 +       handle->lock_level = lock_level;
78139 +       handle->stop_level = stop_level;
78140 +       handle->coord = coord;
78141 +       /* set flags. See comment in tree.h:cbk_flags. CBK_TRUST_DK and CBK_USE_CRABLOCK are always forced on. */
78142 +       handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
78143 +
78144 +       handle->active_lh = active_lh;
78145 +       handle->parent_lh = parent_lh;
78146 +       handle->ra_info = info;
78147 +       return handle;
78148 +}
78149 +
78150 +/* main tree lookup procedure
78151 +
78152 +   Check coord cache. If key we are looking for is not found there, fall back
78153 +   to real tree traversal. Both steps are done by coord_by_handle().
78154 +
78155 +   As we have extents on the twig level, @lock_level and @stop_level can
78156 +   be different from LEAF_LEVEL and each other.
78157 +
78158 +   Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
78159 +   long term locks) while calling this.
78160 +*/
78161 +reiser4_internal lookup_result
78162 +coord_by_key(reiser4_tree * tree       /* tree to perform search
78163 +                                                * in. Usually this tree is
78164 +                                                * part of file-system
78165 +                                                * super-block */ ,
78166 +                          const reiser4_key * key /* key to look for */ ,
78167 +                          coord_t * coord      /* where to store found
78168 +                                                  * position in a tree. Fields
78169 +                                                  * in "coord" are only valid if
78170 +                                                  * coord_by_key() returned
78171 +                                                  * "CBK_COORD_FOUND" */ ,
78172 +                          lock_handle * lh,    /* resulting lock handle */
78173 +                          znode_lock_mode lock_mode    /* type of lookup we
78174 +                                                        * want on node. Pass
78175 +                                                        * ZNODE_READ_LOCK here
78176 +                                                        * if you only want to
78177 +                                                        * read item found and
78178 +                                                        * ZNODE_WRITE_LOCK if
78179 +                                                        * you want to modify
78180 +                                                        * it */ ,
78181 +                          lookup_bias bias     /* what to return if coord
78182 +                                                * with exactly the @key is
78183 +                                                * not in the tree */ ,
78184 +                          tree_level lock_level        /* tree level where to start
78185 +                                                        * taking @lock type of
78186 +                                                        * locks */ ,
78187 +                          tree_level stop_level        /* tree level to stop. Pass
78188 +                                                        * LEAF_LEVEL or TWIG_LEVEL
78189 +                                                        * here. Item being looked
78190 +                                                        * for has to be between
78191 +                                                        * @lock_level and
78192 +                                                        * @stop_level, inclusive */ ,
78193 +                          __u32 flags /* search flags */,
78194 +                          ra_info_t *info /* information about desired tree traversal readahead */)
78195 +{
78196 +       cbk_handle handle;
78197 +       lock_handle parent_lh;
78198 +       lookup_result result;
78199 +
78200 +       init_lh(lh);
78201 +       init_lh(&parent_lh);
78202 +
78203 +       assert("nikita-3023", schedulable());
78204 +
78205 +       assert("nikita-353", tree != NULL);
78206 +       assert("nikita-354", key != NULL);
78207 +       assert("nikita-355", coord != NULL);
78208 +       assert("nikita-356", (bias == FIND_EXACT) || (bias == FIND_MAX_NOT_MORE_THAN));
78209 +       assert("nikita-357", stop_level >= LEAF_LEVEL);
78210 +
78211 +       if (!lock_stack_isclean(get_current_lock_stack()))
78212 +               print_clog();   /* runs just before the assertion below would fail on the same condition */
78213 +
78214 +       /* no locks can be held during tree traversal */
78215 +       assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
78216 +       trace_stamp(TRACE_TREE);
78217 +
78218 +       cbk_pack(&handle,
78219 +                tree,
78220 +                key,
78221 +                coord,
78222 +                lh,
78223 +                &parent_lh,
78224 +                lock_mode,
78225 +                bias,
78226 +                lock_level,
78227 +                stop_level,
78228 +                flags,
78229 +                info);
78230 +
78231 +       result = coord_by_handle(&handle);
78232 +       assert("nikita-3247", ergo(!IS_CBKERR(result), coord->node == lh->node));
78233 +       return result;
78234 +}
78235 +
78236 +/* like coord_by_key(), but starts traversal from vroot of @object rather than
78237 + * from tree root. @object may be NULL: then current_tree is searched and no vroot is used. */
78238 +reiser4_internal lookup_result
78239 +object_lookup(struct inode *object,
78240 +             const reiser4_key * key,
78241 +             coord_t * coord,
78242 +             lock_handle * lh,
78243 +             znode_lock_mode lock_mode,
78244 +             lookup_bias bias,
78245 +             tree_level lock_level,
78246 +             tree_level stop_level,
78247 +             __u32 flags,
78248 +             ra_info_t *info)
78249 +{
78250 +       cbk_handle handle;
78251 +       lock_handle parent_lh;
78252 +       lookup_result result;
78253 +
78254 +       init_lh(lh);
78255 +       init_lh(&parent_lh);
78256 +
78257 +       assert("nikita-3023", schedulable());
78258 +
78259 +       assert("nikita-354", key != NULL);
78260 +       assert("nikita-355", coord != NULL);
78261 +       assert("nikita-356", (bias == FIND_EXACT) || (bias == FIND_MAX_NOT_MORE_THAN));
78262 +       assert("nikita-357", stop_level >= LEAF_LEVEL);
78263 +
78264 +       if (!lock_stack_isclean(get_current_lock_stack()))
78265 +               print_clog();   /* runs just before the assertion below would fail on the same condition */
78266 +
78267 +       /* no locks can be held during tree search by key */
78268 +       assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
78269 +       trace_stamp(TRACE_TREE);
78270 +
78271 +       cbk_pack(&handle,
78272 +                object != NULL ? tree_by_inode(object) : current_tree,
78273 +                key,
78274 +                coord,
78275 +                lh,
78276 +                &parent_lh,
78277 +                lock_mode,
78278 +                bias,
78279 +                lock_level,
78280 +                stop_level,
78281 +                flags,
78282 +                info);
78283 +       handle.object = object; /* cbk_pack() cleared the handle, so ->object has to be set after it */
78284 +
78285 +       result = coord_by_handle(&handle);
78286 +       assert("nikita-3247", ergo(!IS_CBKERR(result), coord->node == lh->node));
78287 +       return result;
78288 +}
78289 +
78290 +/* lookup by cbk_handle. Common part of coord_by_key() and object_lookup(). */
78291 +static lookup_result
78292 +coord_by_handle(cbk_handle * handle)
78293 +{
78294 +       /*
78295 +        * first check cbk_cache (which is look-aside cache for our tree) and
78296 +        * if this fails, start traversal.
78297 +        */
78298 +
78299 +       write_tree_log(handle->tree, tree_lookup, handle->key);
78300 +
78301 +       /* first check whether "key" is in cache of recent lookups; 0 means the cache search settled the lookup. */
78302 +       if (cbk_cache_search(handle) == 0)
78303 +               return handle->result;
78304 +       else
78305 +               return traverse_tree(handle);
78306 +}
78307 +
78308 +/* Execute actor for each item (or unit, depending on @through_units_p),
78309 +   starting from @coord, right-ward, until either:
78310 +
78311 +   - end of the tree is reached
78312 +   - unformatted node is met
78313 +   - error occurred
78314 +   - @actor returns 0 or less
78315 +
78316 +   Error code, or last actor return value is returned; -ENOENT if @coord does not point to an existing unit.
78317 +
78318 +   This is used by plugin/dir/hashe_dir.c:find_entry() to move through
78319 +   sequence of entries with identical keys and the like.
78320 +*/
78321 +reiser4_internal int
78322 +iterate_tree(reiser4_tree * tree /* tree to scan */ ,
78323 +            coord_t * coord /* coord to start from */ ,
78324 +            lock_handle * lh   /* lock handle to start with and to
78325 +                                  * update along the way */ ,
78326 +            tree_iterate_actor_t actor /* function to call on each
78327 +                                        * item/unit */ ,
78328 +            void *arg /* argument to pass to @actor */ ,
78329 +            znode_lock_mode mode /* lock mode on scanned nodes */ ,
78330 +            int through_units_p        /* call @actor on each item or on each
78331 +                                        * unit */ )
78332 +{
78333 +       int result;
78334 +
78335 +       assert("nikita-1143", tree != NULL);
78336 +       assert("nikita-1145", coord != NULL);
78337 +       assert("nikita-1146", lh != NULL);
78338 +       assert("nikita-1147", actor != NULL);
78339 +
78340 +       result = zload(coord->node);
78341 +       coord_clear_iplug(coord);
78342 +       if (result != 0)
78343 +               return result;
78344 +       if (!coord_is_existing_unit(coord)) {
78345 +               zrelse(coord->node);
78346 +               return -ENOENT;
78347 +       }
78348 +       while ((result = actor(tree, coord, lh, arg)) > 0) {
78349 +               /* move further; non-zero from coord_next_*() here means we have to go to the next node */
78350 +               if ((through_units_p && coord_next_unit(coord)) ||
78351 +                   (!through_units_p && coord_next_item(coord))) {
78352 +                       do {
78353 +                               lock_handle couple;
78354 +
78355 +                               /* move to the next node  */
78356 +                               init_lh(&couple);
78357 +                               result = reiser4_get_right_neighbor(
78358 +                                       &couple, coord->node, (int) mode, GN_CAN_USE_UPPER_LEVELS);
78359 +                               zrelse(coord->node);    /* pairs with the zload() of the node we are leaving */
78360 +                               if (result == 0) {
78361 +
78362 +                                       result = zload(couple.node);
78363 +                                       if (result != 0) {
78364 +                                               done_lh(&couple);
78365 +                                               return result;
78366 +                                       }
78367 +
78368 +                                       coord_init_first_unit(coord, couple.node);
78369 +                                       done_lh(lh);
78370 +                                       move_lh(lh, &couple);   /* @lh now refers to the neighbor */
78371 +                               } else
78372 +                                       return result;
78373 +                       } while (node_is_empty(coord->node));   /* skip over empty nodes */
78374 +               }
78375 +
78376 +               assert("nikita-1149", coord_is_existing_unit(coord));
78377 +       }
78378 +       zrelse(coord->node);
78379 +       return result;
78380 +}
78381 +
78382 +/* return locked uber znode for @tree: long-term locks @tree->uber into @lh and returns the locking result */
78383 +reiser4_internal int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
78384 +                  znode_lock_request pri, lock_handle *lh)
78385 +{
78386 +       int result;
78387 +
78388 +       result = longterm_lock_znode(lh, tree->uber, mode, pri);
78389 +       return result;
78390 +}
78391 +
78392 +/* true if @key is strictly within @node, i.e., between its delimiting keys and below the right one
78393 +
78394 +   When we are looking for a possibly non-unique key (@isunique is 0), @key equal to the left delimiting
78395 +   key of @node does not count as inside: item with this key is at the edge of @node and may be in the neighbor.
78396 +*/
78397 +static int
78398 +znode_contains_key_strict(znode * node /* node to check key
78399 +                                        * against */ ,
78400 +                         const reiser4_key * key /* key to check */,
78401 +                         int isunique)
78402 +{
78403 +       int answer;
78404 +
78405 +       assert("nikita-1760", node != NULL);
78406 +       assert("nikita-1722", key != NULL);
78407 +
78408 +       if (keyge(key, &node->rd_key))  /* @key at or beyond the right delimiting key: not in @node */
78409 +               return 0;
78410 +
78411 +       answer = keycmp(&node->ld_key, key);
78412 +
78413 +       if (isunique)
78414 +               return answer != GREATER_THAN;  /* left delimiting key <= @key */
78415 +       else
78416 +               return answer == LESS_THAN;     /* left delimiting key < @key */
78417 +}
78418 +
78419 +/*
78420 + * Virtual Root (vroot) code.
78421 + *
78422 + *     For a given file system object (e.g., regular file or directory) let's
78423 + *     define its "virtual root" as the lowest node in the tree (that is, the
78424 + *     farthest from the tree root) such that all body items of said object are
78425 + *     located in the sub-tree rooted at this node.
78426 + *
78427 + *     Once vroot of object is found all tree lookups for items within body of
78428 + *     this object ("object lookups") can be started from its vroot rather
78429 + *     than from real root. This has following advantages:
78430 + *
78431 + *         1. amount of nodes traversed during lookup (and, hence, amount of
78432 + *         key comparisons made) decreases, and
78433 + *
78434 + *         2. contention on tree root is decreased. This latter was actually
78435 + *         motivating reason behind vroot, because spin lock of root node,
78436 + *         which is taken when acquiring long-term lock on root node is the
78437 + *         hottest lock in the reiser4.
78438 + *
78439 + * How to find vroot.
78440 + *
78441 + *     When vroot of object F is not yet determined, all object lookups start
78442 + *     from the root of the tree. At each tree level during traversal we have
78443 + *     a node N such that a key we are looking for (which is the key inside
78444 + *     object's body) is located within N. In function handle_vroot() called
78445 + *     from cbk_level_lookup() we check whether N is possible vroot for
78446 + *     F. Check is trivial---if neither leftmost nor rightmost item of N
78447 + *     belongs to F (and we already have helpful ->owns_item() method of
78448 + *     object plugin for this), then N is possible vroot of F. This, of
78449 + *     course, relies on the assumption that each object occupies contiguous
78450 + *     range of keys in the tree.
78451 + *
78452 + *     Thus, traversing tree downward and checking each node as we go, we can
78453 + *     find lowest such node, which, by definition, is vroot.
78454 + *
78455 + * How to track vroot.
78456 + *
78457 + *     Nohow. If actual vroot changes, next object lookup will just restart
78458 + *     from the actual tree root, refreshing object's vroot along the way.
78459 + *
78460 + */
78461 +
78462 +/*
78463 + * Check whether @node is possible vroot of @object and, if so, record it with inode_set_vroot().
78464 + */
78465 +static void
78466 +handle_vroot(struct inode *object, znode *node)
78467 +{
78468 +       file_plugin *fplug;
78469 +       coord_t coord;
78470 +
78471 +       fplug = inode_file_plugin(object);
78472 +       assert("nikita-3353", fplug != NULL);
78473 +       assert("nikita-3354", fplug->owns_item != NULL);
78474 +
78475 +       if (unlikely(node_is_empty(node)))      /* no leftmost/rightmost item to test */
78476 +               return;
78477 +
78478 +       coord_init_first_unit(&coord, node);
78479 +       /*
78480 +        * if leftmost item of @node belongs to @object, we cannot be sure
78481 +        * that @node is vroot of @object, because, some items of @object are
78482 +        * probably in the sub-tree rooted at the left neighbor of @node.
78483 +        */
78484 +       if (fplug->owns_item(object, &coord))
78485 +               return;
78486 +       coord_init_last_unit(&coord, node);
78487 +       /* mutatis mutandis for the rightmost item */
78488 +       if (fplug->owns_item(object, &coord))
78489 +               return;
78490 +       /* otherwise, @node is possible vroot of @object */
78491 +       inode_set_vroot(object, node);
78492 +}
78493 +
78494 +/*
78495 + * helper function used by traverse_tree() to start tree traversal not from the tree root, but from @h->object's
78496 + * vroot, if possible. Returns LOOKUP_CONT, LOOKUP_REST (restart), or another result of cbk_node_lookup() on vroot.
78497 + */
78498 +static int
78499 +prepare_object_lookup(cbk_handle * h)
78500 +{
78501 +       znode         *vroot;
78502 +       int            result;
78503 +
78504 +       vroot = inode_get_vroot(h->object);
78505 +       if (vroot == NULL) {
78506 +               /*
78507 +                * object doesn't have known vroot, start from real tree root.
78508 +                */
78509 +               reiser4_stat_inc(tree.object_lookup_novroot);
78510 +               return LOOKUP_CONT;
78511 +       }
78512 +
78513 +       h->level = znode_get_level(vroot);
78514 +       /* take a long-term lock on vroot */
78515 +       h->result = longterm_lock_znode(h->active_lh, vroot,
78516 +                                       cbk_lock_mode(h->level, h),
78517 +                                       ZNODE_LOCK_LOPRI);
78518 +       result = LOOKUP_REST;   /* restart is the outcome unless the lookup in vroot below succeeds */
78519 +       if (h->result == 0) {
78520 +               int isunique;
78521 +               int inside;
78522 +
78523 +               isunique = h->flags & CBK_UNIQUE;
78524 +               /* check that key is inside vroot, and that vroot is not flagged JNODE_HEARD_BANSHEE */
78525 +               inside =
78526 +                       UNDER_RW(dk, h->tree, read,
78527 +                                znode_contains_key_strict(vroot,
78528 +                                                          h->key,
78529 +                                                          isunique)) &&
78530 +                       !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE);
78531 +               if (inside) {
78532 +                       h->result = zload(vroot);
78533 +                       if (h->result == 0) {
78534 +                               /* search for key in vroot. */
78535 +                               result = cbk_node_lookup(h);
78536 +                               zrelse(vroot);/*h->active_lh->node);*/
78537 +                               if (h->active_lh->node != vroot) {
78538 +                                       result = LOOKUP_REST;   /* lookup ended up holding another node: restart */
78539 +                                       reiser4_stat_inc(tree.object_lookup_moved);
78540 +                               } else if (result == LOOKUP_CONT) {
78541 +                                       move_lh(h->parent_lh, h->active_lh);    /* vroot becomes the parent for descent */
78542 +                                       h->flags &= ~CBK_DKSET;
78543 +                               }
78544 +                       }
78545 +               } else
78546 +                       /* vroot is not up-to-date. Restart. */
78547 +                       reiser4_stat_inc(tree.object_lookup_outside);
78548 +       } else
78549 +               /* long-term locking failed. Restart. */
78550 +               reiser4_stat_inc(tree.object_lookup_cannotlock);
78551 +
78552 +       zput(vroot);    /* presumably drops the reference obtained through inode_get_vroot() */
78553 +
78554 +       if (IS_CBKERR(h->result) || result == LOOKUP_REST)
78555 +               hput(h);        /* release what the handle holds before restart or error */
78556 +       if (result != LOOKUP_REST)
78557 +               reiser4_stat_inc_at_level(h->level, object_lookup_start);
78558 +       return result;
78559 +}
78560 +
78561 +/* main function that handles common parts of tree traversal: starting
78562 +    (vroot or uber znode locking), descent via cbk_level_lookup(), restarts, error handling, completion. Returns h->result, or the get_uber_znode() error. */
78563 +static lookup_result
78564 +traverse_tree(cbk_handle * h /* search handle */ )
78565 +{
78566 +       int done;       /* loop-exit flag; also receives helper return codes */
78567 +       int iterations; /* restart counter compared against the limits below */
78568 +       int vroot_used; /* non-zero once prepare_object_lookup() was tried */
78569 +
78570 +       assert("nikita-365", h != NULL);
78571 +       assert("nikita-366", h->tree != NULL);
78572 +       assert("nikita-367", h->key != NULL);
78573 +       assert("nikita-368", h->coord != NULL);
78574 +       assert("nikita-369", (h->bias == FIND_EXACT) || (h->bias == FIND_MAX_NOT_MORE_THAN));
78575 +       assert("nikita-370", h->stop_level >= LEAF_LEVEL);
78576 +       assert("nikita-2949", !(h->flags & CBK_DKSET));
78577 +       assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
78578 +
78579 +       trace_stamp(TRACE_TREE);
78580 +       reiser4_stat_inc(tree.cbk);
78581 +
78582 +       done = 0;
78583 +       iterations = 0;
78584 +       vroot_used = 0;
78585 +
78586 +       /* loop for restarts */
78587 +restart:
78588 +
78589 +       assert("nikita-3024", schedulable());
78590 +
78591 +       h->result = CBK_COORD_FOUND;
78592 +       /* connect_znode() needs it */
78593 +       h->ld_key = *min_key();
78594 +       h->rd_key = *max_key();
78595 +       h->flags |= CBK_DKSET; /* cbk_level_lookup() consumes and clears this flag */
78596 +       h->error = NULL;
78597 +
78598 +       if (!vroot_used && h->object != NULL) {
78599 +               vroot_used = 1;
78600 +               done = prepare_object_lookup(h);
78601 +               if (done == LOOKUP_REST) {
78602 +                       reiser4_stat_inc(tree.object_lookup_restart);
78603 +                       goto restart; /* vroot_used is set, so the retry skips this branch */
78604 +               } else if (done == LOOKUP_DONE)
78605 +                       return h->result;
78606 +       }
78607 +       if (h->parent_lh->node == NULL) { /* no starting lock yet: begin at the uber znode */
78608 +               done = get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
78609 +                                     h->parent_lh);
78610 +
78611 +               assert("nikita-1637", done != -E_DEADLOCK);
78612 +
78613 +               h->block = h->tree->root_block;
78614 +               h->level = h->tree->height;
78615 +               h->coord->node = h->parent_lh->node;
78616 +
78617 +               if (done != 0)
78618 +                       return done; /* get_uber_znode() failed */
78619 +       }
78620 +
78621 +       /* loop descending a tree */
78622 +       while (!done) {
78623 +
78624 +               if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
78625 +                            IS_POW(iterations))) {
78626 +                       warning("nikita-1481", "Too many iterations: %i", iterations);
78627 +                       print_key("key", h->key);
78628 +                       ++iterations; /* NOTE(review): presumably keeps the same count from warning again on the next pass -- confirm */
78629 +               } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
78630 +                       h->error =
78631 +                           "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
78632 +                       h->result = RETERR(-EIO);
78633 +                       break; /* leaves the descent loop; h->error is reported below */
78634 +               }
78635 +               switch (cbk_level_lookup(h)) {
78636 +               case LOOKUP_CONT:
78637 +                       move_lh(h->parent_lh, h->active_lh);
78638 +                       continue;
78639 +               default:
78640 +                       wrong_return_value("nikita-372", "cbk_level"); /* no break: continues into LOOKUP_DONE if this returns */
78641 +               case LOOKUP_DONE:
78642 +                       done = 1;
78643 +                       break;
78644 +               case LOOKUP_REST:
78645 +                       reiser4_stat_inc(tree.cbk_restart);
78646 +                       hput(h);
78647 +                       /* deadlock avoidance is normal case. */
78648 +                       if (h->result != -E_DEADLOCK)
78649 +                               ++iterations;
78650 +                       preempt_point();
78651 +                       goto restart;
78652 +               }
78653 +       }
78654 +       /* that's all. The rest is error handling */
78655 +       if (unlikely(h->error != NULL)) {
78656 +               warning("nikita-373", "%s: level: %i, "
78657 +                       "lock_level: %i, stop_level: %i "
78658 +                       "lock_mode: %s, bias: %s",
78659 +                       h->error, h->level, h->lock_level, h->stop_level,
78660 +                       lock_mode_name(h->lock_mode), bias_name(h->bias));
78661 +               print_address("block", &h->block);
78662 +               print_key("key", h->key);
78663 +               print_coord_content("coord", h->coord);
78664 +               print_znode("active", h->active_lh->node);
78665 +               print_znode("parent", h->parent_lh->node);
78666 +       }
78667 +       /* `unlikely' error case */
78668 +       if (unlikely(IS_CBKERR(h->result))) {
78669 +               /* failure. do cleanup */
78670 +               hput(h);
78671 +       } else {
78672 +               assert("nikita-1605", WITH_DATA_RET
78673 +                      (h->coord->node, 1,
78674 +                       ergo((h->result == CBK_COORD_FOUND) &&
78675 +                            (h->bias == FIND_EXACT) &&
78676 +                            (!node_is_empty(h->coord->node)), coord_is_existing_item(h->coord))));
78677 +       }
78678 +       write_tree_log(h->tree, tree_exit);
78679 +       return h->result;
78680 +}
78681 +
78682 +/* find delimiting keys of child
78683 +
78684 +   Determine left (@ld) and right (@rd) delimiting keys for child pointed to
78685 +   by @parent_coord. A unit key from @parent is used where the coord lookup below succeeds; otherwise @parent's own
78686 +   ld/rd key is copied. The dk lock must already be held (see assertion "nikita-1485").
78687 +*/
78688 +static void
78689 +find_child_delimiting_keys(znode * parent      /* parent znode, passed
78690 +                                                * locked */ ,
78691 +                          const coord_t * parent_coord /* coord where
78692 +                                                          * pointer to
78693 +                                                          * child is
78694 +                                                          * stored */ ,
78695 +                          reiser4_key * ld     /* where to store left
78696 +                                                * delimiting key */ ,
78697 +                          reiser4_key * rd     /* where to store right
78698 +                                                * delimiting key */ )
78699 +{
78700 +       coord_t neighbor; /* scratch copy of @parent_coord; @parent_coord itself is not modified */
78701 +
78702 +       assert("nikita-1484", parent != NULL);
78703 +       assert("nikita-1485", rw_dk_is_locked(znode_get_tree(parent)));
78704 +
78705 +       coord_dup(&neighbor, parent_coord);
78706 +
78707 +       if (neighbor.between == AT_UNIT)
78708 +               /* imitate item ->lookup() behavior. */
78709 +               neighbor.between = AFTER_UNIT;
78710 +
78711 +       if (coord_is_existing_unit(&neighbor) ||
78712 +           coord_set_to_left(&neighbor) == 0)
78713 +               unit_key_by_coord(&neighbor, ld);
78714 +       else
78715 +               *ld = *znode_get_ld_key(parent); /* no usable unit: fall back to parent's left delimiting key */
78716 +
78717 +       coord_dup(&neighbor, parent_coord); /* start again from the original coord for the right side */
78718 +       if (neighbor.between == AT_UNIT)
78719 +               neighbor.between = AFTER_UNIT;
78720 +       if (coord_set_to_right(&neighbor) == 0)
78721 +               unit_key_by_coord(&neighbor, rd);
78722 +       else
78723 +               *rd = *znode_get_rd_key(parent); /* nothing to the right: fall back to parent's right delimiting key */
78724 +}
78725 +
78726 +/*
78727 + * setup delimiting keys for a child, unless JNODE_DKSET is already set on it
78728 + *
78729 + * @parent parent node
78730 + * @coord location in @parent where pointer to @child is
78731 + * @child child node
78732 + *
78733 + * Returns 1 if this call computed the keys and set JNODE_DKSET on @child,
78734 + * 0 if the flag was already set. */
78735 +reiser4_internal int
78736 +set_child_delimiting_keys(znode * parent,
78737 +                         const coord_t * coord, znode * child)
78738 +{
78739 +       reiser4_tree *tree;
78740 +       int result;
78741 +
78742 +       assert("nikita-2952",
78743 +              znode_get_level(parent) == znode_get_level(coord->node));
78744 +
78745 +       tree = znode_get_tree(parent);
78746 +       result = 0;
78747 +       /* fast check without taking dk lock. This is safe, because
78748 +        * JNODE_DKSET is never cleared once set. */
78749 +       if (!ZF_ISSET(child, JNODE_DKSET)) {
78750 +               WLOCK_DK(tree);
78751 +               if (likely(!ZF_ISSET(child, JNODE_DKSET))) { /* re-check now that the dk lock is held */
78752 +                       find_child_delimiting_keys(parent, coord,
78753 +                                                  znode_get_ld_key(child),
78754 +                                                  znode_get_rd_key(child));
78755 +                       ZF_SET(child, JNODE_DKSET);
78756 +                       result = 1;
78757 +               }
78758 +               WUNLOCK_DK(tree);
78759 +       }
78760 +       return result;
78761 +}
78762 +
78763 +/* Perform tree lookup at one level. This is called from traverse_tree(),
78764 +   which drives lookup through the tree; it calls cbk_node_lookup() to
78765 +   perform lookup within one node.
78766 +   Returns LOOKUP_REST on -E_DEADLOCK or -E_REPEAT, LOOKUP_DONE on other failures (code in h->result), else the cbk_node_lookup() result.
78767 +   See comments in the code.
78768 +*/
78769 +static level_lookup_result
78770 +cbk_level_lookup(cbk_handle * h /* search handle */ )
78771 +{
78772 +       int ret;
78773 +       int setdk;              /* result of the delimiting-key setup below; non-zero triggers update_stale_dk() */
78774 +       int ldkeyset = 0;       /* 1 once @ldkey was computed from the parent */
78775 +       reiser4_key ldkey;      /* left delimiting key of @active as seen in the parent */
78776 +       reiser4_key key;        /* scratch: right delimiting key first, later leftmost key of @active */
78777 +       znode *active;
78778 +
78779 +       assert("nikita-3025", schedulable());
78780 +
78781 +       /* acquire reference to @active node */
78782 +       active = zget(h->tree, &h->block, h->parent_lh->node, h->level, GFP_KERNEL);
78783 +
78784 +       if (IS_ERR(active)) {
78785 +               h->result = PTR_ERR(active);
78786 +               return LOOKUP_DONE;
78787 +       }
78788 +
78789 +       /* lock @active */
78790 +       h->result = longterm_lock_znode(h->active_lh,
78791 +                                       active,
78792 +                                       cbk_lock_mode(h->level, h),
78793 +                                       ZNODE_LOCK_LOPRI);
78794 +       /* longterm_lock_znode() acquires additional reference to znode (which
78795 +          will be later released by longterm_unlock_znode()). Release
78796 +          reference acquired by zget().
78797 +       */
78798 +       zput(active);
78799 +       if (unlikely(h->result != 0))
78800 +               goto fail_or_restart;
78801 +
78802 +       setdk = 0;
78803 +       /* if @active is accessed for the first time, setup delimiting keys on
78804 +          it. Delimiting keys are taken from the parent node. See
78805 +          setup_delimiting_keys() for details.
78806 +       */
78807 +       if (h->flags & CBK_DKSET) {
78808 +               setdk = setup_delimiting_keys(h);
78809 +               h->flags &= ~CBK_DKSET;
78810 +       } else {
78811 +               znode *parent;
78812 +
78813 +               parent = h->parent_lh->node;
78814 +               h->result = zload(parent);
78815 +               if (unlikely(h->result != 0))
78816 +                       goto fail_or_restart;
78817 +
78818 +               if (!ZF_ISSET(active, JNODE_DKSET))
78819 +                       setdk = set_child_delimiting_keys(parent,
78820 +                                                         h->coord, active);
78821 +               else {
78822 +                       UNDER_RW_VOID(dk, h->tree, read,
78823 +                                     find_child_delimiting_keys(parent,
78824 +                                                                h->coord,
78825 +                                                                &ldkey, &key));
78826 +                       ldkeyset = 1; /* enables the parent/child key consistency check further down */
78827 +               }
78828 +               zrelse(parent);
78829 +       }
78830 +
78831 +       /* this is ugly kludge. Reminder: this is necessary, because
78832 +          ->lookup() method returns coord with ->between field probably set
78833 +          to something different from AT_UNIT.
78834 +       */
78835 +       h->coord->between = AT_UNIT;
78836 +
78837 +       if (znode_just_created(active) && (h->coord->node != NULL)) {
78838 +               WLOCK_TREE(h->tree);
78839 +               /* if we are going to load znode right now, setup
78840 +                  ->in_parent: coord where pointer to this node is stored in
78841 +                  parent.
78842 +               */
78843 +               coord_to_parent_coord(h->coord, &active->in_parent);
78844 +               WUNLOCK_TREE(h->tree);
78845 +       }
78846 +
78847 +       /* check connectedness without holding tree lock---false negatives
78848 +        * will be re-checked by connect_znode(), and false positives are
78849 +        * impossible---@active cannot suddenly turn into unconnected
78850 +        * state. */
78851 +       if (!znode_is_connected(active)) {
78852 +               h->result = connect_znode(h->coord, active);
78853 +               if (unlikely(h->result != 0)) {
78854 +                       put_parent(h);
78855 +                       goto fail_or_restart;
78856 +               }
78857 +       }
78858 +
78859 +       jload_prefetch(ZJNODE(active));
78860 +
78861 +       if (setdk)
78862 +               update_stale_dk(h->tree, active);
78863 +
78864 +       /* put_parent() cannot be called earlier, because connect_znode()
78865 +          assumes parent node is referenced; */
78866 +       put_parent(h);
78867 +
78868 +       if ((!znode_contains_key_lock(active, h->key) &&
78869 +            (h->flags & CBK_TRUST_DK)) || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
78870 +               /* 1. key was moved out of this node while this thread was
78871 +                  waiting for the lock. Restart. More elaborate solution is
78872 +                  to determine where key moved (to the left, or to the right)
78873 +                  and try to follow it through sibling pointers.
78874 +
78875 +                  2. or, node itself is going to be removed from the
78876 +                  tree. Release lock and restart.
78877 +               */
78878 +               if (REISER4_STATS) {
78879 +                       if (znode_contains_key_lock(active, h->key))
78880 +                               reiser4_stat_inc_at_level(h->level, cbk_met_ghost);
78881 +                       else
78882 +                               reiser4_stat_inc_at_level(h->level, cbk_key_moved);
78883 +               }
78884 +               h->result = -E_REPEAT;
78885 +       }
78886 +       if (h->result == -E_REPEAT)
78887 +               return LOOKUP_REST;
78888 +
78889 +       h->result = zload_ra(active, h->ra_info);
78890 +       if (h->result) {
78891 +               return LOOKUP_DONE; /* load failed; the code is left in h->result */
78892 +       }
78893 +
78894 +       /* sanity checks */
78895 +       if (sanity_check(h)) {
78896 +               zrelse(active);
78897 +               return LOOKUP_DONE;
78898 +       }
78899 +
78900 +       /* check that key of leftmost item in the @active is the same as in
78901 +        * its parent */
78902 +       if (ldkeyset && !node_is_empty(active) &&
78903 +           !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
78904 +               warning("vs-3533", "Keys are inconsistent. Fsck?");
78905 +               print_node_content("child", active, ~0);
78906 +               print_key("inparent", &ldkey);
78907 +               print_key("inchild", &key);
78908 +               h->result = RETERR(-EIO);
78909 +               zrelse(active);
78910 +               return LOOKUP_DONE;
78911 +       }
78912 +
78913 +       if (h->object != NULL)
78914 +               handle_vroot(h->object, active);
78915 +
78916 +       ret = cbk_node_lookup(h);
78917 +
78918 +       /* reget @active from handle, because it can change in
78919 +          cbk_node_lookup()  */
78920 +       /*active = h->active_lh->node;*/
78921 +       zrelse(active);
78922 +
78923 +       return ret;
78924 +
78925 +fail_or_restart: /* reached with the failure code already stored in h->result */
78926 +       if (h->result == -E_DEADLOCK)
78927 +               return LOOKUP_REST;
78928 +       return LOOKUP_DONE;
78929 +}
78930 +
78931 +#if REISER4_DEBUG
78932 +/* check left and right delimiting keys of a znode: asserts that @node's ld_key is not greater than its rd_key, and that they match the adjacent keys of connected neighbors that have JNODE_DKSET */
78933 +void
78934 +check_dkeys(const znode *node)
78935 +{
78936 +       znode *left;
78937 +       znode *right;
78938 +
78939 +       RLOCK_DK(current_tree);
78940 +       RLOCK_TREE(current_tree);
78941 +
78942 +       assert("vs-1197", !keygt(&node->ld_key, &node->rd_key));
78943 +
78944 +       left = node->left;
78945 +       right = node->right;
78946 +
78947 +       if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
78948 +           left != NULL && ZF_ISSET(left, JNODE_DKSET))
78949 +               /* check left neighbor */
78950 +               assert("vs-1198", keyeq(&left->rd_key, &node->ld_key));
78951 +
78952 +       if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && right != NULL &&
78953 +           ZF_ISSET(right, JNODE_DKSET))
78954 +               /* check right neighbor */
78955 +               assert("vs-1199", keyeq(&node->rd_key, &right->ld_key));
78956 +
78957 +       RUNLOCK_TREE(current_tree); /* released in reverse order of acquisition */
78958 +       RUNLOCK_DK(current_tree);
78959 +}
78960 +#endif
78961 +
78962 +/* true if @key is left delimiting key of @node. Defined at file scope:
78963 +   a function definition nested inside another function is a GNU C extension. */
78964 +static int
78965 +key_is_ld(znode * node, const reiser4_key * key)
78966 +{
78967 +       int ld;
78968 +
78969 +       assert("nikita-1716", node != NULL);
78970 +       assert("nikita-1758", key != NULL);
78971 +
78972 +       /* delimiting keys are compared between RLOCK_DK/RUNLOCK_DK */
78973 +       RLOCK_DK(znode_get_tree(node));
78974 +       assert("nikita-1759", znode_contains_key(node, key));
78975 +       ld = keyeq(znode_get_ld_key(node), key);
78976 +       RUNLOCK_DK(znode_get_tree(node));
78977 +       return ld;
78978 +}
78979 +
78980 +/* Process one node during tree traversal.
78981 +
78982 +   This is called by cbk_level_lookup() and cbk_cache_scan_slots(). Returns LOOKUP_CONT after moving
78983 +   h->block/h->level one level down; otherwise the search at this node is finished or must restart. */
78984 +static level_lookup_result
78985 +cbk_node_lookup(cbk_handle * h /* search handle */ )
78986 +{
78987 +       node_plugin *nplug;     /* node plugin of @active */
78988 +       item_plugin *iplug;     /* item plugin of item that was found */
78989 +       lookup_bias node_bias;  /* search bias */
78990 +       znode *active;          /* node we are operating upon */
78991 +       reiser4_tree *tree;     /* tree we are searching in */
78992 +       int result;             /* result */
78993 +
78994 +       assert("nikita-379", h != NULL);
78995 +
78996 +       active = h->active_lh->node;
78997 +       tree = h->tree;
78998 +
78999 +       nplug = active->nplug;
79000 +       assert("nikita-380", nplug != NULL);
79001 +
79002 +       ON_DEBUG(check_dkeys(active));
79003 +
79004 +       /* return item from "active" node with maximal key not greater than
79005 +          "key"  */
79006 +       node_bias = h->bias;
79007 +       result = nplug->lookup(active, h->key, node_bias, h->coord);
79008 +       if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
79009 +               /* error occurred */
79010 +               h->result = result;
79011 +               return LOOKUP_DONE;
79012 +       }
79013 +       if (h->level == h->stop_level) {
79014 +               /* welcome to the stop level */
79015 +               assert("nikita-381", h->coord->node == active);
79016 +               if (result == NS_FOUND) {
79017 +                       /* success of tree lookup */
79018 +                       if (!(h->flags & CBK_UNIQUE) && key_is_ld(active, h->key)) {
79019 +                               return search_to_left(h); /* non-unique key equal to the left delimiting key: check the left neighbor too */
79020 +                       } else
79021 +                               h->result = CBK_COORD_FOUND;
79022 +                       reiser4_stat_inc(tree.cbk_found);
79023 +               } else {
79024 +                       h->result = CBK_COORD_NOTFOUND;
79025 +                       reiser4_stat_inc(tree.cbk_notfound);
79026 +               }
79027 +               if (!(h->flags & CBK_IN_CACHE)) /* skipped when the caller set CBK_IN_CACHE */
79028 +                       cbk_cache_add(active);
79029 +               return LOOKUP_DONE;
79030 +       }
79031 +
79032 +       if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
79033 +               h->error = "not found on internal node";
79034 +               h->result = result;
79035 +               return LOOKUP_DONE;
79036 +       }
79037 +
79038 +       assert("vs-361", h->level > h->stop_level);
79039 +
79040 +       if (handle_eottl(h, &result)) {
79041 +               /**/
79042 +               assert("vs-1674", result == LOOKUP_DONE || result == LOOKUP_REST);
79043 +               return result;
79044 +       }
79045 +
79046 +       assert("nikita-2116", item_is_internal(h->coord));
79047 +       iplug = item_plugin_by_coord(h->coord);
79048 +
79049 +       /* go down to next level */
79050 +       assert("vs-515", item_is_internal(h->coord));
79051 +       iplug->s.internal.down_link(h->coord, h->key, &h->block);
79052 +       --h->level;
79053 +       return LOOKUP_CONT;     /* continue */
79054 +}
79055 +
79056 +/* scan cbk_cache slots looking for a match for @h: a node at h->level whose key range covers h->key. On a match the node is locked and searched with cbk_node_lookup(). Returns 0 if that lookup finished without error, non-zero otherwise. */
79057 +static int
79058 +cbk_cache_scan_slots(cbk_handle * h /* cbk handle */ )
79059 +{
79060 +       level_lookup_result llr;
79061 +       znode *node;
79062 +       reiser4_tree *tree;
79063 +       cbk_cache_slot *slot;
79064 +       cbk_cache *cache;
79065 +       tree_level level;
79066 +       int isunique;
79067 +       const reiser4_key *key;
79068 +       int result;
79069 +
79070 +       assert("nikita-1317", h != NULL);
79071 +       assert("nikita-1315", h->tree != NULL);
79072 +       assert("nikita-1316", h->key != NULL);
79073 +
79074 +       tree = h->tree;
79075 +       cache = &tree->cbk_cache;
79076 +       if (cache->nr_slots == 0)
79077 +               /* size of cbk cache was set to 0 by mount time option. */
79078 +               return RETERR(-ENOENT);
79079 +
79080 +       assert("nikita-2474", cbk_cache_invariant(cache));
79081 +       node = NULL;            /* to keep gcc happy */
79082 +       level = h->level;
79083 +       key = h->key;
79084 +       isunique = h->flags & CBK_UNIQUE;
79085 +       result = RETERR(-ENOENT); /* stays -ENOENT unless the scan below finds a candidate */
79086 +
79087 +       /*
79088 +        * this is time-critical function and dragons had, hence, been settled
79089 +        * here.
79090 +        *
79091 +        * Loop below scans cbk cache slots trying to find matching node with
79092 +        * suitable range of delimiting keys and located at the h->level.
79093 +        *
79094 +        * Scan is done under cbk cache spin lock that protects slot->node
79095 +        * pointers. If suitable node is found we want to pin it in
79096 +        * memory. But slot->node can point to the node with x_count 0
79097 +        * (unreferenced). Such node can be recycled at any moment, or can
79098 +        * already be in the process of being recycled (within jput()).
79099 +        *
79100 +        * As we found node in the cbk cache, it means that jput() hasn't yet
79101 +        * called cbk_cache_invalidate().
79102 +        *
79103 +        * We acquire reference to the node without holding tree lock, and
79104 +        * later, check node's RIP bit. This avoids races with jput().
79105 +        *
79106 +        */
79107 +
79108 +       rcu_read_lock();
79109 +       read_lock_cbk_cache(cache);
79110 +       slot = cbk_cache_list_prev(cbk_cache_list_front(&cache->lru)); /* one before the front: the loop advances before it tests */
79111 +       while (1) {
79112 +
79113 +               slot = cbk_cache_list_next(slot);
79114 +
79115 +               if (!cbk_cache_list_end(&cache->lru, slot))
79116 +                       node = slot->node;
79117 +               else
79118 +                       node = NULL;
79119 +
79120 +               if (unlikely(node == NULL)) /* end of list, or a slot with no node */
79121 +                       break;
79122 +
79123 +               /*
79124 +                * this is (hopefully) the only place in the code where we are
79125 +                * working with delimiting keys without holding dk lock. This
79126 +                * is fine here, because this is only "guess" anyway---keys
79127 +                * are rechecked under dk lock below.
79128 +                */
79129 +               if (znode_get_level(node) == level &&
79130 +                   /* min_key < key < max_key */
79131 +                   znode_contains_key_strict(node, key, isunique)) {
79132 +                       zref(node);
79133 +                       result = 0;
79134 +                       spin_lock_prefetch(&tree->tree_lock.lock);
79135 +                       break;
79136 +               }
79137 +       }
79138 +       read_unlock_cbk_cache(cache);
79139 +
79140 +       assert("nikita-2475", cbk_cache_invariant(cache));
79141 +
79142 +       if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
79143 +               result = -ENOENT; /* NOTE(review): the zref() taken above is not dropped on this path -- confirm this is intended for RIP nodes */
79144 +
79145 +       rcu_read_unlock();
79146 +
79147 +       if (result != 0) {
79148 +               h->result = CBK_COORD_NOTFOUND;
79149 +               return RETERR(-ENOENT);
79150 +       }
79151 +
79152 +       result = longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h), ZNODE_LOCK_LOPRI);
79153 +       zput(node); /* drops the reference taken by zref() during the scan */
79154 +       if (result != 0)
79155 +               return result;
79156 +       result = zload(node);
79157 +       if (result != 0)
79158 +               return result; /* NOTE(review): h->active_lh is still locked here; cbk_cache_search() calls done_lh() on failure */
79159 +
79160 +       /* recheck keys */
79161 +       result =
79162 +               UNDER_RW(dk, tree, read,
79163 +                        znode_contains_key_strict(node, key, isunique)) &&
79164 +               !ZF_ISSET(node, JNODE_HEARD_BANSHEE);
79165 +
79166 +       if (result) {
79167 +               /* do lookup inside node */
79168 +               llr = cbk_node_lookup(h);
79169 +               /* if cbk_node_lookup() wandered to another node (due to eottl
79170 +                  or non-unique keys), adjust @node */
79171 +               /*node = h->active_lh->node;*/
79172 +
79173 +               if (llr != LOOKUP_DONE) {
79174 +                       /* restart or continue on the next level */
79175 +                       reiser4_stat_inc(tree.cbk_cache_wrong_node);
79176 +                       result = RETERR(-ENOENT);
79177 +               } else if (IS_CBKERR(h->result))
79178 +                       /* io or oom */
79179 +                       result = RETERR(-ENOENT);
79180 +               else {
79181 +                       /* good. Either item found or definitely not found. */
79182 +                       result = 0;
79183 +
79184 +                       write_lock_cbk_cache(cache);
79185 +                       if (slot->node == h->active_lh->node/*node*/) {
79186 +                               /* if this node is still in cbk cache---move
79187 +                                  its slot to the head of the LRU list. */
79188 +                               cbk_cache_list_remove(slot);
79189 +                               cbk_cache_list_push_front(&cache->lru, slot);
79190 +                       }
79191 +                       write_unlock_cbk_cache(cache);
79192 +               }
79193 +       } else {
79194 +               /* race. While this thread was waiting for the lock, node was
79195 +                  rebalanced and item we are looking for, shifted out of it
79196 +                  (if it ever was here).
79197 +
79198 +                  Continuing scanning is almost hopeless: node key range was
79199 +                  moved to, is almost certainly at the beginning of the LRU
79200 +                  list at this time, because it's hot, but restarting
79201 +                  scanning from the very beginning is complex. Just return,
79202 +                  so that cbk() will be performed. This is not that
79203 +                  important, because such races should be rare. Are they?
79204 +               */
79205 +               reiser4_stat_inc(tree.cbk_cache_race);
79206 +               result = RETERR(-ENOENT);       /* -ERAUGHT */
79207 +       }
79208 +       zrelse(node); /* pairs with the zload() above */
79209 +       assert("nikita-2476", cbk_cache_invariant(cache));
79210 +       return result;
79211 +}
79212 +
79213 +/* look for item with given key in the coord cache
79214 +
79215 +   This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
79216 +   which is a small LRU list of znodes accessed lately. For each znode in
79217 +   this list, it checks whether key we are looking for fits into key
79218 +   range covered by this node. If so, and in addition, node lies at allowed
79219 +   level (this is to handle extents on a twig level), node is locked, and
79220 +   lookup inside it is performed.
79221 +
79222 +   we need a measurement of the cost of this cache search compared to the cost
79223 +   of coord_by_key.
79224 +   Returns 0 if some level produced a hit, otherwise the result of the last cbk_cache_scan_slots() call (0 if the level loop never ran).
79225 +*/
79226 +static int
79227 +cbk_cache_search(cbk_handle * h /* cbk handle */ )
79228 +{
79229 +       int result = 0;
79230 +       tree_level level;
79231 +
79232 +       /* add CBK_IN_CACHE to the handle flags. While it is set,
79233 +        * cbk_node_lookup() skips its cbk_cache_add() call, i.e. it does not
79234 +        * add the found node to the cache again. */
79235 +       h->flags |= CBK_IN_CACHE;
79236 +       for (level = h->stop_level; level <= h->lock_level; ++level) {
79237 +               h->level = level;
79238 +               result = cbk_cache_scan_slots(h);
79239 +               if (result != 0) {
79240 +                       done_lh(h->active_lh); /* release whatever the failed scan left in the lock handles */
79241 +                       done_lh(h->parent_lh);
79242 +                       reiser4_stat_inc(tree.cbk_cache_miss);
79243 +               } else {
79244 +                       assert("nikita-1319", !IS_CBKERR(h->result));
79245 +                       reiser4_stat_inc(tree.cbk_cache_hit);
79246 +                       write_tree_log(h->tree, tree_cached);
79247 +                       break;
79248 +               }
79249 +       }
79250 +       h->flags &= ~CBK_IN_CACHE;
79251 +       return result;
79252 +}
79253 +
79254 +/* type of lock we want to obtain during tree traversal. At levels up to and
79255 +    including h->lock_level we want type of lock user asked for (h->lock_mode), on upper levels: read lock. */
79256 +reiser4_internal znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
79257 +{
79258 +       assert("nikita-382", h != NULL);
79259 +
79260 +       return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
79261 +}
79262 +
79263 +/* update outdated delimiting keys: if @node has a connected right neighbor whose left delimiting key differs from @node's right delimiting key, copy the neighbor's key into @node's right delimiting key. Runs under WLOCK_DK and RLOCK_TREE. */
79264 +static void stale_dk(reiser4_tree *tree, znode *node)
79265 +{
79266 +       znode *right;
79267 +
79268 +       WLOCK_DK(tree);
79269 +       RLOCK_TREE(tree);
79270 +       right = node->right;
79271 +
79272 +       if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && right &&
79273 +           !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
79274 +               znode_set_rd_key(node, znode_get_ld_key(right));
79275 +
79276 +       RUNLOCK_TREE(tree);
79277 +       WUNLOCK_DK(tree);
79278 +}
79279 +
79280 +/* check for possibly outdated delimiting keys, and update them if
79281 + * necessary. The check runs under RLOCK_DK/RLOCK_TREE; on a mismatch both are dropped and stale_dk() repeats the test under WLOCK_DK before writing. */
79282 +static void update_stale_dk(reiser4_tree *tree, znode *node)
79283 +{
79284 +       znode *right;
79285 +       reiser4_key rd; /* local copy of @node's right delimiting key */
79286 +
79287 +       RLOCK_DK(tree);
79288 +       rd = *znode_get_rd_key(node);
79289 +       RLOCK_TREE(tree);
79290 +       right = node->right;
79291 +       if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && right &&
79292 +                    !keyeq(&rd, znode_get_ld_key(right)))) {
79293 +               RUNLOCK_TREE(tree);
79294 +               RUNLOCK_DK(tree);
79295 +               stale_dk(tree, node); /* called only after both read locks are released */
79296 +               return;
79297 +       }
79298 +       RUNLOCK_TREE(tree);
79299 +       RUNLOCK_DK(tree);
79300 +}
79301 +
79302 +/*
79303 + * handle searches for a non-unique key.
79304 + *
79305 + * Suppose that we are looking for an item with possibly non-unique key 100.
79306 + *
79307 + * Root node contains two pointers: one to a node with left delimiting key 0,
79308 + * and another to a node with left delimiting key 100. Item we interested in
79309 + * may well happen in the sub-tree rooted at the first pointer.
79310 + *
79311 + * To handle this search_to_left() is called when search reaches stop
79312 + * level. This function checks it is _possible_ that item we are looking for
79313 + * is in the left neighbor (this can be done by comparing delimiting keys) and
79314 + * if so, tries to lock left neighbor (this is low priority lock, so it can
79315 + * deadlock, tree traversal is just restarted if it did) and then checks
79316 + * whether left neighbor actually contains items with our key.
79317 + *
79318 + * Note that this is done on the stop level only. It is possible to try such
79319 + * left-check on each level, but as duplicate keys are supposed to be rare
79320 + * (very unlikely that more than one node is completely filled with items with
79321 + * duplicate keys), it is cheaper to scan to the left on the stop level once.
79322 + *
79323 + */
79324 +static level_lookup_result
79325 +search_to_left(cbk_handle * h /* search handle; asserted non-NULL with h->level == h->stop_level */ )
79326 +{
79327 +       level_lookup_result result;
79328 +       coord_t *coord;
79329 +       znode *node;
79330 +       znode *neighbor;
79331 +
79332 +       lock_handle lh; /* lock handle for the left neighbor; finished by done_lh() on every path */
79333 +
79334 +       assert("nikita-1761", h != NULL);
79335 +       assert("nikita-1762", h->level == h->stop_level);
79336 +
79337 +       init_lh(&lh);
79338 +       coord = h->coord;
79339 +       node = h->active_lh->node;
79340 +       assert("nikita-1763", coord_is_leftmost_unit(coord));
79341 +
79342 +       reiser4_stat_inc(tree.check_left_nonuniq);
79343 +       h->result = reiser4_get_left_neighbor(
79344 +               &lh, node, (int) h->lock_mode, GN_CAN_USE_UPPER_LEVELS);
79345 +       neighbor = NULL; /* stays NULL unless case 0 is entered; tested before zrelse() below */
79346 +       switch (h->result) {
79347 +       case -E_DEADLOCK: /* caller is told to restart (LOOKUP_REST) */
79348 +               result = LOOKUP_REST;
79349 +               break;
79350 +       case 0:{ /* neighbor obtained: look the key up in it */
79351 +                       node_plugin *nplug;
79352 +                       coord_t crd;
79353 +                       lookup_bias bias;
79354 +
79355 +                       neighbor = lh.node;
79356 +                       h->result = zload(neighbor);
79357 +                       if (h->result != 0) {
79358 +                               result = LOOKUP_DONE;
79359 +                               break; /* leaves the switch, so the zrelse() at the end of this case is skipped */
79360 +                       }
79361 +
79362 +                       nplug = neighbor->nplug;
79363 +
79364 +                       coord_init_zero(&crd);
79365 +                       bias = h->bias; /* saved: lookup in the neighbor is forced to FIND_EXACT, then restored */
79366 +                       h->bias = FIND_EXACT;
79367 +                       h->result = nplug->lookup(neighbor, h->key, h->bias, &crd);
79368 +                       h->bias = bias;
79369 +
79370 +                       if (h->result == NS_NOT_FOUND) {
79371 +       case -E_NO_NEIGHBOR: /* NOTE: label is inside the if-body above, so "no neighbor" shares the "not found in neighbor" handling */
79372 +                               h->result = CBK_COORD_FOUND;
79373 +                               reiser4_stat_inc(tree.cbk_found);
79374 +                               if (!(h->flags & CBK_IN_CACHE))
79375 +                                       cbk_cache_add(node);
79376 +       default:                /* some other error: h->result is left as returned; also reached by fall-through from above */
79377 +                               result = LOOKUP_DONE;
79378 +                       } else if (h->result == NS_FOUND) {
79379 +                               reiser4_stat_inc(tree.left_nonuniq_found);
79380 +
79381 +                               RLOCK_DK(znode_get_tree(neighbor));
79382 +                               h->rd_key = *znode_get_ld_key(node);
79383 +                               leftmost_key_in_node(neighbor, &h->ld_key);
79384 +                               RUNLOCK_DK(znode_get_tree(neighbor));
79385 +                               h->flags |= CBK_DKSET;
79386 +
79387 +                               h->block = *znode_get_block(neighbor);
79388 +                               /* clear coord -> node so that cbk_level_lookup()
79389 +                                  wouldn't overwrite parent hint in neighbor.
79390 +
79391 +                                  Parent hint was set up by
79392 +                                  reiser4_get_left_neighbor()
79393 +                               */
79394 +                               UNDER_RW_VOID(tree, znode_get_tree(neighbor), write,
79395 +                                             h->coord->node = NULL);
79396 +                               result = LOOKUP_CONT;
79397 +                       } else {
79398 +                               result = LOOKUP_DONE;
79399 +                       }
79400 +                       if (neighbor != NULL) /* NULL when entered through the -E_NO_NEIGHBOR / default labels */
79401 +                               zrelse(neighbor);
79402 +               }
79403 +       }
79404 +       done_lh(&lh);
79405 +       return result;
79406 +}
79407 +
79408 +/* debugging aid: return symbolic name of search bias. Known values map to string literals; any other value is formatted into a function-static buffer that the next such call overwrites. */
79409 +reiser4_internal const char *
79410 +bias_name(lookup_bias bias /* bias to get name of */ )
79411 +{
79412 +       if (bias == FIND_EXACT)
79413 +               return "exact";
79414 +       else if (bias == FIND_MAX_NOT_MORE_THAN)
79415 +               return "left-slant";
79416 +/*     else if( bias == RIGHT_SLANT_BIAS ) */
79417 +/*             return "right-bias"; */
79418 +       else {
79419 +               static char buf[30]; /* shared between calls: result is only valid until the next unknown-bias call */
79420 +
79421 +               sprintf(buf, "unknown: %i", bias);
79422 +               return buf;
79423 +       }
79424 +}
79425 +
79426 +#if REISER4_DEBUG_OUTPUT
79427 +/* debugging aid: print human readable information about @p. Prints "<prefix>: null" and returns when @p is NULL; znode_is_loaded() is only called when @p->node is non-NULL. */
79428 +reiser4_internal void
79429 +print_coord_content(const char *prefix /* prefix to print */ ,
79430 +                   coord_t * p /* coord to print, may be NULL */ )
79431 +{
79432 +       reiser4_key key;
79433 +
79434 +       if (p == NULL) {
79435 +               printk("%s: null\n", prefix);
79436 +               return;
79437 +       }
79438 +       if ((p->node != NULL) && znode_is_loaded(p->node) && coord_is_existing_item(p))
79439 +               printk("%s: data: %p, length: %i\n", prefix, item_body_by_coord(p), item_length_by_coord(p));
79440 +       print_znode(prefix, p->node); /* NOTE(review): still called with a NULL node, as before; presumably print_znode() copes - confirm */
79441 +       if ((p->node != NULL) && znode_is_loaded(p->node)) { /* same NULL guard as the test above; previously a NULL node reached znode_is_loaded() */
79442 +               item_key_by_coord(p, &key);
79443 +               print_key(prefix, &key);
79444 +               print_plugin(prefix, item_plugin_to_plugin(item_plugin_by_coord(p)));
79445 +       }
79446 +}
79447 +
79448 +/* debugging aid: print human readable information about @block as "<prefix>: <address>", formatted by sprint_address() */
79449 +reiser4_internal void
79450 +print_address(const char *prefix /* prefix to print */ ,
79451 +             const reiser4_block_nr * block /* block number to print */ )
79452 +{
79453 +       printk("%s: %s\n", prefix, sprint_address(block));
79454 +}
79455 +#endif
79456 +
79457 +/* return string containing human readable representation of @block: "null" for a NULL pointer, hex for blocks for which blocknr_is_fake() is true, decimal otherwise. The string lives in a function-static buffer overwritten by every call. */
79458 +reiser4_internal char *
79459 +sprint_address(const reiser4_block_nr * block /* block number to print */ )
79460 +{
79461 +       static char address[30]; /* shared between calls; copy the result if it must survive the next call */
79462 +
79463 +       if (block == NULL)
79464 +               sprintf(address, "null");
79465 +       else if (blocknr_is_fake(block))
79466 +               sprintf(address, "%llx", *block);
79467 +       else
79468 +               sprintf(address, "%llu", *block);
79469 +       return address;
79470 +}
79471 +
79472 +/* release parent node during traversal: long-term unlock h->parent_lh, but only if it currently refers to a node */
79473 +static void
79474 +put_parent(cbk_handle * h /* search handle */ )
79475 +{
79476 +       assert("nikita-383", h != NULL);
79477 +       if (h->parent_lh->node != NULL) { /* nothing to unlock when the handle holds no node */
79478 +               longterm_unlock_znode(h->parent_lh);
79479 +       }
79480 +}
79481 +
79482 +/* helper function used by coord_by_key(): finish both lock handles stored
79483 +   in the search handle (parent and active) by calling done_lh() on each. */
79484 +static void
79485 +hput(cbk_handle * h /* search handle */ )
79486 +{
79487 +       assert("nikita-385", h != NULL);
79488 +       done_lh(h->parent_lh);
79489 +       done_lh(h->active_lh);
79490 +}
79491 +
79492 +/* Helper function used by cbk(): update delimiting keys of child node (stored
79493 +   in h->active_lh->node) using h->ld_key / h->rd_key. Returns 1 if the unlocked check found JNODE_DKSET clear, 0 if it was already set. */
79494 +static int
79495 +setup_delimiting_keys(cbk_handle * h /* search handle */)
79496 +{
79497 +       znode *active;
79498 +       reiser4_tree *tree;
79499 +
79500 +       assert("nikita-1088", h != NULL);
79501 +
79502 +       active = h->active_lh->node;
79503 +       tree = znode_get_tree(active);
79504 +       /* fast check without taking dk lock. This is safe, because
79505 +        * JNODE_DKSET is never cleared once set. */
79506 +       if (!ZF_ISSET(active, JNODE_DKSET)) {
79507 +               WLOCK_DK(tree);
79508 +               if (!ZF_ISSET(active, JNODE_DKSET)) { /* re-check under the dk write lock before writing the keys */
79509 +                       znode_set_ld_key(active, &h->ld_key);
79510 +                       znode_set_rd_key(active, &h->rd_key);
79511 +                       ZF_SET(active, JNODE_DKSET);
79512 +               }
79513 +               WUNLOCK_DK(tree);
79514 +               return 1; /* returned even when the locked re-check found the flag already set */
79515 +       }
79516 +       return 0;
79517 +}
79518 +
79519 +/* true if @block makes sense for the @tree. Used to detect corrupted node
79520 + * pointers. The decision is delegated to reiser4_blocknr_is_sane_for() on tree->super. */
79521 +static int
79522 +block_nr_is_correct(reiser4_block_nr * block   /* block number to check */ ,
79523 +                   reiser4_tree * tree /* tree to check against */ )
79524 +{
79525 +       assert("nikita-757", block != NULL);
79526 +       assert("nikita-758", tree != NULL);
79527 +
79528 +       /* check to see if it exceeds the size of the device. */
79529 +       return reiser4_blocknr_is_sane_for(tree->super, block);
79530 +}
79531 +
79532 +/* check consistency of fields in the search handle. Returns 0 when they are consistent; otherwise sets h->error and h->result (-EIO through RETERR()) and returns LOOKUP_DONE. */
79533 +static int
79534 +sanity_check(cbk_handle * h /* search handle */ )
79535 +{
79536 +       assert("nikita-384", h != NULL);
79537 +
79538 +       if (h->level < h->stop_level) { /* current level is below the level the search was asked to stop at */
79539 +               h->error = "Buried under leaves";
79540 +               h->result = RETERR(-EIO);
79541 +               return LOOKUP_DONE;
79542 +       } else if (!block_nr_is_correct(&h->block, h->tree)) { /* block number rejected for this tree */
79543 +               h->error = "bad block number";
79544 +               h->result = RETERR(-EIO);
79545 +               return LOOKUP_DONE;
79546 +       } else
79547 +               return 0;
79548 +}
79549 +
79550 +
79551 +/* Make Linus happy.
79552 +   Local variables:
79553 +   c-indentation-style: "K&R"
79554 +   mode-name: "LC"
79555 +   c-basic-offset: 8
79556 +   tab-width: 8
79557 +   fill-column: 120
79558 +   scroll-step: 1
79559 +   End:
79560 +*/
79561 diff -rupN linux-2.6.8-rc3/fs/reiser4/spin_macros.h linux-2.6.8-rc3-a/fs/reiser4/spin_macros.h
79562 --- linux-2.6.8-rc3/fs/reiser4/spin_macros.h    1970-01-01 03:00:00.000000000 +0300
79563 +++ linux-2.6.8-rc3-a/fs/reiser4/spin_macros.h  2004-08-05 21:20:52.940690584 +0400
79564 @@ -0,0 +1,839 @@
79565 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
79566 +
79567 +/* Wrapper functions/macros for spin locks. */
79568 +
79569 +/*
79570 + * This file implements wrapper functions and macros to work with spin locks
79571 + * and read write locks embedded into kernel objects. Wrapper functions
79572 + * provide the following functionality:
79573 + *
79574 + *    (1) encapsulation of locks: instead of writing spin_lock(&obj->lock),
79575 + *    where obj is object of type foo, one writes spin_lock_foo(obj).
79576 + *
79577 + *    (2) optional keeping (in per-thread reiser4_context->locks) information
79578 + *    about number of locks of particular type currently held by thread. This
79579 + *    is done if REISER4_DEBUG is on.
79580 + *
79581 + *    (3) optional checking of lock ordering. For object type foo, it is
79582 + *    possible to provide "lock ordering predicate" (possibly using
79583 + *    information stored in reiser4_context->locks) checking that locks are
79584 + *    acquired in the proper order. This is done if REISER4_DEBUG is on.
79585 + *
79586 + *    (4) optional collection of spin lock contention statistics. In this mode
79587 + *    two sysfs objects (located in /sys/profregion) are associated with each
79588 + *    spin lock type. One object (foo_t) shows how much time was spent trying
79589 + *    to acquire spin locks of foo type. Another (foo_h) shows how much time
79590 + *    spin locks of the type foo were held locked. See spinprof.h for more
79591 + *    details on this.
79592 + *
79593 + */
79594 +
79595 +#ifndef __SPIN_MACROS_H__
79596 +#define __SPIN_MACROS_H__
79597 +
79598 +#include <linux/spinlock.h>
79599 +#include <linux/profile.h>
79600 +
79601 +#include "debug.h"
79602 +#include "spinprof.h"
79603 +
79604 +/* Checks that read write lock @s is locked (or not) by the -current-
79605 + * thread. Not yet implemented: each macro evaluates @s and yields 1. */
79606 +#define check_is_write_locked(s)     ((void)(s), 1)
79607 +#define check_is_read_locked(s)      ((void)(s), 1)
79608 +#define check_is_not_read_locked(s)  ((void)(s), 1)
79609 +#define check_is_not_write_locked(s) ((void)(s), 1)
79610 +
79611 +/* Checks that spin lock @s is locked (or not) by the -current- thread. Real ownership checks exist only with CONFIG_DEBUG_SPINLOCK && CONFIG_SMP; otherwise most of these yield 1. */
79612 +#if defined(CONFIG_DEBUG_SPINLOCK) && defined(CONFIG_SMP)
79613 +     /* Spin lock debugging in the kernel. This depends on patch that adds
79614 +      * ->owner field to the spin lock. */
79615 +#    define check_spin_is_not_locked(s) ((s)->owner != get_current())
79616 +#    define spin_is_not_locked(s)       ((s)->owner == NULL)
79617 +#    define check_spin_is_locked(s)     ((s)->owner == get_current())
79618 +#else /* no ->owner field to consult */
79619 +#    define check_spin_is_not_locked(s) ((void)(s), 1)
79620 +#    define spin_is_not_locked(s)       ((void)(s), 1)
79621 +#    if defined(CONFIG_SMP) /* NOTE(review): spin_is_locked() presumably reports "held by anyone", not "held by current" - confirm */
79622 +#        define check_spin_is_locked(s)     spin_is_locked(s)
79623 +#    else
79624 +#        define check_spin_is_locked(s)     ((void)(s), 1)
79625 +#    endif
79626 +#endif
79627 +
79628 +#if REISER4_DEBUG_SPIN_LOCKS /* __ODCA(label, expr): assert(label, expr) wrapped in ON_DEBUG_CONTEXT(); expands to noop when spin lock debugging is off */
79629 +#define __ODCA(l, e) ON_DEBUG_CONTEXT(assert(l, e))
79630 +#else
79631 +#define __ODCA(l, e) noop
79632 +#endif
79633 +/* REISER4_LOCKPROF_OBJECTS: when non-zero (and REISER4_LOCKPROF is on), per-object counters are embedded in reiser4_spin_data / reiser4_rw_data and passed through OBJCNT(). Fixed to 0 here. */
79634 +#define REISER4_LOCKPROF_OBJECTS (0)
79635 +
79636 +#if REISER4_LOCKPROF
79637 +
79638 +/*
79639 + * If spin lock profiling is on, define profregions (see spinprof.[ch])
79640 + * exporting through sysfs information about spin lock contention. With each
79641 + * spin lock type two profregions are associated: "held" region (exported as
79642 + * /sys/profregion/foo_h), and "trying" region (exported as
79643 + * /sys/profregion/foo_t).
79644 + */
79645 +
79646 +/*
79647 + * Given spin lock type @aname, defines profregions pregion_spin_<aname>_held ("<aname>_h") and pregion_spin_<aname>_trying ("<aname>_t"), plus register_<aname>_profregion() and unregister_<aname>_profregion().
79648 + * The expansion ends in a typedef without a terminating semicolon, so the invocation must supply one. If registering the second region fails, the first is left registered.
79649 + */
79650 +#define DEFINE_SPIN_PROFREGIONS(aname)                                         \
79651 +struct profregion pregion_spin_ ## aname ## _held = {                          \
79652 +       .kobj = {                                                               \
79653 +               .name = #aname  "_h"                                            \
79654 +       }                                                                       \
79655 +};                                                                             \
79656 +                                                                               \
79657 +struct profregion pregion_spin_ ## aname ## _trying = {                        \
79658 +       .kobj = {                                                               \
79659 +               .name = #aname  "_t"                                            \
79660 +       }                                                                       \
79661 +};                                                                             \
79662 +                                                                               \
79663 +static inline int register_ ## aname ## _profregion(void)                      \
79664 +{                                                                              \
79665 +       int result;                                                             \
79666 +                                                                               \
79667 +       result = profregion_register(&pregion_spin_ ## aname ## _held);         \
79668 +       if (result != 0)                                                        \
79669 +               return result;                                                  \
79670 +       result = profregion_register(&pregion_spin_ ## aname ## _trying);       \
79671 +       return result;                                                          \
79672 +}                                                                              \
79673 +                                                                               \
79674 +static inline void unregister_ ## aname ## _profregion(void)                   \
79675 +{                                                                              \
79676 +       profregion_unregister(&pregion_spin_ ## aname ## _held);                \
79677 +       profregion_unregister(&pregion_spin_ ## aname ## _trying);              \
79678 +}                                                                              \
79679 +                                                                               \
79680 +typedef struct { int foo; } aname ## _spin_dummy_profregion
79681 +/* extern declarations for the two profregions that DEFINE_SPIN_PROFREGIONS(NAME) defines; the expansion already ends with a semicolon */
79682 +#define DECLARE_SPIN_PROFREGIONS(NAME)                         \
79683 +extern struct profregion pregion_spin_ ## NAME ## _held;       \
79684 +extern struct profregion pregion_spin_ ## NAME ## _trying;
79685 +
79686 +/*
79687 + * If spin lock profiling is on, define profregions (see spinprof.[ch])
79688 + * exporting through sysfs information about read write lock contention. With
79689 + * each read write lock type four profregions are associated: "read held" and
79690 + * "write held" regions, and "read trying" and "write trying" regions,
79691 + * exported as /sys/profregion/foo_{r,w}_{t,h}.
79692 + */
79693 +
79694 +
79695 +/*
79696 + * Given read write lock type @aname, defines four profregions pregion_rw_<aname>_{r,w}_{held,trying} (named "<aname>_{r,w}_{h,t}"), plus register_<aname>_profregion() and unregister_<aname>_profregion().
79697 + * Registration stops at the first failure and returns its code without unregistering the regions already registered. The expansion ends in a typedef without a terminating semicolon.
79698 + */
79699 +#define DEFINE_RW_PROFREGIONS(aname)                                           \
79700 +struct profregion pregion_rw_ ## aname ## _r_held = {                          \
79701 +       .kobj = {                                                               \
79702 +               .name = #aname  "_r_h"                                          \
79703 +       }                                                                       \
79704 +};                                                                             \
79705 +                                                                               \
79706 +struct profregion pregion_rw_ ## aname ## _w_held = {                          \
79707 +       .kobj = {                                                               \
79708 +               .name = #aname  "_w_h"                                          \
79709 +       }                                                                       \
79710 +};                                                                             \
79711 +                                                                               \
79712 +struct profregion pregion_rw_ ## aname ## _r_trying = {                        \
79713 +       .kobj = {                                                               \
79714 +               .name = #aname  "_r_t"                                          \
79715 +       }                                                                       \
79716 +};                                                                             \
79717 +                                                                               \
79718 +struct profregion pregion_rw_ ## aname ## _w_trying = {                        \
79719 +       .kobj = {                                                               \
79720 +               .name = #aname  "_w_t"                                          \
79721 +       }                                                                       \
79722 +};                                                                             \
79723 +                                                                               \
79724 +static inline int register_ ## aname ## _profregion(void)                      \
79725 +{                                                                              \
79726 +       int result;                                                             \
79727 +                                                                               \
79728 +       result = profregion_register(&pregion_rw_ ## aname ## _r_held);         \
79729 +       if (result != 0)                                                        \
79730 +               return result;                                                  \
79731 +       result = profregion_register(&pregion_rw_ ## aname ## _w_held);         \
79732 +       if (result != 0)                                                        \
79733 +               return result;                                                  \
79734 +       result = profregion_register(&pregion_rw_ ## aname ## _r_trying);       \
79735 +       if (result != 0)                                                        \
79736 +               return result;                                                  \
79737 +       result = profregion_register(&pregion_rw_ ## aname ## _w_trying);       \
79738 +       return result;                                                          \
79739 +}                                                                              \
79740 +                                                                               \
79741 +static inline void unregister_ ## aname ## _profregion(void)                   \
79742 +{                                                                              \
79743 +       profregion_unregister(&pregion_rw_ ## aname ## _r_held);                \
79744 +       profregion_unregister(&pregion_rw_ ## aname ## _w_held);                \
79745 +       profregion_unregister(&pregion_rw_ ## aname ## _r_trying);              \
79746 +       profregion_unregister(&pregion_rw_ ## aname ## _w_trying);              \
79747 +}                                                                              \
79748 +                                                                               \
79749 +typedef struct { int foo; } aname ## _rw_dummy_profregion
79750 +/* extern declarations for the four profregions that DEFINE_RW_PROFREGIONS(NAME) defines; the expansion already ends with a semicolon */
79751 +#define DECLARE_RW_PROFREGIONS(NAME)                           \
79752 +extern struct profregion pregion_rw_ ## NAME ## _r_held;       \
79753 +extern struct profregion pregion_rw_ ## NAME ## _w_held;       \
79754 +extern struct profregion pregion_rw_ ## NAME ## _r_trying;     \
79755 +extern struct profregion pregion_rw_ ## NAME ## _w_trying;
79756 +
79757 +#if REISER4_LOCKPROF_OBJECTS /* OBJCNT(field): pass the per-object counter through, or replace it with NULL when per-object counters are disabled */
79758 +#define OBJCNT(field) field
79759 +#else
79760 +#define OBJCNT(field) (NULL)
79761 +#endif
79762 +
79763 +/*
79764 + * Helper macros to enter and leave profiling regions. GETCPU() declares a local int initialized from get_cpu() (no trailing semicolon); PUTCPU() calls put_cpu(); PREG_* forward to profregion_in/replace/ex(), filtering @objloc through OBJCNT().
79765 + */
79766 +
79767 +#define GETCPU(cpu)                            \
79768 +       int cpu = get_cpu()
79769 +
79770 +#define PUTCPU(cpu) put_cpu()
79771 +
79772 +#define PREG_IN(cpu, preg, objloc, codeloc)                            \
79773 +       profregion_in(cpu, preg, OBJCNT(objloc), codeloc)
79774 +
79775 +#define PREG_REPLACE(cpu, preg, objloc, codeloc)                       \
79776 +       profregion_replace(cpu, preg, OBJCNT(objloc), codeloc)
79777 +
79778 +#define PREG_EX(cpu, preg) profregion_ex(cpu, preg)
79779 +
79780 +/* REISER4_LOCKPROF */
79781 +#else
79782 +
79783 +/*
79784 + * If spin lock profiling is disabled, declare everything to noops: register_<aname>_profregion() returns 0, unregister does nothing, DECLARE_* and the GETCPU/PUTCPU/PREG_* helpers expand to nothing. Here DEFINE_* ends with a function body, not the dummy typedef of the profiling variant.
79785 + */
79786 +
79787 +#define DEFINE_SPIN_PROFREGIONS(aname)                         \
79788 +static inline int register_ ## aname ## _profregion(void)      \
79789 +{                                                              \
79790 +       return 0;                                               \
79791 +}                                                              \
79792 +                                                               \
79793 +static inline void unregister_ ## aname ## _profregion(void)   \
79794 +{                                                              \
79795 +}
79796 +
79797 +#define DECLARE_SPIN_PROFREGIONS(NAME)
79798 +
79799 +#define DEFINE_RW_PROFREGIONS(aname)                           \
79800 +static inline int register_ ## aname ## _profregion(void)      \
79801 +{                                                              \
79802 +       return 0;                                               \
79803 +}                                                              \
79804 +                                                               \
79805 +static inline void unregister_ ## aname ## _profregion(void)   \
79806 +{                                                              \
79807 +}
79808 +
79809 +#define DECLARE_RW_PROFREGIONS(NAME)
79810 +
79811 +#define GETCPU(cpu)
79812 +#define PUTCPU(cpu)
79813 +#define PREG_IN(cpu, preg, objloc, codeloc)
79814 +#define PREG_REPLACE(cpu, preg, objloc, codeloc)
79815 +#define PREG_EX(cpu, preg)
79816 +
79817 +/* REISER4_LOCKPROF */
79818 +#endif
79819 +
79820 +/*
79821 + * Data structure embedded into kernel objects together with spin lock. Holds only the lock unless both REISER4_LOCKPROF and REISER4_LOCKPROF_OBJECTS are enabled.
79822 + */
79823 +typedef struct reiser4_spin_data {
79824 +       /* spin lock proper */
79825 +       spinlock_t lock;
79826 +#if REISER4_LOCKPROF && REISER4_LOCKPROF_OBJECTS
79827 +       /* number of times the clock interrupt found the spin lock of this
79828 +        * object to be held */
79829 +       int        held;
79830 +       /* number of times the clock interrupt found the current thread trying
79831 +        * to acquire this spin lock */
79832 +       int        trying;
79833 +#endif
79834 +} reiser4_spin_data;
79835 +
79836 +/*
79837 + * Data structure embedded into kernel objects together with read write lock. Holds only the lock unless both REISER4_LOCKPROF and REISER4_LOCKPROF_OBJECTS are enabled.
79838 + */
79839 +typedef struct reiser4_rw_data {
79840 +       /* read write lock proper */
79841 +       rwlock_t lock;
79842 +#if REISER4_LOCKPROF && REISER4_LOCKPROF_OBJECTS
79843 +       /* number of times the clock interrupt found the read write lock of
79844 +        * this object to be read held */
79845 +       int      r_held;
79846 +       /* number of times the clock interrupt found the current thread trying
79847 +        * to acquire this lock for read */
79848 +       int      r_trying;
79849 +       /* number of times the clock interrupt found the read write lock of
79850 +        * this object to be write held */
79851 +       int      w_held;
79852 +       /* number of times the clock interrupt found the current thread trying
79853 +        * to acquire this lock for write */
79854 +       int      w_trying;
79855 +#endif
79856 +} reiser4_rw_data;
79857 +
79858 +/* Define several inline functions for each type of spinlock. This is long
79859 + * monster macro definition. */
79860 +#define SPIN_LOCK_FUNCTIONS(NAME,TYPE,FIELD)                                   \
79861 +                                                                               \
79862 +DECLARE_SPIN_PROFREGIONS(NAME)                                                 \
79863 +                                                                               \
79864 +/* Initialize spin lock embedded in @x                 */                      \
79865 +static inline void spin_ ## NAME ## _init(TYPE *x)                             \
79866 +{                                                                              \
79867 +       __ODCA("nikita-2987", x != NULL);                                       \
79868 +       memset(& x->FIELD, 0, sizeof x->FIELD);                                 \
79869 +       spin_lock_init(& x->FIELD.lock);                                        \
79870 +}                                                                              \
79871 +                                                                               \
79872 +/* Increment per-thread lock counter for this lock type and total counter */   \
79873 +/* of acquired spin locks. This is helper function used by spin lock      */   \
79874 +/* acquiring functions below                                              */   \
79875 +static inline void spin_ ## NAME ## _inc(void)                                 \
79876 +{                                                                              \
79877 +       LOCK_CNT_INC(spin_locked_ ## NAME);                                     \
79878 +       LOCK_CNT_INC(spin_locked);                                              \
79879 +}                                                                              \
79880 +                                                                               \
79881 +/* Decrement per-thread lock counter and total counter of acquired spin   */   \
79882 +/* locks. This is helper function used by spin lock releasing functions   */   \
79883 +/* below.                                                                 */   \
79884 +static inline void spin_ ## NAME ## _dec(void)                                 \
79885 +{                                                                              \
79886 +       LOCK_CNT_DEC(spin_locked_ ## NAME);                                     \
79887 +       LOCK_CNT_DEC(spin_locked);                                              \
79888 +}                                                                              \
79889 +                                                                               \
79890 +/* Return true of spin lock embedded in @x is acquired by -current-       */   \
79891 +/* thread                                                                 */   \
79892 +static inline int  spin_ ## NAME ## _is_locked (const TYPE *x)                 \
79893 +{                                                                              \
79894 +       return check_spin_is_locked (& x->FIELD.lock) &&                        \
79895 +              LOCK_CNT_GTZ(spin_locked_ ## NAME);                              \
79896 +}                                                                              \
79897 +                                                                               \
79898 +/* Return true of spin lock embedded in @x is not acquired by -current-   */   \
79899 +/* thread                                                                 */   \
79900 +static inline int  spin_ ## NAME ## _is_not_locked (TYPE *x)                   \
79901 +{                                                                              \
79902 +       return check_spin_is_not_locked (& x->FIELD.lock);                      \
79903 +}                                                                              \
79904 +                                                                               \
79905 +/* Acquire spin lock embedded in @x without checking lock ordering.       */   \
79906 +/* This is useful when, for example, locking just created object.         */   \
79907 +static inline void spin_lock_ ## NAME ## _no_ord (TYPE *x,                     \
79908 +                                                 locksite *t, locksite *h)     \
79909 +{                                                                              \
79910 +       GETCPU(cpu);                                                            \
79911 +       __ODCA("nikita-2703", spin_ ## NAME ## _is_not_locked(x));              \
79912 +       PREG_IN(cpu, &pregion_spin_ ## NAME ## _trying, &x->FIELD.trying, t);   \
79913 +       spin_lock(&x->FIELD.lock);                                              \
79914 +       PREG_REPLACE(cpu,                                                       \
79915 +                    &pregion_spin_ ## NAME ## _held, &x->FIELD.held, h);       \
79916 +       PUTCPU(cpu);                                                            \
79917 +       spin_ ## NAME ## _inc();                                                \
79918 +}                                                                              \
79919 +                                                                               \
79920 +/* Account for spin lock acquired by some other means. For example        */   \
79921 +/* through atomic_dec_and_lock() or similar.                              */   \
79922 +static inline void spin_lock_ ## NAME ## _acc (TYPE *x, locksite *h)           \
79923 +{                                                                              \
79924 +       GETCPU(cpu);                                                            \
79925 +       PREG_IN(cpu, &pregion_spin_ ## NAME ## _held, &x->FIELD.held, h);       \
79926 +       PUTCPU(cpu);                                                            \
79927 +       spin_ ## NAME ## _inc();                                                \
79928 +}                                                                              \
79929 +                                                                               \
79930 +/* Lock @x with explicit indication of spin lock profiling "sites".       */   \
79931 +/* Locksite is used by spin lock profiling code (spinprof.[ch]) to        */   \
79932 +/* identify fragment of code that locks @x.                               */   \
79933 +/*                                                                        */   \
79934 +/* If clock interrupt finds that current thread is spinning waiting for   */   \
79935 +/* the lock on @x, counters in @t will be incremented.                    */   \
79936 +/*                                                                        */   \
79937 +/* If clock interrupt finds that current thread holds the lock on @x,     */   \
79938 +/* counters in @h will be incremented.                                    */   \
79939 +/*                                                                        */   \
79940 +static inline void spin_lock_ ## NAME ## _at (TYPE *x,                                 \
79941 +                                             locksite *t, locksite *h)         \
79942 +{                                                                              \
79943 +       __ODCA("nikita-1383", spin_ordering_pred_ ## NAME(x));                  \
79944 +       spin_lock_ ## NAME ## _no_ord(x, t, h);                                 \
79945 +}                                                                              \
79946 +                                                                               \
79947 +/* Lock @x.                                                               */   \
79948 +static inline void spin_lock_ ## NAME (TYPE *x)                                        \
79949 +{                                                                              \
79950 +       __ODCA("nikita-1383", spin_ordering_pred_ ## NAME(x));                  \
79951 +       spin_lock_ ## NAME ## _no_ord(x, 0, 0);                                 \
79952 +}                                                                              \
79953 +                                                                               \
79954 +/* Try to obtain lock @x. On success, returns 1 with @x locked.           */   \
79955 +/* If @x is already locked, return 0 immediately.                         */   \
79956 +static inline int  spin_trylock_ ## NAME (TYPE *x)                             \
79957 +{                                                                              \
79958 +       if (spin_trylock (& x->FIELD.lock)) {                                   \
79959 +               GETCPU(cpu);                                                    \
79960 +               spin_ ## NAME ## _inc();                                        \
79961 +               PREG_IN(cpu,                                                    \
79962 +                       &pregion_spin_ ## NAME ## _held, &x->FIELD.held, 0);    \
79963 +               PUTCPU(cpu);                                                    \
79964 +               return 1;                                                       \
79965 +       }                                                                       \
79966 +       return 0;                                                               \
79967 +}                                                                              \
79968 +                                                                               \
79969 +/* Unlock @x.                                                             */   \
79970 +static inline void spin_unlock_ ## NAME (TYPE *x)                              \
79971 +{                                                                              \
79972 +       __ODCA("nikita-1375", LOCK_CNT_GTZ(spin_locked_ ## NAME));              \
79973 +       __ODCA("nikita-1376", LOCK_CNT_GTZ(spin_locked > 0));                   \
79974 +       __ODCA("nikita-2703", spin_ ## NAME ## _is_locked(x));                  \
79975 +                                                                               \
79976 +       spin_ ## NAME ## _dec();                                                \
79977 +       spin_unlock (& x->FIELD.lock);                                          \
79978 +       PREG_EX(get_cpu(), &pregion_spin_ ## NAME ## _held);                    \
79979 +}                                                                              \
79980 +                                                                               \
79981 +typedef struct { int foo; } NAME ## _spin_dummy
79982 +
79983 +/*
79984 + * Helper macro (a GNU statement expression) to perform a simple operation
79985 + * that requires taking a spin lock.
79986 + *
79987 + * 1. Acquire spin lock on object @obj; @obj_type selects the lock functions
79988 + *    spin_lock_<obj_type>_at() and spin_unlock_<obj_type>().
79989 + * 2. Execute @exp under spin lock, and store result.
79990 + *
79991 + * 3. Release spin lock.
79992 + *
79993 + * 4. Return result of @exp.
79994 + *
79995 + * Example:
79996 + *
79997 + * right_delimiting_key = UNDER_SPIN(dk, current_tree, *znode_get_rd_key(node));
79998 + *
79999 + */
80000 +#define UNDER_SPIN(obj_type, obj, exp)                                         \
80001 +({                                                                             \
80002 +       typeof (obj) __obj;                                                     \
80003 +       typeof (exp) __result;                                                  \
80004 +       LOCKSITE_INIT(__hits_trying);                                           \
80005 +       LOCKSITE_INIT(__hits_held);                                             \
80006 +       /* evaluate @obj once; the same value is locked and unlocked */         \
80007 +       __obj = (obj);                                                          \
80008 +       __ODCA("nikita-2492", __obj != NULL);                                   \
80009 +       spin_lock_ ## obj_type ## _at (__obj, &__hits_trying, &__hits_held);    \
80010 +       __result = exp;                                                         \
80011 +       spin_unlock_ ## obj_type (__obj);                                       \
80012 +       __result;                                                               \
80013 +})
80014 +
80015 +/*
80016 + * The same as UNDER_SPIN, but @exp's result is neither stored nor returned.
80017 + */
80018 +#define UNDER_SPIN_VOID(obj_type, obj, exp)                                    \
80019 +({                                                                             \
80020 +       typeof (obj) __obj;                                                     \
80021 +       LOCKSITE_INIT(__hits_trying);                                           \
80022 +       LOCKSITE_INIT(__hits_held);                                             \
80023 +       /* evaluate @obj once; the same value is locked and unlocked */         \
80024 +       __obj = (obj);                                                          \
80025 +       __ODCA("nikita-2492", __obj != NULL);                                   \
80026 +       spin_lock_ ## obj_type ## _at (__obj, &__hits_trying, &__hits_held);    \
80027 +       exp;                                                                    \
80028 +       spin_unlock_ ## obj_type (__obj);                                       \
80029 +})
80030 +
80031 +
80032 +/* Define several inline functions for each type of read write lock. This is
80033 + * an insanely long macro definition. */
80034 +#define RW_LOCK_FUNCTIONS(NAME,TYPE,FIELD)                                     \
80035 +/* NAME: suffix of generated names; TYPE: object type; FIELD: lock member */   \
80036 +DECLARE_RW_PROFREGIONS(NAME)                                                   \
80037 +                                                                               \
80038 +/* Zero @x->FIELD and initialize the read write lock embedded into it.    */   \
80039 +static inline void rw_ ## NAME ## _init(TYPE *x)                               \
80040 +{                                                                              \
80041 +       __ODCA("nikita-2988", x != NULL);                                       \
80042 +       memset(& x->FIELD, 0, sizeof x->FIELD);                                 \
80043 +       rwlock_init(& x->FIELD.lock);                                           \
80044 +}                                                                              \
80045 +                                                                               \
80046 +/* True, if @x is read locked by the -current- thread.                    */   \
80047 +static inline int  rw_ ## NAME ## _is_read_locked (const TYPE *x)              \
80048 +{                                                                              \
80049 +       return check_is_read_locked (& x->FIELD.lock);                          \
80050 +}                                                                              \
80051 +                                                                               \
80052 +/* True, if @x is write locked by the -current- thread.                   */   \
80053 +static inline int  rw_ ## NAME ## _is_write_locked (const TYPE *x)             \
80054 +{                                                                              \
80055 +       return check_is_write_locked (& x->FIELD.lock);                         \
80056 +}                                                                              \
80057 +                                                                               \
80058 +/* True, if @x is not read locked by the -current- thread.                */   \
80059 +static inline int  rw_ ## NAME ## _is_not_read_locked (TYPE *x)                        \
80060 +{                                                                              \
80061 +       return check_is_not_read_locked (& x->FIELD.lock);                      \
80062 +}                                                                              \
80063 +                                                                               \
80064 +/* True, if @x is not write locked by the -current- thread.               */   \
80065 +static inline int  rw_ ## NAME ## _is_not_write_locked (TYPE *x)               \
80066 +{                                                                              \
80067 +       return check_is_not_write_locked (& x->FIELD.lock);                     \
80068 +}                                                                              \
80069 +                                                                               \
80070 +/* True, if @x is either read or write locked by the -current- thread.    */   \
80071 +static inline int  rw_ ## NAME ## _is_locked (const TYPE *x)                   \
80072 +{                                                                              \
80073 +       return check_is_read_locked (& x->FIELD.lock) ||                        \
80074 +              check_is_write_locked (& x->FIELD.lock);                         \
80075 +}                                                                              \
80076 +                                                                               \
80077 +/* True, if @x is neither read nor write locked by the -current- thread.  */   \
80078 +static inline int  rw_ ## NAME ## _is_not_locked (const TYPE *x)               \
80079 +{                                                                              \
80080 +       return check_is_not_read_locked (& x->FIELD.lock) &&                    \
80081 +              check_is_not_write_locked (& x->FIELD.lock);                     \
80082 +}                                                                              \
80083 +                                                                               \
80084 +/* Helper for the read lock acquiring function below: bump lock counters. */   \
80085 +static inline void read_ ## NAME ## _inc(void)                                 \
80086 +{                                                                              \
80087 +       LOCK_CNT_INC(read_locked_ ## NAME);                                     \
80088 +       LOCK_CNT_INC(rw_locked_ ## NAME);                                       \
80089 +       LOCK_CNT_INC(spin_locked);                                              \
80090 +}                                                                              \
80091 +                                                                               \
80092 +/* Helper for read_unlock_NAME() below: drop counters of read_NAME_inc(). */   \
80093 +static inline void read_ ## NAME ## _dec(void)                                 \
80094 +{                                                                              \
80095 +       LOCK_CNT_DEC(read_locked_ ## NAME);                                     \
80096 +       LOCK_CNT_DEC(rw_locked_ ## NAME);                                       \
80097 +       LOCK_CNT_DEC(spin_locked);                                              \
80098 +}                                                                              \
80099 +                                                                               \
80100 +/* Helper for the write lock acquiring functions: bump lock counters.     */   \
80101 +static inline void write_ ## NAME ## _inc(void)                                        \
80102 +{                                                                              \
80103 +       LOCK_CNT_INC(write_locked_ ## NAME);                                    \
80104 +       LOCK_CNT_INC(rw_locked_ ## NAME);                                       \
80105 +       LOCK_CNT_INC(spin_locked);                                              \
80106 +}                                                                              \
80107 +                                                                               \
80108 +/* Helper for write_unlock_NAME(): drop counters of write_NAME_inc().     */   \
80109 +static inline void write_ ## NAME ## _dec(void)                                        \
80110 +{                                                                              \
80111 +       LOCK_CNT_DEC(write_locked_ ## NAME);                                    \
80112 +       LOCK_CNT_DEC(rw_locked_ ## NAME);                                       \
80113 +       LOCK_CNT_DEC(spin_locked);                                              \
80114 +}                                                                              \
80115 +                                                                               \
80116 +/* Acquire read lock on @x without checking lock ordering predicates.     */   \
80117 +/* This is useful when, for example, locking just created object.         */   \
80118 +static inline void read_lock_ ## NAME ## _no_ord (TYPE *x,                     \
80119 +                                                 locksite *t, locksite *h)     \
80120 +{                                                                              \
80121 +       GETCPU(cpu);                                                            \
80122 +       __ODCA("nikita-2976", rw_ ## NAME ## _is_not_read_locked(x));           \
80123 +       PREG_IN(cpu, &pregion_rw_ ## NAME ## _r_trying, &x->FIELD.r_trying, t); \
80124 +       read_lock(&x->FIELD.lock);                                              \
80125 +       PREG_REPLACE(cpu, &pregion_rw_ ## NAME ## _r_held,                      \
80126 +                    &x->FIELD.r_held, h);                                      \
80127 +       PUTCPU(cpu);                                                            \
80128 +       read_ ## NAME ## _inc();                                                \
80129 +}                                                                              \
80130 +                                                                               \
80131 +/* Acquire write lock on @x without checking lock ordering predicates.    */   \
80132 +/* This is useful when, for example, locking just created object.         */   \
80133 +static inline void write_lock_ ## NAME ## _no_ord (TYPE *x,                    \
80134 +                                                  locksite *t, locksite *h)    \
80135 +{                                                                              \
80136 +       GETCPU(cpu);                                                            \
80137 +       __ODCA("nikita-2977", rw_ ## NAME ## _is_not_write_locked(x));          \
80138 +       PREG_IN(cpu, &pregion_rw_ ## NAME ## _w_trying, &x->FIELD.w_trying, t); \
80139 +       write_lock(&x->FIELD.lock);                                             \
80140 +       PREG_REPLACE(cpu, &pregion_rw_ ## NAME ## _w_held,                      \
80141 +                    &x->FIELD.w_held, h);                                      \
80142 +       PUTCPU(cpu);                                                            \
80143 +       write_ ## NAME ## _inc();                                               \
80144 +}                                                                              \
80145 +                                                                               \
80146 +/* Read lock @x with explicit indication of spin lock profiling "sites".  */   \
80147 +/* See spin_lock_foo_at() above for more information.                     */   \
80148 +static inline void read_lock_ ## NAME ## _at (TYPE *x,                                 \
80149 +                                             locksite *t, locksite *h)         \
80150 +{                                                                              \
80151 +       __ODCA("nikita-2975", rw_ordering_pred_ ## NAME(x));                    \
80152 +       read_lock_ ## NAME ## _no_ord(x, t, h);                                 \
80153 +}                                                                              \
80154 +                                                                               \
80155 +/* Write lock @x with explicit indication of spin lock profiling "sites". */   \
80156 +/* See spin_lock_foo_at() above for more information.                     */   \
80157 +static inline void write_lock_ ## NAME ## _at (TYPE *x,                                \
80158 +                                              locksite *t, locksite *h)        \
80159 +{                                                                              \
80160 +       __ODCA("nikita-2978", rw_ordering_pred_ ## NAME(x));                    \
80161 +       write_lock_ ## NAME ## _no_ord(x, t, h);                                \
80162 +}                                                                              \
80163 +                                                                               \
80164 +/* Read lock @x.                                                          */   \
80165 +static inline void read_lock_ ## NAME (TYPE *x)                                        \
80166 +{                                                                              \
80167 +       __ODCA("nikita-2975", rw_ordering_pred_ ## NAME(x));                    \
80168 +       read_lock_ ## NAME ## _no_ord(x, 0, 0);                                 \
80169 +}                                                                              \
80170 +                                                                               \
80171 +/* Write lock @x.                                                         */   \
80172 +static inline void write_lock_ ## NAME (TYPE *x)                               \
80173 +{                                                                              \
80174 +       __ODCA("nikita-2978", rw_ordering_pred_ ## NAME(x));                    \
80175 +       write_lock_ ## NAME ## _no_ord(x, 0, 0);                                \
80176 +}                                                                              \
80177 +/* Release read lock on @x.                                               */   \
80178 +/* NOTE(review): confirm PREG_EX() balances the get_cpu() call below.     */   \
80179 +static inline void read_unlock_ ## NAME (TYPE *x)                              \
80180 +{                                                                              \
80181 +       __ODCA("nikita-2979", LOCK_CNT_GTZ(read_locked_ ## NAME));              \
80182 +       __ODCA("nikita-2980", LOCK_CNT_GTZ(rw_locked_ ## NAME));                \
80183 +       __ODCA("nikita-2980", LOCK_CNT_GTZ(spin_locked));                       \
80184 +       read_ ## NAME ## _dec();                                                \
80185 +       __ODCA("nikita-2703", rw_ ## NAME ## _is_read_locked(x));               \
80186 +       read_unlock (& x->FIELD.lock);                                          \
80187 +       PREG_EX(get_cpu(), &pregion_rw_ ## NAME ## _r_held);                    \
80188 +}                                                                              \
80189 +/* Release write lock on @x.                                              */   \
80190 +/* NOTE(review): confirm PREG_EX() balances the get_cpu() call below.     */   \
80191 +static inline void write_unlock_ ## NAME (TYPE *x)                             \
80192 +{                                                                              \
80193 +       __ODCA("nikita-2979", LOCK_CNT_GTZ(write_locked_ ## NAME));             \
80194 +       __ODCA("nikita-2980", LOCK_CNT_GTZ(rw_locked_ ## NAME));                \
80195 +       __ODCA("nikita-2980", LOCK_CNT_GTZ(spin_locked));                       \
80196 +       write_ ## NAME ## _dec();                                               \
80197 +       __ODCA("nikita-2703", rw_ ## NAME ## _is_write_locked(x));              \
80198 +       write_unlock (& x->FIELD.lock);                                         \
80199 +       PREG_EX(get_cpu(), &pregion_rw_ ## NAME ## _w_held);                    \
80200 +}                                                                              \
80201 +                                                                               \
80202 +/* Try to obtain write lock on @x. On success, returns 1 with @x locked.  */   \
80203 +/* If @x is already locked, return 0 immediately.                         */   \
80204 +static inline int  write_trylock_ ## NAME (TYPE *x)                            \
80205 +{                                                                              \
80206 +       if (write_trylock (& x->FIELD.lock)) {                                  \
80207 +               GETCPU(cpu);                                                    \
80208 +               PREG_IN(cpu, &pregion_rw_ ## NAME ## _w_held,                   \
80209 +                       &x->FIELD.w_held, 0);                                   \
80210 +               PUTCPU(cpu);                                                    \
80211 +               write_ ## NAME ## _inc();                                       \
80212 +               return 1;                                                       \
80213 +       }                                                                       \
80214 +       return 0;                                                               \
80215 +}                                                                              \
80216 +/* Closing typedef, presumably so that an invocation can end with ';'.    */   \
80217 +                                                                               \
80218 +typedef struct { int foo; } NAME ## _rw_dummy
80219 +
80220 +/*
80221 + * Helper macro (a GNU statement expression) to perform a simple operation
80222 + * that requires taking a read write lock.
80223 + *
80224 + * 1. Acquire read or write (depending on @rw parameter) lock on object @obj;
80225 + * @obj_type selects <rw>_lock_<obj_type>_at() and <rw>_unlock_<obj_type>().
80226 + *
80227 + * 2. Execute @exp under lock, and store result.
80228 + *
80229 + * 3. Release lock.
80230 + *
80231 + * 4. Return result of @exp.
80232 + *
80233 + * Example:
80234 + *
80235 + * tree_height = UNDER_RW(tree, current_tree, read, current_tree->height);
80236 + */
80237 +#define UNDER_RW(obj_type, obj, rw, exp)                               \
80238 +({                                                                     \
80239 +       typeof (obj) __obj;                                             \
80240 +       typeof (exp) __result;                                          \
80241 +       LOCKSITE_INIT(__hits_t);                                        \
80242 +       LOCKSITE_INIT(__hits_h);                                        \
80243 +       /* evaluate @obj once; same value is locked and unlocked */     \
80244 +       __obj = (obj);                                                  \
80245 +       __ODCA("nikita-2981", __obj != NULL);                           \
80246 +       rw ## _lock_ ## obj_type ## _at (__obj, &__hits_t, &__hits_h);  \
80247 +       __result = exp;                                                 \
80248 +       rw ## _unlock_ ## obj_type (__obj);                             \
80249 +       __result;                                                       \
80250 +})
80251 +
80252 +/*
80253 + * The same as UNDER_RW, but @exp's result is neither stored nor returned.
80254 + */
80255 +#define UNDER_RW_VOID(obj_type, obj, rw, exp)                          \
80256 +({                                                                     \
80257 +       typeof (obj) __obj;                                             \
80258 +       LOCKSITE_INIT(__hits_t);                                        \
80259 +       LOCKSITE_INIT(__hits_h);                                        \
80260 +       /* evaluate @obj once; same value is locked and unlocked */     \
80261 +       __obj = (obj);                                                  \
80262 +       __ODCA("nikita-2982", __obj != NULL);                           \
80263 +       rw ## _lock_ ## obj_type ## _at (__obj, &__hits_t, &__hits_h);  \
80264 +       exp;                                                            \
80265 +       rw ## _unlock_ ## obj_type (__obj);                             \
80266 +})
80267 +
80268 +#if REISER4_LOCKPROF
80269 +
80270 +/*
80271 + * Wrapper macros to work with locks of certain reiser4 objects. These
80272 + * macros allow tracking where in code locks are held (or tried) for the
80273 + * longest time.
80274 + */
80275 +
80276 +#define LOCK_JNODE(node)                               \
80277 +({                                                     \
80278 +       LOCKSITE_INIT(__hits_t);                        \
80279 +       LOCKSITE_INIT(__hits_h);                        \
80280 +       /* t: "trying" site, h: "held" site */          \
80281 +       spin_lock_jnode_at(node, &__hits_t, &__hits_h); \
80282 +})
80283 +
80284 +#define LOCK_JLOAD(node)                               \
80285 +({                                                     \
80286 +       LOCKSITE_INIT(__hits_t);                        \
80287 +       LOCKSITE_INIT(__hits_h);                        \
80288 +                                                       \
80289 +       spin_lock_jload_at(node, &__hits_t, &__hits_h); \
80290 +})
80291 +
80292 +#define LOCK_ATOM(atom)                                        \
80293 +({                                                     \
80294 +       LOCKSITE_INIT(__hits_t);                        \
80295 +       LOCKSITE_INIT(__hits_h);                        \
80296 +                                                       \
80297 +       spin_lock_atom_at(atom, &__hits_t, &__hits_h);  \
80298 +})
80299 +
80300 +#define LOCK_TXNH(txnh)                                        \
80301 +({                                                     \
80302 +       LOCKSITE_INIT(__hits_t);                        \
80303 +       LOCKSITE_INIT(__hits_h);                        \
80304 +                                                       \
80305 +       spin_lock_txnh_at(txnh, &__hits_t, &__hits_h);  \
80306 +})
80307 +
80308 +#define LOCK_INODE(inode)                                      \
80309 +({                                                             \
80310 +       LOCKSITE_INIT(__hits_t);                                \
80311 +       LOCKSITE_INIT(__hits_h);                                \
80312 +                                                               \
80313 +       spin_lock_inode_object_at(inode, &__hits_t, &__hits_h); \
80314 +})
80315 +
80316 +#define RLOCK_TREE(tree)                               \
80317 +({                                                     \
80318 +       LOCKSITE_INIT(__hits_t);                        \
80319 +       LOCKSITE_INIT(__hits_h);                        \
80320 +                                                       \
80321 +       read_lock_tree_at(tree, &__hits_t, &__hits_h);  \
80322 +})
80323 +
80324 +#define WLOCK_TREE(tree)                               \
80325 +({                                                     \
80326 +       LOCKSITE_INIT(__hits_t);                        \
80327 +       LOCKSITE_INIT(__hits_h);                        \
80328 +                                                       \
80329 +       write_lock_tree_at(tree, &__hits_t, &__hits_h); \
80330 +})
80331 +
80332 +#define RLOCK_DK(tree)                                 \
80333 +({                                                     \
80334 +       LOCKSITE_INIT(__hits_t);                        \
80335 +       LOCKSITE_INIT(__hits_h);                        \
80336 +                                                       \
80337 +       read_lock_dk_at(tree, &__hits_t, &__hits_h);    \
80338 +})
80339 +
80340 +#define WLOCK_DK(tree)                                 \
80341 +({                                                     \
80342 +       LOCKSITE_INIT(__hits_t);                        \
80343 +       LOCKSITE_INIT(__hits_h);                        \
80344 +                                                       \
80345 +       write_lock_dk_at(tree, &__hits_t, &__hits_h);   \
80346 +})
80347 +
80348 +#define RLOCK_ZLOCK(lock)                              \
80349 +({                                                     \
80350 +       LOCKSITE_INIT(__hits_t);                        \
80351 +       LOCKSITE_INIT(__hits_h);                        \
80352 +                                                       \
80353 +       read_lock_zlock_at(lock, &__hits_t, &__hits_h); \
80354 +})
80355 +
80356 +#define WLOCK_ZLOCK(lock)                              \
80357 +({                                                     \
80358 +       LOCKSITE_INIT(__hits_t);                        \
80359 +       LOCKSITE_INIT(__hits_h);                        \
80360 +                                                       \
80361 +       write_lock_zlock_at(lock, &__hits_t, &__hits_h);        \
80362 +})
80363 +
80364 +
80365 +#else /* !REISER4_LOCKPROF: plain lock calls, no per-call-site locksites */
80366 +#define LOCK_JNODE(node) spin_lock_jnode(node)
80367 +#define LOCK_JLOAD(node) spin_lock_jload(node)
80368 +#define LOCK_ATOM(atom) spin_lock_atom(atom)
80369 +#define LOCK_TXNH(txnh) spin_lock_txnh(txnh)
80370 +#define LOCK_INODE(inode) spin_lock_inode_object(inode)
80371 +#define RLOCK_TREE(tree) read_lock_tree(tree)
80372 +#define WLOCK_TREE(tree) write_lock_tree(tree)
80373 +#define RLOCK_DK(tree) read_lock_dk(tree)
80374 +#define WLOCK_DK(tree) write_lock_dk(tree)
80375 +#define RLOCK_ZLOCK(lock) read_lock_zlock(lock)
80376 +#define WLOCK_ZLOCK(lock) write_lock_zlock(lock)
80377 +#endif /* REISER4_LOCKPROF */
80378 +/* Unlock wrappers; defined the same way with and without REISER4_LOCKPROF. */
80379 +#define UNLOCK_JNODE(node) spin_unlock_jnode(node)
80380 +#define UNLOCK_JLOAD(node) spin_unlock_jload(node)
80381 +#define UNLOCK_ATOM(atom) spin_unlock_atom(atom)
80382 +#define UNLOCK_TXNH(txnh) spin_unlock_txnh(txnh)
80383 +#define UNLOCK_INODE(inode) spin_unlock_inode_object(inode)
80384 +#define RUNLOCK_TREE(tree) read_unlock_tree(tree)
80385 +#define WUNLOCK_TREE(tree) write_unlock_tree(tree)
80386 +#define RUNLOCK_DK(tree) read_unlock_dk(tree)
80387 +#define WUNLOCK_DK(tree) write_unlock_dk(tree)
80388 +#define RUNLOCK_ZLOCK(lock) read_unlock_zlock(lock)
80389 +#define WUNLOCK_ZLOCK(lock) write_unlock_zlock(lock)
80390 +
80391 +/* __SPIN_MACROS_H__ */
80392 +#endif
80393 +
80394 +/* Make Linus happy.
80395 +   Local variables:
80396 +   c-indentation-style: "K&R"
80397 +   mode-name: "LC"
80398 +   c-basic-offset: 8
80399 +   tab-width: 8
80400 +   fill-column: 120
80401 +   scroll-step: 1
80402 +   End:
80403 +*/
80404 diff -rupN linux-2.6.8-rc3/fs/reiser4/spinprof.c linux-2.6.8-rc3-a/fs/reiser4/spinprof.c
80405 --- linux-2.6.8-rc3/fs/reiser4/spinprof.c       1970-01-01 03:00:00.000000000 +0300
80406 +++ linux-2.6.8-rc3-a/fs/reiser4/spinprof.c     2004-08-05 21:20:53.008676244 +0400
80407 @@ -0,0 +1,567 @@
80408 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
80409 + * reiser4/README */
80410 +
80411 +/* spin lock profiling */
80412 +
80413 +/*
80414 + * Spin-lock profiling code.
80415 + *
80416 + * Basic notion in our profiling code is "profiling region" (struct
80417 + * profregion). Profiling region is entered and left by calling
80418 + * profregion_in() and profregion_ex() function correspondingly. It is invalid
80419 + * to be preempted (voluntary or not) while inside profiling region. Profiling
80420 + * regions can be entered recursively, and it is not necessary to nest them
80421 + * properly, that is
80422 + *
80423 + *     profregion_in(&A);
80424 + *     profregion_in(&B);
80425 + *     profregion_ex(&A);
80426 + *     profregion_ex(&B);
80427 + *
80428 + * is valid sequence of operations. Each CPU maintains an array of currently
80429 + * active profiling regions. This array is consulted by clock interrupt
80430 + * handler, and counters in the profiling regions found active by handler are
80431 + * incremented. This allows one to estimate for how long region has been
80432 + * active on average. Spin-locking code in spin_macros.h uses this to measure
80433 + * spin-lock contention. Specifically two profiling regions are defined for
80434 + * each spin-lock type: one is active while thread is trying to acquire
80435 + * lock, and another when it holds the lock.
80436 + *
80437 + * Profiling regions export their statistics in the sysfs, under special
80438 + * directory /sys/profregion.
80439 + *
80440 + * Each profregion is represented as a child directory
80441 + * /sys/profregion/foo. Internally it's represented as a struct kobject (viz
80442 + * one embedded into struct profregion, see spinprof.h).
80443 + *
80444 + * Each /sys/profregion/foo directory contains files representing fields in
80445 + * profregion:
80446 + *
80447 + *     hits
80448 + *     busy
80449 + *     obj
80450 + *     objhit
80451 + *     code
80452 + *     codehit
80453 + *
80454 + * See spinprof.h for details.
80455 + *
80456 + *
80457 + */
80458 +
80459 +#include "kattr.h"
80460 +#include "spinprof.h"
80461 +#include "debug.h"
80462 +
80463 +#include <linux/percpu.h>
80464 +#include <linux/notifier.h>
80465 +#include <linux/kallsyms.h>
80466 +
80467 +#include <asm/irq.h>
80468 +#include <asm/ptrace.h> /* for instruction_pointer() */
80469 +
80470 +#if REISER4_LOCKPROF
80471 +
80472 +/*
80473 + * helper macro: how many bytes left in the PAGE_SIZE buffer, starting at @buf
80474 + * and used up to and including @p.
80475 + */
80476 +#define LEFT(p, buf) (PAGE_SIZE - ((p) - (buf)) - 1)
80477 +
80478 +void profregion_functions_start_here(void);
80479 +void profregion_functions_end_here(void);
80480 +
80481 +static locksite none = {
80482 +       .hits = STATCNT_INIT,
80483 +       .func = "",
80484 +       .line = 0
80485 +};
80486 +
80487 +/*
80488 + * sysfs holder.
80489 + */
80490 +struct profregion_attr {
80491 +       struct attribute attr;
80492 +       ssize_t (*show)(struct profregion *pregion, char *buf);
80493 +};
80494 +
80495 +/*
80496 + * macro to define profregion_attr for the given profregion
80497 + */
80498 +#define PROFREGION_ATTR(aname)                 \
80499 +static struct profregion_attr aname = {                \
80500 +       .attr = {                               \
80501 +               .name = (char *)#aname,         \
80502 +               .mode = 0666                    \
80503 +       },                                      \
80504 +       .show = aname ## _show                  \
80505 +}
80506 +
80507 +/*
80508 + * "hits" attribute ->show(): print summed hits counter of @pregion into @buf.
80509 + */
80510 +static ssize_t hits_show(struct profregion *pregion, char *buf)
80511 +{
80512 +       char *out = buf;
80513 +       KATTR_PRINT(out, buf, "%li\n", statcnt_get(&pregion->hits));
80514 +       return out - buf;
80515 +}
80516 +
80517 +/*
80518 + * "busy" attribute ->show(): print summed busy counter of @pregion into @buf.
80519 + */
80520 +static ssize_t busy_show(struct profregion *pregion, char *buf)
80521 +{
80522 +       char *out = buf;
80523 +       KATTR_PRINT(out, buf, "%li\n", statcnt_get(&pregion->busy));
80524 +       return out - buf;
80525 +}
80526 +
80527 +/*
80528 + * "obj" attribute ->show(): print the ->obj pointer of @pregion into @buf.
80529 + */
80530 +static ssize_t obj_show(struct profregion *pregion, char *buf)
80531 +{
80532 +       char *out = buf;
80533 +       KATTR_PRINT(out, buf, "%p\n", pregion->obj);
80534 +       return out - buf;
80535 +}
80536 +
80537 +/*
80538 + * "objhit" attribute ->show(): print ->objhit of @pregion into @buf.
80539 + */
80540 +static ssize_t objhit_show(struct profregion *pregion, char *buf)
80541 +{
80542 +       char *out = buf;
80543 +       KATTR_PRINT(out, buf, "%i\n", pregion->objhit);
80544 +       return out - buf;
80545 +}
80546 +
80547 +/*
80548 + * "code" attribute ->show(): print "function:line" of the ->code lock site.
80549 + */
80550 +static ssize_t code_show(struct profregion *pregion, char *buf)
80551 +{
80552 +       /* fall back to the empty "none" site when ->code is not set */
80553 +       locksite *where = pregion->code ? : &none;
80554 +       char *out = buf;
80555 +
80556 +       KATTR_PRINT(out, buf, "%s:%i\n", where->func, where->line);
80557 +       return out - buf;
80558 +}
80559 +
80560 +/*
80561 + * "codehit" attribute ->show(): print ->codehit of @pregion into @buf.
80562 + */
80563 +static ssize_t codehit_show(struct profregion *pregion, char *buf)
80564 +{
80565 +       char *out = buf;
80566 +       KATTR_PRINT(out, buf, "%i\n", pregion->codehit);
80567 +       return out - buf;
80568 +}
80569 +
80570 +PROFREGION_ATTR(hits);
80571 +PROFREGION_ATTR(busy);
80572 +PROFREGION_ATTR(obj);
80573 +PROFREGION_ATTR(objhit);
80574 +PROFREGION_ATTR(code);
80575 +PROFREGION_ATTR(codehit);
80576 +
80577 +/*
80578 + * sysfs entry point for reads: recover the profregion from @kobj and the
80579 + * profregion_attr from @attr, then delegate to that attribute's ->show().
80580 + */
80581 +static ssize_t
80582 +profregion_show(struct kobject * kobj, struct attribute *attr, char *buf)
80583 +{
80584 +       struct profregion *region =
80585 +               container_of(kobj, struct profregion, kobj);
80586 +       struct profregion_attr *which =
80587 +               container_of(attr, struct profregion_attr, attr);
80588 +
80589 +       /* per-attribute formatter defined by PROFREGION_ATTR() */
80590 +       return which->show(region, buf);
80591 +}
80592 +
80593 +/*
80594 + * ->store() method for profregion sysfs object. Written data are ignored:
80595 + * any write just resets the statistics of the region and returns @size.
80596 + */
80597 +static ssize_t profregion_store(struct kobject * kobj,
80598 +                               struct attribute * attr UNUSED_ARG,
80599 +                               const char * buf UNUSED_ARG,
80600 +                               size_t size)
80601 +{
80602 +       struct profregion *region = container_of(kobj, struct profregion, kobj);
80603 +
80604 +       /* zero both tick counters, then forget recorded object and site */
80605 +       statcnt_reset(&region->hits);
80606 +       statcnt_reset(&region->busy);
80607 +       region->obj     = NULL;
80608 +       region->objhit  = 0;
80609 +       region->code    = NULL;
80610 +       region->codehit = 0;
80611 +       return size;
80612 +}
80613 +
80614 +/*
80615 + * sysfs attribute operations vector...
80616 + */
80617 +static struct sysfs_ops profregion_attr_ops = {
80618 +       .show  = profregion_show,
80619 +       .store = profregion_store
80620 +};
80621 +
80622 +/*
80623 + * ...and attributes themselves.
80624 + */
80625 +static struct attribute * def_attrs[] = {
80626 +       &hits.attr,
80627 +       &busy.attr,
80628 +       &obj.attr,
80629 +       &objhit.attr,
80630 +       &code.attr,
80631 +       &codehit.attr,
80632 +       NULL
80633 +};
80634 +
80635 +/*
80636 + * ktype for kobjects representing profregions.
80637 + */
80638 +static struct kobj_type ktype_profregion = {
80639 +       .sysfs_ops      = &profregion_attr_ops,
80640 +       .default_attrs  = def_attrs,
80641 +};
80642 +
80643 +/*
80644 + * sysfs object for /sys/profregion
80645 + */
80646 +static decl_subsys(profregion, &ktype_profregion, NULL);
80647 +
80648 +/*
80649 + * profregionstack for each CPU
80650 + */
80651 +DEFINE_PER_CPU(struct profregionstack, inregion) = {0};
80652 +
80653 +/*
80654 + * profregion meaning "no other profregion is active"
80655 + */
80656 +struct profregion outside = {
80657 +       .hits = STATCNT_INIT,
80658 +       .kobj = {
80659 +               .name = "outside"
80660 +       }
80661 +};
80662 +
80663 +/*
80664 + * profregion meaning "we are in reiser4 context, but no locks are held"
80665 + */
80666 +struct profregion incontext = {
80667 +       .hits = STATCNT_INIT,
80668 +       .kobj = {
80669 +               .name = "incontext"
80670 +       }
80671 +};
80672 +
80673 +/*
80674 + * profregion meaning "we are in profregion handling code". This is to estimate
80675 + * profregion overhead.
80676 + */
80677 +struct profregion overhead = {
80678 +       .hits = STATCNT_INIT,
80679 +       .kobj = {
80680 +               .name = "overhead"
80681 +       }
80682 +};
80683 +
80684 +extern struct profregion pregion_spin_jnode_held;
80685 +extern struct profregion pregion_spin_jnode_trying;
80686 +
80687 +/*
80688 + * This is the main profregion handling function. It is called from the clock
80689 + * interrupt handler on each tick (HZ times per second); @p is the struct
80690 + * pt_regs of the interrupted context. It determines which profregions are
80691 + * active on this CPU at the moment of the call, and updates their counters
80692 + * correspondingly. Always returns 0.
80693 + */
80694 +static int callback(struct notifier_block *self UNUSED_ARG,
80695 +                   unsigned long val UNUSED_ARG, void *p)
80696 +{
80697 +       struct profregionstack *stack;
80698 +       struct pt_regs *regs;
80699 +       unsigned long pc;
80700 +       int ntop;
80701 +
80702 +       regs = p;
80703 +       /* instruction pointer at which interrupt happened */
80704 +       pc = instruction_pointer(regs);
80705 +
80706 +       if (pc > (unsigned long)profregion_functions_start_here &&
80707 +           pc < (unsigned long)profregion_functions_end_here) {
80708 +               /* if @pc lies between the two marker functions---count it as overhead */
80709 +               statcnt_inc(&overhead.hits);
80710 +               return 0;
80711 +       }
80712 +
80713 +       stack = &get_cpu_var(inregion);
80714 +       ntop = stack->top;
80715 +       if (unlikely(ntop != 0)) { /* at least one region is active */
80716 +               struct pregactivation *act;
80717 +               struct profregion *preg;
80718 +               int hits;
80719 +
80720 +               act = &stack->stack[ntop - 1]; /* topmost activation */
80721 +               preg = act->preg;
80722 +               statcnt_inc(&preg->hits);
80723 +
80724 +               hits = 0;
80725 +               if (act->objloc != NULL) {
80726 +                       BUG_ON(*act->objloc == 0x6b6b6b6b); /* presumably poison patterns of */
80727 +                       BUG_ON(*act->objloc == 0x5a5a5a5a); /* freed/uninitialized memory -- confirm */
80728 +                       hits = ++ (*act->objloc); /* tick counter embedded into the object */
80729 +               }
80730 +               if (unlikely(hits > preg->objhit)) {
80731 +                       if (preg->obj != act->objloc) { /* a different object takes the lead */
80732 +                               preg->objhit = hits;
80733 +                               preg->obj    = act->objloc;
80734 +                               if (preg->champion != NULL)
80735 +                                       preg->champion(preg);
80736 +                       }
80737 +               }
80738 +
80739 +               hits = 0;
80740 +               if (act->codeloc != NULL) {
80741 +                       statcnt_inc(&act->codeloc->hits);
80742 +                       hits = statcnt_get(&act->codeloc->hits);
80743 +               }
80744 +               if (unlikely(hits > preg->codehit)) { /* this lock site is the busiest so far */
80745 +                       preg->codehit = hits;
80746 +                       preg->code    = act->codeloc;
80747 +               }
80748 +               for (; ntop > 0 ; --ntop) { /* every region on the stack gets a "busy" tick */
80749 +                       preg = stack->stack[ntop - 1].preg;
80750 +                       if (preg != NULL) /* NULL marks a slot vacated by profregion_ex() */
80751 +                               statcnt_inc(&preg->busy);
80752 +               }
80753 +       } else if (is_in_reiser4_context())
80754 +               statcnt_inc(&incontext.hits);
80755 +       else
80756 +               statcnt_inc(&outside.hits);
80757 +       put_cpu_var(inregion);
80758 +       return 0;
80759 +}
80760 +
80761 +/*
80762 + * notifier block used to register our callback for clock interrupt handler.
80763 + */
80764 +static struct notifier_block profregionnotifier = {
80765 +       .notifier_call = callback
80766 +};
80767 +
80768 +/* different architectures tend to declare register_profile_notifier() in
80769 + * different places */
80770 +extern int register_profile_notifier(struct notifier_block * nb);
80771 +
80772 +/*
80773 + * profregion initialization: setup sysfs things and hook into clock ticks.
80774 + * NOTE(review): registrations done before a failing step are not undone. */
80775 +int __init
80776 +profregion_init(void)
80777 +{
80778 +       int result;
80779 +
80780 +       /* register /sys/profregion */
80781 +       result = subsystem_register(&profregion_subsys);
80782 +       if (result != 0)
80783 +               return result;
80784 +
80785 +       /* register /sys/profregion/outside */
80786 +       result = profregion_register(&outside);
80787 +       if (result != 0)
80788 +               return result;
80789 +
80790 +       /* register /sys/profregion/incontext */
80791 +       result = profregion_register(&incontext);
80792 +       if (result != 0)
80793 +               return result;
80794 +
80795 +       /* register /sys/profregion/overhead */
80796 +       result = profregion_register(&overhead);
80797 +       if (result != 0)
80798 +               return result;
80799 +
80800 +       /* register our callback function to be called on each clock tick */
80801 +       return register_profile_notifier(&profregionnotifier);
80802 +}
80803 +subsys_initcall(profregion_init);
80804 +
80805 +/*
80806 + * undo profregion_init() actions. NOTE(review): profregionnotifier is not
80807 + * unregistered here -- confirm whether that is intentional. */
80808 +static void __exit
80809 +profregion_exit(void)
80810 +{
80811 +       profregion_unregister(&overhead);
80812 +       profregion_unregister(&incontext);
80813 +       profregion_unregister(&outside);
80814 +       subsystem_unregister(&profregion_subsys);
80815 +}
80816 +__exitcall(profregion_exit);
80817 +
80818 +/*
80819 + * register @pregion under /sys/profregion; returns kobject_register() result
80820 + */
80821 +int profregion_register(struct profregion *pregion)
80822 +{
80823 +       /* tell sysfs that @pregion is part of /sys/profregion "subsystem" */
80824 +       kobj_set_kset_s(pregion, profregion_subsys);
80825 +       /* and register /sys/profregion/<pregion> */
80826 +       return kobject_register(&pregion->kobj);
80827 +}
80828 +
80829 +/*
80830 + * dual to profregion_register(): remove /sys/profregion/<pregion> again
80831 + */
80832 +void profregion_unregister(struct profregion *pregion)
80833 +{
80834 +       kobject_unregister(&pregion->kobj);
80835 +}
80836 +
80837 +void profregion_functions_start_here(void) { }
80838 +
80839 +/*
80840 + * search for @pregion in @stack, starting just below the topmost slot (index
80841 + * ->top - 2) and going down. Return its index if found; BUG() otherwise.
80842 + */
80843 +int profregion_find(struct profregionstack *stack, struct profregion *pregion)
80844 +{
80845 +       int i;
80846 +
80847 +       for (i = stack->top - 2 ; i >= 0 ; -- i) {
80848 +               if (stack->stack[i].preg == pregion) {
80849 +                       return i;
80850 +               }
80851 +       }
80852 +       BUG();
80853 +       return 0;
80854 +}
80855 +
80856 +/*
80857 + * Fill @act slot. Old ->objloc/->codeloc are cleared before the new ->preg is
80858 + * stored, so callback() cannot see stale locations paired with a new region. */
80859 +void profregfill(struct pregactivation *act,
80860 +                struct profregion *pregion,
80861 +                void *objloc, locksite *codeloc)
80862 +{
80863 +       act->objloc  = NULL;
80864 +       act->codeloc = NULL;
80865 +       /* barrier is needed here, because clock interrupt can come at any
80866 +        * point, and we want our callback to see consistent data */
80867 +       barrier();
80868 +       act->preg    = pregion;
80869 +       act->objloc  = objloc;
80870 +       act->codeloc = codeloc;
80871 +}
80872 +
80873 +/*
80874 + * activate profregion @pregion on processor @cpu (push onto that cpu's stack).
80875 + * Preemption is disabled here and is still disabled on return. */
80876 +void profregion_in(int cpu, struct profregion *pregion,
80877 +                  void *objloc, locksite *codeloc)
80878 +{
80879 +       struct profregionstack *stack;
80880 +       int ntop;
80881 +
80882 +       preempt_disable();
80883 +       stack = &per_cpu(inregion, cpu);
80884 +       ntop = stack->top;
80885 +       /* check for stack overflow */
80886 +       BUG_ON(ntop == PROFREGION_MAX_DEPTH);
80887 +       /* store information about @pregion in the next free slot on the
80888 +        * stack */
80889 +       profregfill(&stack->stack[ntop], pregion, objloc, codeloc);
80890 +       /* optimization barrier: the slot is filled in before ->top is
80891 +        * bumped, because clock interrupt can come at any point, and we
80892 +        * want our callback to see consistent data */
80893 +       barrier();
80894 +       ++ stack->top;
80895 +}
80896 +
80897 +/*
80898 + * deactivate (leave) @pregion at processor @cpu. BUGs if no region is active.
80899 + */
80900 +void profregion_ex(int cpu, struct profregion *pregion)
80901 +{
80902 +       struct profregionstack *stack;
80903 +       int ntop;
80904 +
80905 +       stack = &per_cpu(inregion, cpu);
80906 +       ntop = stack->top;
80907 +       BUG_ON(ntop == 0);
80908 +       /*
80909 +        * in the usual case (when locks nest properly), @pregion uses top
80910 +        * slot of the stack. Free it and any already vacated slots below.
80911 +        */
80912 +       if(likely(stack->stack[ntop - 1].preg == pregion)) {
80913 +               do {
80914 +                       -- ntop;
80915 +               } while (ntop > 0 &&
80916 +                        stack->stack[ntop - 1].preg == NULL);
80917 +               /* put optimization barrier here */
80918 +               barrier();
80919 +               stack->top = ntop;
80920 +       } else
80921 +               /*
80922 +                * Otherwise (locks are not nested), find slot used by
80923 +                * @pregion and free it.
80924 +                */
80925 +               stack->stack[profregion_find(stack, pregion)].preg = NULL;
80926 +       preempt_enable(); /* pairs with preempt_disable() in profregion_in() */
80927 +       put_cpu(); /* NOTE(review): presumably pairs with a get_cpu() done by the caller -- confirm */
80928 +}
80929 +
80930 +/*
80931 + * overwrite the topmost activation on @cpu's stack with @pregion, i.e.,
80932 + * leave the top-level profregion and enter @pregion in one step. This is an
80933 + * optimization for the common case, when the profregion covering "trying to
80934 + * take lock X" is immediately followed by the one covering "holding lock X".
80935 + */
80936 +void profregion_replace(int cpu, struct profregion *pregion,
80937 +                       void *objloc, void *codeloc)
80938 +{
80939 +       struct profregionstack *stack = &per_cpu(inregion, cpu);
80940 +       struct pregactivation *slot;
80941 +
80942 +       /* there must be an activation to replace */
80943 +       BUG_ON(stack->top == 0);
80944 +       slot = &stack->stack[stack->top - 1];
80945 +       profregfill(slot, pregion, objloc, codeloc);
80946 +}
80947 +
80948 +void profregion_functions_end_here(void) { }
80949 +
80950 +/* REISER4_LOCKPROF */
80951 +#else
80952 +
80953 +#if defined(CONFIG_REISER4_NOOPT) || defined(CONFIG_KGDB)
80954 +
80955 +locksite __hits;
80956 +locksite __hits_h;
80957 +locksite __hits_t;
80958 +locksite __hits_held;
80959 +locksite __hits_trying;
80960 +
80961 +#endif /* CONFIG_REISER4_NOOPT */
80962 +
80963 +/* REISER4_LOCKPROF */
80964 +#endif
80965 +
80966 +/* Make Linus happy.
80967 +   Local variables:
80968 +   c-indentation-style: "K&R"
80969 +   mode-name: "LC"
80970 +   c-basic-offset: 8
80971 +   tab-width: 8
80972 +   fill-column: 120
80973 +   End:
80974 +*/
80975 diff -rupN linux-2.6.8-rc3/fs/reiser4/spinprof.h linux-2.6.8-rc3-a/fs/reiser4/spinprof.h
80976 --- linux-2.6.8-rc3/fs/reiser4/spinprof.h       1970-01-01 03:00:00.000000000 +0300
80977 +++ linux-2.6.8-rc3-a/fs/reiser4/spinprof.h     2004-08-05 21:20:53.430587253 +0400
80978 @@ -0,0 +1,131 @@
80979 +/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by
80980 + * reiser4/README */
80981 +
80982 +/* spin lock profiling. See spinprof.c for comments. */
80983 +
80984 +#ifndef __SPINPROF_H__
80985 +#define __SPINPROF_H__
80986 +
80987 +#include "debug.h"
80988 +#include "spin_macros.h"
80989 +#include "statcnt.h"
80990 +
80991 +#include <linux/config.h>
80992 +#include <linux/profile.h>
80993 +#include <linux/kobject.h>
80994 +
80995 +#if REISER4_LOCKPROF
80996 +
80997 +/* maximal number of profiling regions that can be active at the same time */
80998 +#define PROFREGION_MAX_DEPTH (12)
80999 +
81000 +typedef struct percpu_counter scnt_t;
81001 +
81002 +/* spin-locking code uses this to identify the place in the code where a
81003 + * particular call to a locking function is made. */
81004 +typedef struct locksite {
81005 +       statcnt_t   hits;   /* number of times profiling region that is
81006 +                            * entered at this place of code was found active
81007 +                            * by clock interrupt handler. */
81008 +       const char *func;   /* function */
81009 +       int         line;   /* line in the source file */
81010 +} locksite;
81011 +
81012 +/* macro to initialize locksite */
81013 +#define LOCKSITE_INIT(name)                    \
81014 +       static locksite name = {                \
81015 +               .hits = STATCNT_INIT,           \
81016 +               .func = __FUNCTION__,           \
81017 +               .line = __LINE__                \
81018 +       }
81019 +
81020 +/* profiling region */
81021 +struct profregion {
81022 +       /* how many times clock interrupt handler found this profiling region
81023 +        * to be at the top of array of active regions. */
81024 +       statcnt_t      hits;
81025 +       /* how many times clock interrupt handler found this profiling region
81026 +        * in active array */
81027 +       statcnt_t      busy;
81028 +       /* sysfs handle */
81029 +       struct kobject kobj;
81030 +       /* object that (so far) was observed to be locked/contended most
81031 +        * through this region */
81032 +       void          *obj;
81033 +       /* number of times ->obj's lock was requested/held while in this
81034 +        * region */
81035 +       int            objhit;
81036 +       /* place in code that (so far) was most active user of this
81037 +        * profregion */
81038 +       locksite      *code;
81039 +       /* number of times clock interrupt handler observed that ->code was in
81040 +        * this profregion */
81041 +       int            codehit;
81042 +       /*
81043 +        * optional method called when ->obj is changed. Can be used to output
81044 +        * information about most contended objects.
81045 +        */
81046 +       void (*champion)(struct profregion * preg);
81047 +};
81048 +
81049 +/*
81050 + * slot in profregionstack used when profregion is activated (that is,
81051 + * entered).
81052 + */
81053 +struct pregactivation {
81054 +       /* profiling region */
81055 +       struct profregion *preg;
81056 +       /* pointer to hits counter, embedded into object */
81057 +       int               *objloc;
81058 +       /* current lock site */
81059 +       locksite          *codeloc;
81060 +};
81061 +
81062 +/*
81063 + * Stack recording currently active profregion activations. Strictly speaking
81064 + * this is not a stack at all, because locks (and profregions) do not
81065 + * necessarily nest properly.
81066 + */
81067 +struct profregionstack {
81068 +       /* index of next free slot */
81069 +       int top;
81070 +       /* array of slots for profregion activations */
81071 +       struct pregactivation stack[PROFREGION_MAX_DEPTH];
81072 +};
81073 +
81074 +DECLARE_PER_CPU(struct profregionstack, inregion);
81075 +
81076 +extern int  profregion_register(struct profregion *pregion);
81077 +extern void profregion_unregister(struct profregion *pregion);
81078 +
81079 +extern void profregion_in(int cpu, struct profregion *pregion,
81080 +                         void *objloc, locksite *codeloc);
81081 +extern void profregion_ex(int cpu, struct profregion *pregion);
81082 +extern void profregion_replace(int cpu, struct profregion *pregion,
81083 +                              void *objloc, void *codeloc);
81084 +
81085 +/* REISER4_LOCKPROF */
81086 +#else
81087 +
81088 +struct profregionstack {};
81089 +#define profregion_register(pregion) (0)
81090 +#define profregion_unregister(pregion) noop
81091 +
81092 +typedef struct locksite {} locksite;
81093 +#define LOCKSITE_INIT(name) extern locksite name
81094 +
81095 +/* REISER4_LOCKPROF */
81096 +#endif
81097 +
81098 +/* __SPINPROF_H__ */
81099 +#endif
81100 +/* Make Linus happy.
81101 +   Local variables:
81102 +   c-indentation-style: "K&R"
81103 +   mode-name: "LC"
81104 +   c-basic-offset: 8
81105 +   tab-width: 8
81106 +   fill-column: 120
81107 +   scroll-step: 1
81108 +   End:
81109 +*/
81110 diff -rupN linux-2.6.8-rc3/fs/reiser4/statcnt.h linux-2.6.8-rc3-a/fs/reiser4/statcnt.h
81111 --- linux-2.6.8-rc3/fs/reiser4/statcnt.h        1970-01-01 03:00:00.000000000 +0300
81112 +++ linux-2.6.8-rc3-a/fs/reiser4/statcnt.h      2004-08-05 21:20:52.946689319 +0400
81113 @@ -0,0 +1,113 @@
81114 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
81115 +
81116 +/* Efficient counter for statistics collection. */
81117 +
81118 +/*
81119 + * This is write-optimized statistical counter. There is stock counter of such
81120 + * kind (include/linux/percpu_counter.h), but it is read-optimized, that is
81121 + * reads are cheap, updates are relatively expensive. We need data-structure
81122 + * for our statistical counters (stats.[ch]) that are write-mostly, that is,
81123 + * they are updated much more frequently than read.
81124 + */
81125 +
81126 +#ifndef __STATCNT_H__
81127 +#define __STATCNT_H__
81128 +
81129 +#include <linux/types.h>
81130 +#include <linux/config.h>
81131 +#include <linux/spinlock.h>
81132 +#include <linux/smp.h>
81133 +#include <linux/threads.h>
81134 +
81135 +#ifdef CONFIG_SMP
81136 +
81137 +struct __statcnt_slot {
81138 +       long count;
81139 +} ____cacheline_aligned;
81140 +
81141 +/*
81142 + * counter.
81143 + *
81144 + * This is an array of integer counters, one per each processor. Each
81145 + * individual counter is cacheline aligned, so that processors don't fight for
81146 + * cache-lines when accessing this. Such alignment makes this counter _huge_.
81147 + *
81148 + * To update counter, thread just modifies an element in array corresponding
81149 + * to its current processor.
81150 + *
81151 + * To read value of counter array is scanned and all elements are summed
81152 + * up. This means that read value can be slightly inaccurate, because scan is
81153 + * not protected by any lock, but that is it.
81154 + */
81155 +typedef struct statcnt {
81156 +       struct __statcnt_slot counters[NR_CPUS];
81157 +} statcnt_t;
81158 +
81159 +#define STATCNT_INIT                                           \
81160 +{                                                              \
81161 +       .counters = { [ 0 ... NR_CPUS - 1 ] = { .count = 0 } }  \
81162 +}
81163 +
81164 +static inline void statcnt_add(statcnt_t *cnt, int val)
81165 +{
81166 +       /* update only the slot that belongs to the processor we run on */
81167 +       struct __statcnt_slot *slot = &cnt->counters[get_cpu()];
81168 +
81169 +       slot->count += val;
81170 +       put_cpu();
81171 +}
81172 +
81173 +static inline long statcnt_get(statcnt_t *cnt)
81174 +{
81175 +       long total = 0; /* sum over all per-cpu slots, taken without locking */
81176 +       int  cpu;
81177 +
81178 +       for (cpu = 0; cpu < NR_CPUS; ++cpu)
81179 +               total += cnt->counters[cpu].count;
81180 +       return total;
81181 +}
81182 +
81183 +/* CONFIG_SMP */
81184 +#else
81185 +
81186 +typedef struct statcnt {
81187 +       long count;
81188 +} statcnt_t;
81189 +
81190 +#define STATCNT_INIT { .count = 0 }
81191 +
81192 +static inline void statcnt_add(statcnt_t *cnt, int val)
81193 +{
81194 +       cnt->count += val;
81195 +}
81196 +
81197 +static inline long statcnt_get(statcnt_t *cnt)
81198 +{
81199 +       return cnt->count;
81200 +}
81201 +
81202 +/* CONFIG_SMP */
81203 +#endif
81204 +
81205 +static inline void statcnt_init(statcnt_t *cnt)
81206 +{
81207 +       xmemset(cnt, 0, sizeof *cnt);
81208 +}
81209 +
81210 +#define statcnt_reset(cnt) statcnt_init(cnt)
81211 +
81212 +#define statcnt_inc(cnt) statcnt_add((cnt), +1)
81213 +#define statcnt_dec(cnt) statcnt_add((cnt), -1)
81214 +
81215 +/* __STATCNT_H__ */
81216 +#endif
81217 +
81218 +/* Make Linus happy.
81219 +   Local variables:
81220 +   c-indentation-style: "K&R"
81221 +   mode-name: "LC"
81222 +   c-basic-offset: 8
81223 +   tab-width: 8
81224 +   fill-column: 120
81225 +   End:
81226 +*/
81227 diff -rupN linux-2.6.8-rc3/fs/reiser4/stats.c linux-2.6.8-rc3-a/fs/reiser4/stats.c
81228 --- linux-2.6.8-rc3/fs/reiser4/stats.c  1970-01-01 03:00:00.000000000 +0300
81229 +++ linux-2.6.8-rc3-a/fs/reiser4/stats.c        2004-08-05 21:20:52.850709564 +0400
81230 @@ -0,0 +1,639 @@
81231 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
81232 + * reiser4/README */
81233 +
81234 +/* Statistical facilities. */
81235 +
81236 +/*
81237 + * Reiser4 has special REISER4_STATS compilation option (flippable through
81238 + * kernel configuration process). When it is on, code to collect statistics is
81239 + * compiled in. When option is off, code is preprocessed to zilch.
81240 + *
81241 + * Statistics are ["statistics" as singular is a name of a science, used as
81242 + * plural it refers to "classified facts respecting ... any particular class
81243 + * or interest" (Webster)] collected in the form of "statistical
81244 + * counters". Each counter has type statcnt_t (see statcnt.h). Counters are
81245 + * organized into per-super block reiser4_statistics structure. This structure
81246 + * contains sub-structures used to group counters. This grouping is only for
81247 + * cleanness, it has no effect on execution.
81248 + *
81249 + * In addition to sub-structures reiser4_statistics also contains array of
81250 + * reiser4_level_stat structures used to collect statistics attributable to
81251 + * particular level in reiser4 internal tree.
81252 + *
81253 + * As explained in statcnt.h, statcnt_t is large, hence, reiser4_statistics,
81254 + * containing a few scores of counters, is _huge_. It is so huge that it cannot
81255 + * be allocated with kmalloc() and vmalloc() is used for this.
81256 + *
81257 + * reiser4_stat_inc() and reiser4_stat_add() macros defined in stats.h are
81258 + * main means of updating statistical counters. Note, that due to the
81259 + * construction of statcnt_t counters said macros are completely lock-less
81260 + * and, nonetheless, almost accurate.
81261 + *
81262 + * Each statistical counter is exported through sysfs (see kattr.c for more
81263 + * details).
81264 + *
81265 + * If you are adding a new stat-counter, you should add it into appropriate struct
81266 + * definition in stats.h (reiser4_level_statistics or reiser4_statistics), and
81267 + * add initialization of counter to the reiser4_stat_defs[] or
81268 + * reiser4_stat_level_defs[] arrays in this file.
81269 + *
81270 + * Note: it is probably possible to avoid this duplication by placing some
81271 + * description of counters into third file and using preprocessor to generate
81272 + * definitions of structs and arrays.
81273 + *
81274 + */
81275 +
81276 +#include "kattr.h"
81277 +#include "reiser4.h"
81278 +#include "stats.h"
81279 +#include "statcnt.h"
81280 +#include "super.h"
81281 +#include "debug.h"
81282 +
81283 +#include <linux/sysfs.h>
81284 +#include <linux/vmalloc.h>
81285 +
81286 +/*
81287 + * Descriptor of one stat-counter: where it lives and how sysfs exports it.
81288 + */
81289 +typedef struct reiser4_stats_cnt {
81290 +       /* sysfs attribute object; getcnt() maps its ->attr back to this struct */
81291 +       reiser4_kattr  kattr;
81292 +       /* byte offset of the counter from the beginning of embedding data-type
81293 +        * (reiser4_level_statistics or reiser4_statistics) */
81294 +       ptrdiff_t      offset;
81295 +       /* counter size in bytes (sizeof of the field, see DEFINE_STATCNT_0) */
81296 +       size_t         size;
81297 +       /* printf(3) format to output counter value; DEFINE_STATCNT_0 appends "\n" */
81298 +       const char    *format;
81299 +} reiser4_stats_cnt;
81300 +
81301 +/*
81302 + * helper macro: return pointer (of type "@type *") to the counter located at
81303 + * the offset @offset (in bytes) from the data-structure located at @ptr.
81304 + */
81305 +#define getptrat(type, ptr, offset) ((type *)(((char *)(ptr)) + (offset)))
81306 +
81307 +/*
81308 + * helper macro: initializer of reiser4_stats_cnt describing particular
81309 + * stat-counter. NOTE(review): mode 0666 lets any user write; confirm intent.
81310 + */
81311 +#define DEFINE_STATCNT_0(aname,   /* name under which counter will be exported \
81312 +                                  * in sysfs */                        \
81313 +                        afield,  /* name of field in the embedding type */ \
81314 +                        atype,   /* embedding data-type */             \
81315 +                        afmt,    /* output printf(3) format */         \
81316 +                        ashow,   /* show method */                     \
81317 +                        astore   /* store method */)                   \
81318 +{                                                                      \
81319 +       .kattr = {                                                      \
81320 +               .attr = {                                               \
81321 +                       .kattr = {                                      \
81322 +                               .name = (char *)aname,                  \
81323 +                               .mode = 0666 /* rw-rw-rw- */            \
81324 +                       },                                              \
81325 +                       .show   = ashow,                                \
81326 +                       .store  = astore                                \
81327 +               },                                                      \
81328 +               .cookie = 0,                                            \
81329 +       },                                                              \
81330 +       .format = afmt "\n",                                            \
81331 +       .offset = offsetof(atype, afield),                              \
81332 +       .size   = sizeof(((atype *)0)->afield)                          \
81333 +}
81334 +
81335 +#if REISER4_STATS
81336 +
81337 +/*
81338 + * return counter description embedding @fskattr. @fskattr must be the ->attr
81339 + * field of a reiser4_kattr that is the ->kattr field of a reiser4_stats_cnt.
81340 + */
81341 +static inline reiser4_stats_cnt *getcnt(struct fs_kattr * fskattr)
81342 +{
81343 +       return container_of(fskattr, reiser4_stats_cnt, kattr.attr);
81344 +}
81345 +
81346 +/*
81347 + * ->show() for a "global" (not per-level) stat-counter: formats it into @buf.
81348 + */
81349 +static ssize_t
81350 +show_stat_attr(struct super_block * s, struct fs_kobject * fskobj,
81351 +              struct fs_kattr * fskattr, char * buf)
81352 +{
81353 +       char *p;
81354 +       reiser4_stats_cnt *cnt;
81355 +       statcnt_t *val;
81356 +
81357 +       /* obtain counter description embedding @fskattr */
81358 +       cnt = getcnt(fskattr);
81359 +       /* locate the counter in @s's stats by the byte offset from description */
81360 +       val = getptrat(statcnt_t, get_super_private(s)->stats, cnt->offset);
81361 +       p = buf;
81362 +       KATTR_PRINT(p, buf, cnt->format, statcnt_get(val));
81363 +       return (p - buf); /* distance @p ended up from the start of @buf */
81364 +}
81365 +
81366 +/*
81367 + * ->store() for a "global" stat-counter: ignores @buf, resets the counter.
81368 + */
81369 +static ssize_t
81370 +store_stat_attr(struct super_block * s, struct fs_kobject * fskobj,
81371 +               struct fs_kattr * fskattr, const char * buf, size_t size)
81372 +{
81373 +       reiser4_stats_cnt *cnt;
81374 +       statcnt_t *val;
81375 +
81376 +       /*
81377 +        * any write into the sysfs file of a stat-counter resets the counter
81378 +        * via statcnt_reset(); written data are not looked at
81379 +        */
81380 +
81381 +       cnt = getcnt(fskattr);
81382 +       val = getptrat(statcnt_t, get_super_private(s)->stats, cnt->offset);
81383 +       statcnt_reset(val);
81384 +       return size; /* report the whole write as consumed */
81385 +}
81386 +
81387 +/*
81388 + * ->show() for a per-level stat-counter; level comes from the owning kobject.
81389 + */
81390 +static ssize_t
81391 +show_stat_level_attr(struct super_block * s, struct fs_kobject * fskobj,
81392 +                    struct fs_kattr * fskattr, char * buf)
81393 +{
81394 +       char *p;
81395 +       reiser4_stats_cnt *cnt;
81396 +       statcnt_t *val;
81397 +       int level;
81398 +
81399 +       /* obtain level from the reiser4_level_stats_kobj embedding @fskobj */
81400 +       level = container_of(fskobj, reiser4_level_stats_kobj, kobj)->level;
81401 +       /* obtain counter description embedding @fskattr */
81402 +       cnt = getcnt(fskattr);
81403 +       /* locate the counter: ->level[level] selects the per-level struct,
81404 +        * cnt->offset the field within it. @level is used unchecked. */
81405 +       val = getptrat(statcnt_t, &get_super_private(s)->stats->level[level],
81406 +                      cnt->offset);
81407 +       p = buf;
81408 +       KATTR_PRINT(p, buf, cnt->format, statcnt_get(val));
81409 +       return (p - buf); /* distance @p ended up from the start of @buf */
81410 +}
81411 +
81412 +/*
81413 + * ->store() for a per-level stat-counter: ignores @buf, resets the counter.
81414 + */
81415 +static ssize_t
81416 +store_stat_level_attr(struct super_block * s, struct fs_kobject * fskobj,
81417 +                     struct fs_kattr * fskattr, const char * buf, size_t size)
81418 +{
81419 +       reiser4_stats_cnt *cnt;
81420 +       statcnt_t *val;
81421 +       int level;
81422 +
81423 +       /*
81424 +        * any write into the sysfs file of a stat-counter resets the counter
81425 +        * via statcnt_reset(); written data are not looked at
81426 +        */
81427 +
81428 +       level = container_of(fskobj, reiser4_level_stats_kobj, kobj)->level;
81429 +       cnt = getcnt(fskattr);
81430 +       val = getptrat(statcnt_t, &get_super_private(s)->stats->level[level],
81431 +                      cnt->offset);
81432 +       statcnt_reset(val);
81433 +       return size; /* report the whole write as consumed */
81434 +}
81435 +
81436 +/*
81437 + * initializer of reiser4_stats_cnt for global counter @field of reiser4_stat;
81438 + * sysfs name is the stringified @field, output format is "%lu"
81439 + */
81440 +#define DEFINE_STATCNT(field)                                  \
81441 +       DEFINE_STATCNT_0(#field, field, reiser4_stat, "%lu",    \
81442 +                        show_stat_attr, store_stat_attr)
81443 +
81444 +reiser4_stats_cnt reiser4_stat_defs[] = {
81445 +       DEFINE_STATCNT(tree.cbk),
81446 +       DEFINE_STATCNT(tree.cbk_found),
81447 +       DEFINE_STATCNT(tree.cbk_notfound),
81448 +       DEFINE_STATCNT(tree.cbk_restart),
81449 +       DEFINE_STATCNT(tree.cbk_cache_hit),
81450 +       DEFINE_STATCNT(tree.cbk_cache_miss),
81451 +       DEFINE_STATCNT(tree.cbk_cache_wrong_node),
81452 +       DEFINE_STATCNT(tree.cbk_cache_race),
81453 +       DEFINE_STATCNT(tree.object_lookup_novroot),
81454 +       DEFINE_STATCNT(tree.object_lookup_moved),
81455 +       DEFINE_STATCNT(tree.object_lookup_outside),
81456 +       DEFINE_STATCNT(tree.object_lookup_cannotlock),
81457 +       DEFINE_STATCNT(tree.object_lookup_restart),
81458 +       DEFINE_STATCNT(tree.pos_in_parent_hit),
81459 +       DEFINE_STATCNT(tree.pos_in_parent_miss),
81460 +       DEFINE_STATCNT(tree.pos_in_parent_set),
81461 +       DEFINE_STATCNT(tree.fast_insert),
81462 +       DEFINE_STATCNT(tree.fast_paste),
81463 +       DEFINE_STATCNT(tree.fast_cut),
81464 +       DEFINE_STATCNT(tree.reparenting),
81465 +       DEFINE_STATCNT(tree.rd_key_skew),
81466 +       DEFINE_STATCNT(tree.check_left_nonuniq),
81467 +       DEFINE_STATCNT(tree.left_nonuniq_found),
81468 +
81469 +       DEFINE_STATCNT(vfs_calls.open),
81470 +       DEFINE_STATCNT(vfs_calls.lookup),
81471 +       DEFINE_STATCNT(vfs_calls.create),
81472 +       DEFINE_STATCNT(vfs_calls.mkdir),
81473 +       DEFINE_STATCNT(vfs_calls.symlink),
81474 +       DEFINE_STATCNT(vfs_calls.mknod),
81475 +       DEFINE_STATCNT(vfs_calls.rename),
81476 +       DEFINE_STATCNT(vfs_calls.readlink),
81477 +       DEFINE_STATCNT(vfs_calls.follow_link),
81478 +       DEFINE_STATCNT(vfs_calls.setattr),
81479 +       DEFINE_STATCNT(vfs_calls.getattr),
81480 +       DEFINE_STATCNT(vfs_calls.read),
81481 +       DEFINE_STATCNT(vfs_calls.write),
81482 +       DEFINE_STATCNT(vfs_calls.truncate),
81483 +       DEFINE_STATCNT(vfs_calls.statfs),
81484 +       DEFINE_STATCNT(vfs_calls.bmap),
81485 +       DEFINE_STATCNT(vfs_calls.link),
81486 +       DEFINE_STATCNT(vfs_calls.llseek),
81487 +       DEFINE_STATCNT(vfs_calls.readdir),
81488 +       DEFINE_STATCNT(vfs_calls.ioctl),
81489 +       DEFINE_STATCNT(vfs_calls.mmap),
81490 +       DEFINE_STATCNT(vfs_calls.unlink),
81491 +       DEFINE_STATCNT(vfs_calls.rmdir),
81492 +       DEFINE_STATCNT(vfs_calls.alloc_inode),
81493 +       DEFINE_STATCNT(vfs_calls.destroy_inode),
81494 +       DEFINE_STATCNT(vfs_calls.delete_inode),
81495 +       DEFINE_STATCNT(vfs_calls.write_super),
81496 +       DEFINE_STATCNT(vfs_calls.private_data_alloc),
81497 +
81498 +       DEFINE_STATCNT(dir.readdir.calls),
81499 +       DEFINE_STATCNT(dir.readdir.reset),
81500 +       DEFINE_STATCNT(dir.readdir.rewind_left),
81501 +       DEFINE_STATCNT(dir.readdir.left_non_uniq),
81502 +       DEFINE_STATCNT(dir.readdir.left_restart),
81503 +       DEFINE_STATCNT(dir.readdir.rewind_right),
81504 +       DEFINE_STATCNT(dir.readdir.adjust_pos),
81505 +       DEFINE_STATCNT(dir.readdir.adjust_lt),
81506 +       DEFINE_STATCNT(dir.readdir.adjust_gt),
81507 +       DEFINE_STATCNT(dir.readdir.adjust_eq),
81508 +
81509 +       DEFINE_STATCNT(file.page_ops.readpage_calls),
81510 +       DEFINE_STATCNT(file.page_ops.writepage_calls),
81511 +       DEFINE_STATCNT(file.tail2extent),
81512 +       DEFINE_STATCNT(file.extent2tail),
81513 +       DEFINE_STATCNT(file.find_file_item),
81514 +       DEFINE_STATCNT(file.find_file_item_via_seal),
81515 +       DEFINE_STATCNT(file.find_file_item_via_right_neighbor),
81516 +       DEFINE_STATCNT(file.find_file_item_via_cbk),
81517 +
81518 +       DEFINE_STATCNT(extent.unfm_block_reads),
81519 +       DEFINE_STATCNT(extent.broken_seals),
81520 +       DEFINE_STATCNT(extent.bdp_caused_repeats),
81521 +
81522 +       DEFINE_STATCNT(tail.bdp_caused_repeats),
81523 +
81524 +       DEFINE_STATCNT(txnmgr.slept_in_wait_atom),
81525 +       DEFINE_STATCNT(txnmgr.slept_in_wait_event),
81526 +       DEFINE_STATCNT(txnmgr.commits),
81527 +       DEFINE_STATCNT(txnmgr.post_commit_writes),
81528 +       DEFINE_STATCNT(txnmgr.time_spent_in_commits),
81529 +       DEFINE_STATCNT(txnmgr.empty_bio),
81530 +       DEFINE_STATCNT(txnmgr.commit_from_writepage),
81531 +       DEFINE_STATCNT(txnmgr.capture_equal),
81532 +       DEFINE_STATCNT(txnmgr.capture_both),
81533 +       DEFINE_STATCNT(txnmgr.capture_block),
81534 +       DEFINE_STATCNT(txnmgr.capture_txnh),
81535 +       DEFINE_STATCNT(txnmgr.capture_none),
81536 +       DEFINE_STATCNT(txnmgr.restart.atom_begin),
81537 +       DEFINE_STATCNT(txnmgr.restart.cannot_commit),
81538 +       DEFINE_STATCNT(txnmgr.restart.should_wait),
81539 +       DEFINE_STATCNT(txnmgr.restart.flush),
81540 +       DEFINE_STATCNT(txnmgr.restart.fuse_lock_owners_fused),
81541 +       DEFINE_STATCNT(txnmgr.restart.fuse_lock_owners),
81542 +       DEFINE_STATCNT(txnmgr.restart.trylock_throttle),
81543 +       DEFINE_STATCNT(txnmgr.restart.assign_block),
81544 +       DEFINE_STATCNT(txnmgr.restart.assign_txnh),
81545 +       DEFINE_STATCNT(txnmgr.restart.fuse_wait_nonblock),
81546 +       DEFINE_STATCNT(txnmgr.restart.fuse_wait_slept),
81547 +       DEFINE_STATCNT(txnmgr.restart.init_fusion_atomf),
81548 +       DEFINE_STATCNT(txnmgr.restart.init_fusion_atomh),
81549 +       DEFINE_STATCNT(txnmgr.restart.init_fusion_fused),
81550 +
81551 +       DEFINE_STATCNT(flush.squeezed_completely),
81552 +       DEFINE_STATCNT(flush.flushed_with_unallocated),
81553 +       DEFINE_STATCNT(flush.squeezed_leaves),
81554 +       DEFINE_STATCNT(flush.squeezed_leaf_items),
81555 +       DEFINE_STATCNT(flush.squeezed_leaf_bytes),
81556 +       DEFINE_STATCNT(flush.flush),
81557 +       DEFINE_STATCNT(flush.left),
81558 +       DEFINE_STATCNT(flush.right),
81559 +       DEFINE_STATCNT(flush.slept_in_mtflush_sem),
81560 +
81561 +       DEFINE_STATCNT(pool.alloc),
81562 +       DEFINE_STATCNT(pool.kmalloc),
81563 +
81564 +       DEFINE_STATCNT(seal.perfect_match),
81565 +       DEFINE_STATCNT(seal.out_of_cache),
81566 +
81567 +       DEFINE_STATCNT(hashes.znode.lookup),
81568 +       DEFINE_STATCNT(hashes.znode.insert),
81569 +       DEFINE_STATCNT(hashes.znode.remove),
81570 +       DEFINE_STATCNT(hashes.znode.scanned),
81571 +       DEFINE_STATCNT(hashes.zfake.lookup),
81572 +       DEFINE_STATCNT(hashes.zfake.insert),
81573 +       DEFINE_STATCNT(hashes.zfake.remove),
81574 +       DEFINE_STATCNT(hashes.zfake.scanned),
81575 +       DEFINE_STATCNT(hashes.jnode.lookup),
81576 +       DEFINE_STATCNT(hashes.jnode.insert),
81577 +       DEFINE_STATCNT(hashes.jnode.remove),
81578 +       DEFINE_STATCNT(hashes.jnode.scanned),
81579 +       DEFINE_STATCNT(hashes.lnode.lookup),
81580 +       DEFINE_STATCNT(hashes.lnode.insert),
81581 +       DEFINE_STATCNT(hashes.lnode.remove),
81582 +       DEFINE_STATCNT(hashes.lnode.scanned),
81583 +       DEFINE_STATCNT(hashes.eflush.lookup),
81584 +       DEFINE_STATCNT(hashes.eflush.insert),
81585 +       DEFINE_STATCNT(hashes.eflush.remove),
81586 +       DEFINE_STATCNT(hashes.eflush.scanned),
81587 +
81588 +       DEFINE_STATCNT(block_alloc.nohint),
81589 +
81590 +       DEFINE_STATCNT(non_uniq),
81591 +
81592 +       /* pcwb - page common write back */
81593 +       DEFINE_STATCNT(pcwb.calls),
81594 +       DEFINE_STATCNT(pcwb.no_jnode),
81595 +       DEFINE_STATCNT(pcwb.written),
81596 +       DEFINE_STATCNT(pcwb.not_written),
81597 +
81598 +       /* copy on capture stats */
81599 +       DEFINE_STATCNT(coc.calls),
81600 +       /* satisfied requests */
81601 +       DEFINE_STATCNT(coc.ok_uber),
81602 +       DEFINE_STATCNT(coc.ok_nopage),
81603 +       DEFINE_STATCNT(coc.ok_clean),
81604 +       DEFINE_STATCNT(coc.ok_ovrwr),
81605 +       DEFINE_STATCNT(coc.ok_reloc),
81606 +       /* refused copy on capture requests */
81607 +       DEFINE_STATCNT(coc.forbidden),
81608 +       DEFINE_STATCNT(coc.writeback),
81609 +       DEFINE_STATCNT(coc.flush_queued),
81610 +       DEFINE_STATCNT(coc.dirty),
81611 +       DEFINE_STATCNT(coc.eflush),
81612 +       DEFINE_STATCNT(coc.scan_race),
81613 +       DEFINE_STATCNT(coc.atom_changed),
81614 +       DEFINE_STATCNT(coc.coc_race),
81615 +       DEFINE_STATCNT(coc.coc_wait),
81616 +};
81617 +
81618 +/*
81619 + * initializer of reiser4_stats_cnt for per-level counter @field of
81620 + * reiser4_level_stat; sysfs name is the stringified @field, format is "%lu"
81621 + */
81622 +#define DEFINE_STAT_LEVEL_CNT(field)                                   \
81623 +       DEFINE_STATCNT_0(#field, field,                                 \
81624 +                        reiser4_level_stat, "%lu",                     \
81625 +                        show_stat_level_attr, store_stat_level_attr)
81626 +
81627 +reiser4_stats_cnt reiser4_stat_level_defs[] = {
81628 +       DEFINE_STAT_LEVEL_CNT(carry_restart),
81629 +       DEFINE_STAT_LEVEL_CNT(carry_done),
81630 +       DEFINE_STAT_LEVEL_CNT(carry_left_in_carry),
81631 +       DEFINE_STAT_LEVEL_CNT(carry_left_in_cache),
81632 +       DEFINE_STAT_LEVEL_CNT(carry_left_missed),
81633 +       DEFINE_STAT_LEVEL_CNT(carry_left_not_avail),
81634 +       DEFINE_STAT_LEVEL_CNT(carry_left_refuse),
81635 +       DEFINE_STAT_LEVEL_CNT(carry_right_in_carry),
81636 +       DEFINE_STAT_LEVEL_CNT(carry_right_in_cache),
81637 +       DEFINE_STAT_LEVEL_CNT(carry_right_missed),
81638 +       DEFINE_STAT_LEVEL_CNT(carry_right_not_avail),
81639 +       DEFINE_STAT_LEVEL_CNT(insert_looking_left),
81640 +       DEFINE_STAT_LEVEL_CNT(insert_looking_right),
81641 +       DEFINE_STAT_LEVEL_CNT(insert_alloc_new),
81642 +       DEFINE_STAT_LEVEL_CNT(insert_alloc_many),
81643 +       DEFINE_STAT_LEVEL_CNT(insert),
81644 +       DEFINE_STAT_LEVEL_CNT(delete),
81645 +       DEFINE_STAT_LEVEL_CNT(cut),
81646 +       DEFINE_STAT_LEVEL_CNT(paste),
81647 +       DEFINE_STAT_LEVEL_CNT(extent),
81648 +       DEFINE_STAT_LEVEL_CNT(paste_restarted),
81649 +       DEFINE_STAT_LEVEL_CNT(update),
81650 +       DEFINE_STAT_LEVEL_CNT(modify),
81651 +       DEFINE_STAT_LEVEL_CNT(half_split_race),
81652 +       DEFINE_STAT_LEVEL_CNT(dk_vs_create_race),
81653 +       DEFINE_STAT_LEVEL_CNT(track_lh),
81654 +       DEFINE_STAT_LEVEL_CNT(sibling_search),
81655 +       DEFINE_STAT_LEVEL_CNT(cbk_key_moved),
81656 +       DEFINE_STAT_LEVEL_CNT(cbk_met_ghost),
81657 +       DEFINE_STAT_LEVEL_CNT(object_lookup_start),
81658 +
81659 +       DEFINE_STAT_LEVEL_CNT(jnode.jload),
81660 +       DEFINE_STAT_LEVEL_CNT(jnode.jload_already),
81661 +       DEFINE_STAT_LEVEL_CNT(jnode.jload_page),
81662 +       DEFINE_STAT_LEVEL_CNT(jnode.jload_async),
81663 +       DEFINE_STAT_LEVEL_CNT(jnode.jload_read),
81664 +       DEFINE_STAT_LEVEL_CNT(jnode.jput),
81665 +       DEFINE_STAT_LEVEL_CNT(jnode.jputlast),
81666 +
81667 +       DEFINE_STAT_LEVEL_CNT(znode.lock),
81668 +       DEFINE_STAT_LEVEL_CNT(znode.lock_iteration),
81669 +       DEFINE_STAT_LEVEL_CNT(znode.lock_neighbor),
81670 +       DEFINE_STAT_LEVEL_CNT(znode.lock_neighbor_iteration),
81671 +       DEFINE_STAT_LEVEL_CNT(znode.lock_read),
81672 +       DEFINE_STAT_LEVEL_CNT(znode.lock_write),
81673 +       DEFINE_STAT_LEVEL_CNT(znode.lock_lopri),
81674 +       DEFINE_STAT_LEVEL_CNT(znode.lock_hipri),
81675 +       DEFINE_STAT_LEVEL_CNT(znode.lock_contented),
81676 +       DEFINE_STAT_LEVEL_CNT(znode.lock_uncontented),
81677 +       DEFINE_STAT_LEVEL_CNT(znode.lock_dying),
81678 +       DEFINE_STAT_LEVEL_CNT(znode.lock_cannot_lock),
81679 +       DEFINE_STAT_LEVEL_CNT(znode.lock_can_lock),
81680 +       DEFINE_STAT_LEVEL_CNT(znode.lock_no_capture),
81681 +       DEFINE_STAT_LEVEL_CNT(znode.unlock),
81682 +       DEFINE_STAT_LEVEL_CNT(znode.wakeup),
81683 +       DEFINE_STAT_LEVEL_CNT(znode.wakeup_found),
81684 +       DEFINE_STAT_LEVEL_CNT(znode.wakeup_found_read),
81685 +       DEFINE_STAT_LEVEL_CNT(znode.wakeup_scan),
81686 +       DEFINE_STAT_LEVEL_CNT(znode.wakeup_convoy),
81687 +       DEFINE_STAT_LEVEL_CNT(node.lookup.calls),
81688 +       DEFINE_STAT_LEVEL_CNT(node.lookup.items),
81689 +       DEFINE_STAT_LEVEL_CNT(node.lookup.binary),
81690 +       DEFINE_STAT_LEVEL_CNT(node.lookup.seq),
81691 +       DEFINE_STAT_LEVEL_CNT(node.lookup.found),
81692 +       DEFINE_STAT_LEVEL_CNT(node.lookup.pos),
81693 +       DEFINE_STAT_LEVEL_CNT(node.lookup.posrelative),
81694 +       DEFINE_STAT_LEVEL_CNT(node.lookup.samepos),
81695 +       DEFINE_STAT_LEVEL_CNT(node.lookup.nextpos),
81696 +
81697 +       DEFINE_STAT_LEVEL_CNT(vm.release.try),
81698 +       DEFINE_STAT_LEVEL_CNT(vm.release.ok),
81699 +       DEFINE_STAT_LEVEL_CNT(vm.release.loaded),
81700 +       DEFINE_STAT_LEVEL_CNT(vm.release.copy),
81701 +       DEFINE_STAT_LEVEL_CNT(vm.release.eflushed),
81702 +       DEFINE_STAT_LEVEL_CNT(vm.release.fake),
81703 +       DEFINE_STAT_LEVEL_CNT(vm.release.dirty),
81704 +       DEFINE_STAT_LEVEL_CNT(vm.release.ovrwr),
81705 +       DEFINE_STAT_LEVEL_CNT(vm.release.writeback),
81706 +       DEFINE_STAT_LEVEL_CNT(vm.release.keepme),
81707 +       DEFINE_STAT_LEVEL_CNT(vm.release.bitmap),
81708 +
81709 +       DEFINE_STAT_LEVEL_CNT(vm.eflush.called),
81710 +       DEFINE_STAT_LEVEL_CNT(vm.eflush.ok),
81711 +       DEFINE_STAT_LEVEL_CNT(vm.eflush.nolonger),
81712 +       DEFINE_STAT_LEVEL_CNT(vm.eflush.needs_block),
81713 +       DEFINE_STAT_LEVEL_CNT(vm.eflush.loaded),
81714 +       DEFINE_STAT_LEVEL_CNT(vm.eflush.queued),
81715 +       DEFINE_STAT_LEVEL_CNT(vm.eflush.protected),
81716 +       DEFINE_STAT_LEVEL_CNT(vm.eflush.heard_banshee),
81717 +       DEFINE_STAT_LEVEL_CNT(vm.eflush.nopage),
81718 +       DEFINE_STAT_LEVEL_CNT(vm.eflush.writeback),
81719 +       DEFINE_STAT_LEVEL_CNT(vm.eflush.bitmap),
81720 +       DEFINE_STAT_LEVEL_CNT(vm.eflush.clustered),
81721 +       DEFINE_STAT_LEVEL_CNT(vm.eflush.eflushed),
81722 +
81723 +       DEFINE_STAT_LEVEL_CNT(time_slept),
81724 +       DEFINE_STAT_LEVEL_CNT(total_hits_at_level)
81725 +};
81726 +
81727 +/*
81728 + * printk() "<prefix><name>:" and value of counter @cnt found in struct @base.
81729 + */
81730 +static void
81731 +print_cnt(reiser4_stats_cnt * cnt, const char * prefix, void * base)
81732 +{
81733 +       printk("%s%s:\t ", prefix, cnt->kattr.attr.kattr.name);
81734 +       printk(cnt->format, /* format comes from the static counter tables */
81735 +              statcnt_get(getptrat(statcnt_t, base, cnt->offset)));
81736 +}
81737 +
81738 +/*
81739 + * Print all stat-counters of the current super block: global ones first,
81740 + * then per-level ones for each level whose total_hits_at_level is positive.
81741 + * This is done during umount if REISER4_STATS_ON_UMOUNT flag is set in
81742 + * ->debug_flags.
81743 + */
81744 +reiser4_internal void
81745 +reiser4_print_stats(void)
81746 +{
81747 +       reiser4_stat *s;
81748 +       int i;
81749 +
81750 +       s = get_current_super_private()->stats;
81751 +       for(i = 0 ; i < sizeof_array(reiser4_stat_defs) ; ++ i)
81752 +               print_cnt(&reiser4_stat_defs[i], "", s);
81753 +
81754 +       for (i = 0; i < REAL_MAX_ZTREE_HEIGHT; ++i) {
81755 +               int j;
81756 +
81757 +               if (statcnt_get(&s->level[i].total_hits_at_level) <= 0)
81758 +                       continue; /* skip levels that recorded no hits */
81759 +               printk("tree: at level: %i\n", i +  LEAF_LEVEL);
81760 +               for(j = 0 ; j < sizeof_array(reiser4_stat_level_defs) ; ++ j)
81761 +                       print_cnt(&reiser4_stat_level_defs[j], "\t", &s->level[i]);
81762 +       }
81763 +}
81764 +
81765 +/*
81766 + * add all per-level stat-counters as sysfs attributes of @kobj, stopping at
81767 + * the first failure. Returns 0 or the error from sysfs_create_file().
81768 + */
81769 +int
81770 +reiser4_populate_kattr_level_dir(struct kobject * kobj)
81771 +{
81772 +       int result;
81773 +       int i;
81774 +
81775 +       result = 0;
81776 +       for(i = 0 ; i < sizeof_array(reiser4_stat_level_defs) && !result ; ++ i){
81777 +               struct attribute *a;
81778 +
81779 +               a = &reiser4_stat_level_defs[i].kattr.attr.kattr;
81780 +               result = sysfs_create_file(kobj, a);
81781 +       }
81782 +       if (result != 0) /* loop already stepped @i past the failed entry */
81783 +               warning("nikita-2921", "Failed to add sysfs level attr: %i, %i",
81784 +                       result, i - 1);
81785 +       return result;
81786 +}
81787 +
81788 +/*
81789 + * allocate and initialize stat-counters; 0 and *@storehere set, or -ENOMEM.
81790 + */
81791 +int reiser4_stat_init(reiser4_stat ** storehere)
81792 +{
81793 +       reiser4_stat *stats;
81794 +       statcnt_t *cnt;
81795 +       int num, i;
81796 +
81797 +       /* sanity check: the flat walk below needs sizeof *stats % sizeof *cnt == 0 */
81798 +       cassert((sizeof *stats) / (sizeof *cnt) * (sizeof *cnt) == (sizeof *stats));
81799 +
81800 +       /*
81801 +        * allocate @stats via vmalloc_32: it's too large for kmalloc.
81802 +        */
81803 +       stats = vmalloc_32(sizeof *stats);
81804 +       if (stats == NULL)
81805 +               return -ENOMEM; /* *@storehere is left untouched */
81806 +
81807 +       *storehere = stats;
81808 +
81809 +       num = (sizeof *stats) / (sizeof *cnt);
81810 +       cnt = (statcnt_t *)stats;
81811 +       /*
81812 +        * treat @stats as a flat array of statcnt_t and initialize each one
81813 +        */
81814 +       for (i = 0; i < num ; ++i, ++cnt)
81815 +               statcnt_init(cnt);
81816 +       return 0;
81817 +}
81818 +
81819 +/*
81820 + * vfree() counters allocated by reiser4_stat_init() and reset *@stats to
81821 + * NULL. Called during umount.
81822 + */
81823 +void reiser4_stat_done(reiser4_stat ** stats)
81824 +{
81825 +       vfree(*stats);
81826 +       *stats = NULL;
81827 +}
81828 +
81829 +#else
81830 +void
81831 +reiser4_print_stats(void)
81832 +{      /* REISER4_STATS is off: there are no counters to print */
81833 +}
81834 +#endif
81835 +
81836 +/*
81837 + * add global stat-counters to @kobj (/sys/fs/reiser4/<dev>/stats); 0 or error.
81838 + */
81839 +reiser4_internal int
81840 +reiser4_populate_kattr_dir(struct kobject * kobj UNUSED_ARG)
81841 +{
81842 +       int result;
81843 +       ON_STATS(int i);
81844 +
81845 +       result = 0; /* stays 0 when REISER4_STATS is off: nothing to add */
81846 +#if REISER4_STATS
81847 +       for(i = 0 ; i < sizeof_array(reiser4_stat_defs) && !result ; ++ i) {
81848 +               struct attribute *a;
81849 +
81850 +               a = &reiser4_stat_defs[i].kattr.attr.kattr;
81851 +               result = sysfs_create_file(kobj, a);
81852 +       }
81853 +       if (result != 0) /* loop already stepped @i past the failed entry */
81854 +               warning("nikita-2920", "Failed to add sysfs attr: %i, %i",
81855 +                       result, i - 1);
81856 +#endif
81857 +       return result;
81858 +}
81859 +
81860 +
81861 +/* Make Linus happy.
81862 +   Local variables:
81863 +   c-indentation-style: "K&R"
81864 +   mode-name: "LC"
81865 +   c-basic-offset: 8
81866 +   tab-width: 8
81867 +   fill-column: 120
81868 +   End:
81869 +*/
81870 diff -rupN linux-2.6.8-rc3/fs/reiser4/stats.h linux-2.6.8-rc3-a/fs/reiser4/stats.h
81871 --- linux-2.6.8-rc3/fs/reiser4/stats.h  1970-01-01 03:00:00.000000000 +0300
81872 +++ linux-2.6.8-rc3-a/fs/reiser4/stats.h        2004-08-05 21:20:53.482576288 +0400
81873 @@ -0,0 +1,769 @@
81874 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
81875 + * reiser4/README */
81876 +
81877 +/* Statistics gathering. See stats.c for comments. */
81878 +
81879 +#if !defined( __FS_REISER4_STATS_H__ )
81880 +#define __FS_REISER4_STATS_H__
81881 +
81882 +#include "forward.h"
81883 +#include "reiser4.h"
81884 +#include "debug.h"
81885 +#include "statcnt.h"
81886 +
81887 +/* for __u?? types */
81888 +#include <linux/types.h>
81889 +/* for struct super_block, etc */
81890 +#include <linux/fs.h>
81891 +/* for in_interrupt() */
81892 +#include <asm/hardirq.h>
81893 +
81894 +#include <linux/sched.h>
81895 +
81896 +#if REISER4_STATS
81897 +
81898 +/* following macros update counters from &reiser4_stat below, which
81899 +   see */
81900 +
81901 +#define ON_STATS(e) e
81902 +/* statistics gathering features. */
81903 +
81904 +#define REISER4_STATS_STRICT (0)
81905 +
81906 +/* statistical counters collected on each level of internal tree */
81907 +typedef struct reiser4_level_statistics {
81908 +       /* carries restarted due to deadlock avoidance algorithm */
81909 +       statcnt_t carry_restart;
81910 +       /* carries performed */
81911 +       statcnt_t carry_done;
81912 +       /* how many times carry, trying to find left neighbor of a given node,
81913 +          found it already in a carry set. */
81914 +       statcnt_t carry_left_in_carry;
81915 +       /* how many times carry, trying to find left neighbor of a given node,
81916 +          found it already in a memory. */
81917 +       statcnt_t carry_left_in_cache;
81918 +       /* how many times carry, trying to find left neighbor of a given node,
81919 +          found it is not in a memory. */
81920 +       statcnt_t carry_left_missed;
81921 +       /* how many times carry, trying to find left neighbor of a given node,
81922 +          found that left neighbor either doesn't exist (we are at the left
81923 +          border of the tree already), or that there is extent on the left.
81924 +       */
81925 +       statcnt_t carry_left_not_avail;
81926 +       /* how many times carry, trying to find left neighbor of a given node,
81927 +          gave this up to avoid deadlock */
81928 +       statcnt_t carry_left_refuse;
81929 +       /* how many times carry, trying to find right neighbor of a given
81930 +          node, found it already in a carry set. */
81931 +       statcnt_t carry_right_in_carry;
81932 +       /* how many times carry, trying to find right neighbor of a given
81933 +          node, found it already in a memory. */
81934 +       statcnt_t carry_right_in_cache;
81935 +       /* how many times carry, trying to find right neighbor of a given
81936 +          node, found it is not in a memory. */
81937 +       statcnt_t carry_right_missed;
81938 +       /* how many times carry, trying to find right neighbor of a given
81939 +          node, found that right neighbor either doesn't exist (we are at the
81940 +          right border of the tree already), or that there is extent on the
81941 +          right.
81942 +       */
81943 +       statcnt_t carry_right_not_avail;
81944 +       /* how many times insertion has to look into the left neighbor,
81945 +          searching for the free space. */
81946 +       statcnt_t insert_looking_left;
81947 +       /* how many times insertion has to look into the right neighbor,
81948 +          searching for the free space. */
81949 +       statcnt_t insert_looking_right;
81950 +       /* how many times insertion has to allocate new node, searching for
81951 +          the free space. */
81952 +       statcnt_t insert_alloc_new;
81953 +       /* how many times insertion has to allocate several new nodes in a
81954 +          row, searching for the free space. */
81955 +       statcnt_t insert_alloc_many;
81956 +       /* how many insertions were performed by carry. */
81957 +       statcnt_t insert;
81958 +       /* how many deletions were performed by carry. */
81959 +       statcnt_t delete;
81960 +       /* how many cuts were performed by carry. */
81961 +       statcnt_t cut;
81962 +       /* how many pastes (insertions into existing items) were performed by
81963 +          carry. */
81964 +       statcnt_t paste;
81965 +       /* how many extent insertions were done by carry. */
81966 +       statcnt_t extent;
81967 +       /* how many paste operations were restarted as insert. */
81968 +       statcnt_t paste_restarted;
81969 +       /* how many updates of delimiting keys were performed by carry. */
81970 +       statcnt_t update;
81971 +       /* how many times carry notified parent node about updates in its
81972 +          child. */
81973 +       statcnt_t modify;
81974 +       /* how many times node was found reparented at the time when its
81975 +          parent has to be updated. */
81976 +       statcnt_t half_split_race;
81977 +       /* how many times new node was inserted into sibling list after
81978 +          concurrent balancing modified right delimiting key of its left
81979 +          neighbor.
81980 +       */
81981 +       statcnt_t dk_vs_create_race;
81982 +       /* how many times insert or paste ultimately went into node different
81983 +          from original target */
81984 +       statcnt_t track_lh;
81985 +       /* how many times sibling lookup required getting that high in a
81986 +          tree */
81987 +       statcnt_t sibling_search;
81988 +       /* key was moved out of node while thread was waiting for the lock */
81989 +       statcnt_t cbk_key_moved;
81990 +       /* node was moved out of tree while thread was waiting for the lock */
81991 +       statcnt_t cbk_met_ghost;
81992 +       /* how many times vroot ("virtual root") optimization was used during
81993 +        * tree lookup */
81994 +       statcnt_t object_lookup_start;
81995 +       struct {
81996 +               /* calls to jload() */
81997 +               statcnt_t jload;
81998 +               /* calls to jload() that found jnode already loaded */
81999 +               statcnt_t jload_already;
82000 +               /* calls to jload() that found page already in memory */
82001 +               statcnt_t jload_page;
82002 +               /* calls to jload() that found jnode with asynchronous io
82003 +                * started */
82004 +               statcnt_t jload_async;
82005 +               /* calls to jload() that actually had to read data */
82006 +               statcnt_t jload_read;
82007 +               /* calls to jput() */
82008 +               statcnt_t jput;
82009 +               /* calls to jput() that released last reference */
82010 +               statcnt_t jputlast;
82011 +       } jnode;
82012 +       struct {
82013 +               /* calls to lock_znode() */
82014 +               statcnt_t lock;
82015 +               /* number of times loop inside lock_znode() was executed */
82016 +               statcnt_t lock_iteration;
82017 +               /* calls to lock_neighbor() */
82018 +               statcnt_t lock_neighbor;
82019 +               /* number of times loop inside lock_neighbor() was executed */
82020 +               statcnt_t lock_neighbor_iteration;
82021 +               /* read locks taken */
82022 +               statcnt_t lock_read;
82023 +               /* write locks taken */
82024 +               statcnt_t lock_write;
82025 +               /* low priority locks taken */
82026 +               statcnt_t lock_lopri;
82027 +               /* high priority locks taken */
82028 +               statcnt_t lock_hipri;
82029 +               /* how many requests for znode long term lock couldn't succeed
82030 +                * immediately. */
82031 +               statcnt_t lock_contented;
82032 +               /* how many requests for znode long term lock managed to
82033 +                * succeed immediately. */
82034 +               statcnt_t lock_uncontented;
82035 +               /* attempt to acquire a lock failed, because target node was
82036 +                * dying */
82037 +               statcnt_t lock_dying;
82038 +               /* lock wasn't immediately available, due to incompatible lock
82039 +                * mode */
82040 +               statcnt_t lock_cannot_lock;
82041 +               /* lock was immediately available (i.e., without wait) */
82042 +               statcnt_t lock_can_lock;
82043 +               /* no node capture was necessary when acquiring a lock */
82044 +               statcnt_t lock_no_capture;
82045 +               /* number of unlocks */
82046 +               statcnt_t unlock;
82047 +               /* number of times unlock decided to wake up sleeping
82048 +                * requestors */
82049 +               statcnt_t wakeup;
82050 +               /* number of times requestors were actually found during wake
82051 +                * up */
82052 +               statcnt_t wakeup_found;
82053 +               /* number of read-mode requestors found */
82054 +               statcnt_t wakeup_found_read;
82055 +               /* number of requestor queue items scanned during wake-up
82056 +                * processing */
82057 +               statcnt_t wakeup_scan;
82058 +               /* number of requestors bundled into convoys */
82059 +               statcnt_t wakeup_convoy;
82060 +       } znode;
82061 +       struct {
82062 +               /* node lookup stats */
82063 +               struct {
82064 +                       /* ->lookup() calls */
82065 +                       statcnt_t calls;
82066 +                       /* items in all nodes */
82067 +                       statcnt_t items;
82068 +                       /* "hops" of binary search */
82069 +                       statcnt_t binary;
82070 +                       /* iterations of sequential search */
82071 +                       statcnt_t seq;
82072 +                       /* how many times key sought for was found */
82073 +                       statcnt_t found;
82074 +                       /* average position where key was found */
82075 +                       statcnt_t pos;
82076 +                       /* average position where key was found relative to
82077 +                        * total number of items */
82078 +                       statcnt_t posrelative;
82079 +                       /* number of times key was found in the same position
82080 +                        * as in the previous lookup in this node */
82081 +                       statcnt_t samepos;
82082 +                       /* number of times key was found in the next position
82083 +                        * relative to the previous lookup in this node */
82084 +                       statcnt_t nextpos;
82085 +               } lookup;
82086 +       } node;
82087 +       struct {
82088 +               /* reiser4_releasepage() stats */
82089 +               struct {
82090 +                       /* for how many pages on this level ->releasepage()
82091 +                        * was called. */
82092 +                       statcnt_t try;
82093 +                       /* how many pages were released on this level */
82094 +                       statcnt_t ok;
82095 +                       /*
82096 +                        * how many times we failed to release a page,
82097 +                        * because...
82098 +                        */
82099 +                       /* jnode pinned it in memory */
82100 +                       statcnt_t loaded;
82101 +                       /* it's a coc-ed (copied on capture) page */
82102 +                       statcnt_t copy;
82103 +                       /* it has fake block number */
82104 +                       statcnt_t fake;
82105 +                       /* it is dirty */
82106 +                       statcnt_t dirty;
82107 +                       /* it is in the overwrite set */
82108 +                       statcnt_t ovrwr;
82109 +                       /* it is under writeback */
82110 +                       statcnt_t writeback;
82111 +                       /* it's anonymous page, and jnode is not yet captured
82112 +                        * into atom. */
82113 +                       statcnt_t keepme;
82114 +                       /* it's bitmap */
82115 +                       statcnt_t bitmap;
82116 +
82117 +                       /* emergency flush was performed on this page/jnode,
82118 +                        * so it's ok to release */
82119 +                       statcnt_t eflushed;
82120 +               } release;
82121 +               /* emergency flush statistics */
82122 +               struct {
82123 +                       /* how many times emergency flush was invoked on this
82124 +                        * level */
82125 +                       statcnt_t called;
82126 +                       /* eflush was successful */
82127 +                       statcnt_t ok;
82128 +                       /* jnode ceased to be flushable after lock release */
82129 +                       statcnt_t nolonger;
82130 +                       /* new block number was needed for eflush */
82131 +                       statcnt_t needs_block;
82132 +                       /*
82133 +                        * eflush failed, because...
82134 +                        */
82135 +                       /* jnode is loaded */
82136 +                       statcnt_t loaded;
82137 +                       /* jnode is in the flush queue */
82138 +                       statcnt_t queued;
82139 +                       /* jnode is protected (JNODE_PROTECTED bit is on) */
82140 +                       statcnt_t protected;
82141 +                       /* jnode heard banshee already */
82142 +                       statcnt_t heard_banshee;
82143 +                       /* jnode has no page */
82144 +                       statcnt_t nopage;
82145 +                       /* jnode is under writeback */
82146 +                       statcnt_t writeback;
82147 +                       /* jnode is bitmap */
82148 +                       statcnt_t bitmap;
82149 +                       /* jnode is crypto-compress cluster */
82150 +                       statcnt_t clustered;
82151 +                       /* jnode is already eflushed */
82152 +                       statcnt_t eflushed;
82153 +               } eflush;
82154 +       } vm;
82155 +       /*
82156 +        * non zero, if there is some other non-zero counter at this tree
82157 +        * level. Used to suppress processing of higher tree levels, that
82158 +        * don't exist on the underlying file system.
82159 +        */
82160 +       statcnt_t total_hits_at_level;
82161 +       /* total time (in jiffies) threads sleep for the longterm locks on
82162 +        * this level */
82163 +       statcnt_t time_slept;
82164 +} reiser4_level_stat;
82165 +
82166 +/*
82167 + * hash table statistics. One such object is added to each type safe hash table
82168 + * instance (fs/reiser4/type_safe_hash.h). TSHASH_*() below ignore NULL @stat.
82169 + */
82170 +typedef struct tshash_stat {
82171 +       statcnt_t lookup;  /* number of lookup calls */
82172 +       statcnt_t insert;  /* number of insert calls */
82173 +       statcnt_t remove;  /* number of remove calls */
82174 +       statcnt_t scanned; /* total number of items inspected during all
82175 +                           * operations. This can be used to estimate average
82176 +                           * hash-chain depth. */
82177 +} tshash_stat;
82178 +
82179 +#define TSHASH_LOOKUP(stat) ({ if (stat) statcnt_inc(&(stat)->lookup); })
82180 +#define TSHASH_INSERT(stat) ({ if (stat) statcnt_inc(&(stat)->insert); })
82181 +#define TSHASH_REMOVE(stat) ({ if (stat) statcnt_inc(&(stat)->remove); })
82182 +#define TSHASH_SCANNED(stat) ({ if (stat) statcnt_inc(&(stat)->scanned); })
82183 +
82184 +/* set of statistics counters. This is embedded into super-block when
82185 +   REISER4_STATS is on. */
82186 +typedef struct reiser4_statistics {
82187 +       struct {
82188 +               /* calls to coord_by_key */
82189 +               statcnt_t cbk;
82190 +               /* calls to coord_by_key that found requested key */
82191 +               statcnt_t cbk_found;
82192 +               /* calls to coord_by_key that didn't find requested key */
82193 +               statcnt_t cbk_notfound;
82194 +               /* number of times calls to coord_by_key restarted */
82195 +               statcnt_t cbk_restart;
82196 +               /* calls to coord_by_key that found key in coord cache */
82197 +               statcnt_t cbk_cache_hit;
82198 +               /* calls to coord_by_key that didn't find key in coord
82199 +                  cache */
82200 +               statcnt_t cbk_cache_miss;
82201 +               /* cbk cache search found wrong node */
82202 +               statcnt_t cbk_cache_wrong_node;
82203 +               /* search for key in coord cache raced against parallel
82204 +                  balancing and lost. This should be rare. If not,
82205 +                  update cbk_cache_search() according to comment
82206 +                  therewithin.
82207 +               */
82208 +               statcnt_t cbk_cache_race;
82209 +               /*
82210 +                * statistics for vroot ("virtual root") optimization of tree
82211 +                * lookup.
82212 +                */
82213 +               /*
82214 +                * vroot usage failed, because...
82215 +                */
82216 +               /* given object has no vroot set */
82217 +               statcnt_t object_lookup_novroot;
82218 +               /* vroot changed due to race with balancing */
82219 +               statcnt_t object_lookup_moved;
82220 +               /* object is not fitted into its vroot any longer */
82221 +               statcnt_t object_lookup_outside;
82222 +               /* failed to lock vroot */
82223 +               statcnt_t object_lookup_cannotlock;
82224 +
82225 +               /* tree traversal had to be re-started due to vroot failure */
82226 +               statcnt_t object_lookup_restart;
82227 +
82228 +               /* number of times coord of child in its parent, cached
82229 +                  in the former, was reused. */
82230 +               statcnt_t pos_in_parent_hit;
82231 +               /* number of times binary search for child position in
82232 +                  its parent had to be redone. */
82233 +               statcnt_t pos_in_parent_miss;
82234 +               /* number of times position of child in its parent was
82235 +                  cached in the former */
82236 +               statcnt_t pos_in_parent_set;
82237 +               /* how many times carry() was skipped by doing "fast
82238 +                  insertion path". See
82239 +                  fs/reiser4/plugin/node/node.h:->fast_insert() method.
82240 +               */
82241 +               statcnt_t fast_insert;
82242 +               /* how many times carry() was skipped by doing "fast
82243 +                  paste path". See
82244 +                  fs/reiser4/plugin/node/node.h:->fast_paste() method.
82245 +               */
82246 +               statcnt_t fast_paste;
82247 +               /* how many times carry() was skipped by doing "fast
82248 +                  cut path". See
82249 +                  fs/reiser4/plugin/node/node.h:->fast_cut() method.
82250 +               */
82251 +               statcnt_t fast_cut;
82252 +               /* children reparented due to shifts at the parent level */
82253 +               statcnt_t reparenting;
82254 +               /* right delimiting key is not exact */
82255 +               statcnt_t rd_key_skew;
82256 +               statcnt_t check_left_nonuniq;
82257 +               statcnt_t left_nonuniq_found;
82258 +       } tree;
82259 +       reiser4_level_stat level[REISER4_MAX_ZTREE_HEIGHT];
82260 +       /* system call statistics. Indicates how many times given system call
82261 +        * (or, sometimes, internal kernel function) was
82262 +        * invoked. Self-explanatory. */
82263 +       struct {
82264 +               statcnt_t open;
82265 +               statcnt_t lookup;
82266 +               statcnt_t create;
82267 +               statcnt_t mkdir;
82268 +               statcnt_t symlink;
82269 +               statcnt_t mknod;
82270 +               statcnt_t rename;
82271 +               statcnt_t readlink;
82272 +               statcnt_t follow_link;
82273 +               statcnt_t setattr;
82274 +               statcnt_t getattr;
82275 +               statcnt_t read;
82276 +               statcnt_t write;
82277 +               statcnt_t truncate;
82278 +               statcnt_t statfs;
82279 +               statcnt_t bmap;
82280 +               statcnt_t link;
82281 +               statcnt_t llseek;
82282 +               statcnt_t readdir;
82283 +               statcnt_t ioctl;
82284 +               statcnt_t mmap;
82285 +               statcnt_t unlink;
82286 +               statcnt_t rmdir;
82287 +               statcnt_t alloc_inode;
82288 +               statcnt_t destroy_inode;
82289 +               statcnt_t delete_inode;
82290 +               statcnt_t write_super;
82291 +               statcnt_t private_data_alloc; /* allocations of either per
82292 +                                              * struct dentry or per struct
82293 +                                              * file data */
82294 +       } vfs_calls;
82295 +       struct {
82296 +               /* readdir stats */
82297 +               struct {
82298 +                       /* calls to readdir */
82299 +                       statcnt_t calls;
82300 +                       /* rewinds to the beginning of directory */
82301 +                       statcnt_t reset;
82302 +                       /* partial rewinds to the left */
82303 +                       statcnt_t rewind_left;
82304 +                       /* rewind to left that was completely within sequence
82305 +                        * of duplicate keys */
82306 +                       statcnt_t left_non_uniq;
82307 +                       /* restarts of rewinds to the left due to hi/lo
82308 +                        * priority locking */
82309 +                       statcnt_t left_restart;
82310 +                       /* rewinds to the right */
82311 +                       statcnt_t rewind_right;
82312 +                       /* how many times readdir position has to be adjusted
82313 +                        * due to directory modification. Large readdir
82314 +                        * comment in plugin/dir/dir.c */
82315 +                       statcnt_t adjust_pos;
82316 +                       /* how many times adjustment was on the left of
82317 +                        * current readdir position */
82318 +                       statcnt_t adjust_lt;
82319 +                       /* how many times adjustment was on the right of
82320 +                        * current readdir position */
82321 +                       statcnt_t adjust_gt;
82322 +                       /* how many times adjustment was exactly on the
82323 +                        * current readdir position */
82324 +                       statcnt_t adjust_eq;
82325 +               } readdir;
82326 +       } dir;
82327 +
82328 +       /* statistics of unix file plugin */
82329 +       struct {
82330 +
82331 +               struct {
82332 +                       statcnt_t readpage_calls;
82333 +                       statcnt_t writepage_calls;
82334 +               } page_ops;
82335 +
82336 +               /* number of tail conversions */
82337 +               statcnt_t tail2extent;
82338 +               statcnt_t extent2tail;
82339 +
82340 +               /* find_next_item statistic */
82341 +               statcnt_t find_file_item;
82342 +               statcnt_t find_file_item_via_seal;
82343 +               statcnt_t find_file_item_via_right_neighbor;
82344 +               statcnt_t find_file_item_via_cbk;
82345 +
82346 +       } file;
82347 +       struct {
82348 +               /* how many unformatted nodes were read */
82349 +               statcnt_t unfm_block_reads;
82350 +
82351 +               /* extent_write seals and unlock znode before locking/capturing
82352 +                  page which is to be modified. After page is locked/captured
82353 +                  it validates a seal. Number of found broken seals is stored
82354 +                  here
82355 +               */
82356 +               statcnt_t broken_seals;
82357 +
82358 +               /* extent_write calls balance_dirty_pages after it modifies
82359 +                  every page. Before that it seals node it currently holds
82360 +                  and uses seal_validate to lock it again. This field stores
82361 +                  how many times balance_dirty_pages broke that seal and
82362 +                  caused to repeat search tree traversal
82363 +               */
82364 +               statcnt_t bdp_caused_repeats;
82365 +               /* how many times extent_write could not write a coord and had
82366 +                * to ask for re-search */
82367 +               statcnt_t repeats;
82368 +       } extent;
82369 +       struct { /* stats on tail items */
82370 +               /* tail_write calls balance_dirty_pages after every call to
82371 +                  insert_flow. Before that it seals node it currently holds
82372 +                  and uses seal_validate to lock it again. This field stores
82373 +                  how many times balance_dirty_pages broke that seal and
82374 +                  caused to repeat search tree traversal
82375 +               */
82376 +               statcnt_t bdp_caused_repeats;
82377 +       } tail;
82378 +       /* transaction manager stats */
82379 +       struct {
82380 +               /* jiffies, spent in atom_wait_event() */
82381 +               statcnt_t slept_in_wait_event;
82382 +               /* jiffies, spent in capture_fuse_wait (wait for atom state
82383 +                * change) */
82384 +               statcnt_t slept_in_wait_atom;
82385 +               /* number of commits */
82386 +               statcnt_t commits;
82387 +               /* number of post commit writes */
82388 +               statcnt_t post_commit_writes;
82389 +               /* jiffies, spent in commits and post commit writes */
82390 +               statcnt_t time_spent_in_commits;
82391 +               /* how many times attempt to write a flush queue ended up with
82392 +                * an empty bio */
82393 +               statcnt_t empty_bio;
82394 +               /* how many times ->writepage kicked ktxnmgrd to start commit
82395 +                * of an atom */
82396 +               statcnt_t commit_from_writepage;
82397 +
82398 +               /*
82399 +                * fs/txnmgrd.c:try_capture_block() stats
82400 +                */
82401 +
82402 +               /* atoms of node and transaction handle are the same
82403 +                * already */
82404 +               statcnt_t capture_equal;
82405 +               /* node and handle both belong to atoms */
82406 +               statcnt_t capture_both;
82407 +               /* only node belongs to atom */
82408 +               statcnt_t capture_block;
82409 +               /* only handle belongs to atom */
82410 +               statcnt_t capture_txnh;
82411 +               /* neither node nor handle belong to atom */
82412 +               statcnt_t capture_none;
82413 +
82414 +               /*
82415 +                * how many times some transaction manager activity had to be
82416 +                * re-started, because...
82417 +                */
82418 +               struct {
82419 +                       /* new atom was created */
82420 +                       statcnt_t atom_begin;
82421 +                       /* commit_current_atom() found atom in use */
82422 +                       statcnt_t cannot_commit;
82423 +                       /* committer had to wait */
82424 +                       statcnt_t should_wait;
82425 +                       /* jnode_flush was invoked several times in a row */
82426 +                       statcnt_t flush;
82427 +                       /* fuse_not_fused_lock_owners() fused atoms */
82428 +                       statcnt_t fuse_lock_owners_fused;
82429 +                       /* fuse_not_fused_lock_owners() has to restart */
82430 +                       statcnt_t fuse_lock_owners;
82431 +                       /* trylock failed on atom */
82432 +                       statcnt_t trylock_throttle;
82433 +                       /* atom trylock failed in capture_assign_block() */
82434 +                       statcnt_t assign_block;
82435 +                       /* atom trylock failed in capture_assign_txnh() */
82436 +                       statcnt_t assign_txnh;
82437 +                       /* capture_fuse_wait() was called in non-blocking
82438 +                        * mode */
82439 +                       statcnt_t fuse_wait_nonblock;
82440 +                       /* capture_fuse_wait() had to sleep */
82441 +                       statcnt_t fuse_wait_slept;
82442 +                       /* capture_init_fusion() failed to try-lock node
82443 +                        * atom */
82444 +                       statcnt_t init_fusion_atomf;
82445 +                       /* capture_init_fusion() failed to try-lock handle
82446 +                        * atom */
82447 +                       statcnt_t init_fusion_atomh;
82448 +                       /* capture_init_fusion_locked() slept during fusion */
82449 +                       statcnt_t init_fusion_fused;
82450 +               } restart;
82451 +       } txnmgr;
82452 +       struct {
82453 +               /* how many nodes were squeezed to left neighbor completely */
82454 +               statcnt_t squeezed_completely;
82455 +               /* how many times nodes with unallocated children are written */
82456 +               statcnt_t flushed_with_unallocated;
82457 +               /* how many leaves were squeezed to left */
82458 +               statcnt_t squeezed_leaves;
82459 +               /* how many items were squeezed on leaf level */
82460 +               statcnt_t squeezed_leaf_items;
82461 +               /* how many bytes were squeezed on leaf level */
82462 +               statcnt_t squeezed_leaf_bytes;
82463 +               /* how many times jnode_flush was called */
82464 +               statcnt_t flush;
82465 +               /* how many nodes were scanned by scan_left() */
82466 +               statcnt_t left;
82467 +               /* how many nodes were scanned by scan_right() */
82468 +               statcnt_t right;
82469 +               /* an overhead of MTFLUSH semaphore */
82470 +               statcnt_t slept_in_mtflush_sem;
82471 +       } flush;
82472 +       struct {
82473 +               /* how many carry objects were allocated */
82474 +               statcnt_t alloc;
82475 +               /* how many "extra" carry objects were allocated by
82476 +                  kmalloc. */
82477 +               statcnt_t kmalloc;
82478 +       } pool;
82479 +       struct {
82480 +               /* seals that were found pristine */
82481 +               statcnt_t perfect_match;
82482 +               /* how many times node under seal was out of cache */
82483 +               statcnt_t out_of_cache;
82484 +       } seal;
82485 +       /* hash tables stats. See tshash_stat above. */
82486 +       struct {
82487 +               /* for the hash table of znodes with real block numbers */
82488 +               tshash_stat znode;
82489 +               /* for the hash table of znodes with fake block numbers */
82490 +               tshash_stat zfake;
82491 +               /* for the hash table of jnodes */
82492 +               tshash_stat jnode;
82493 +               /* for the hash table of lnodes */
82494 +               tshash_stat lnode;
82495 +               /* for the hash table of eflush_node_t's */
82496 +               tshash_stat eflush;
82497 +       } hashes;
82498 +       struct {
82499 +               /* how many times block was allocated without having valid
82500 +                * preceder. */
82501 +               statcnt_t nohint;
82502 +       } block_alloc;
82503 +       /* how many non-unique keys were scanned into tree */
82504 +       statcnt_t non_uniq;
82505 +
82506 +       /* page_common_writeback stats */
82507 +       struct {
82508 +               /* calls to ->writepage() */
82509 +               statcnt_t calls;
82510 +               /* ->writepage() failed to allocate jnode for the page */
82511 +               statcnt_t no_jnode;
82512 +               /* emergency flush succeeded */
82513 +               statcnt_t written;
82514 +               /* emergency flush failed */
82515 +               statcnt_t not_written;
82516 +       } pcwb;
82517 +
82518 +       /* stat of copy on capture requests */
82519 +       struct {
82520 +               statcnt_t calls;
82521 +               /* satisfied requests */
82522 +               statcnt_t ok_uber;
82523 +               statcnt_t ok_nopage;
82524 +               statcnt_t ok_clean;
82525 +               statcnt_t ok_ovrwr;
82526 +               statcnt_t ok_reloc;
82527 +               /* refused copy on capture requests */
82528 +               statcnt_t forbidden;
82529 +               statcnt_t writeback;
82530 +               statcnt_t flush_queued;
82531 +               statcnt_t dirty;
82532 +               statcnt_t eflush;
82533 +               statcnt_t scan_race;
82534 +               statcnt_t atom_changed;
82535 +               statcnt_t coc_race;
82536 +               statcnt_t coc_wait;
82537 +       } coc;
82538 +
82539 +       statcnt_t pages_dirty;
82540 +       statcnt_t pages_clean;
82541 +} reiser4_stat;
82542 +
82543 +
82544 +#define get_current_stat()                                     \
82545 +       (get_super_private_nocheck(reiser4_get_current_sb())->stats)
82546 +
82547 +/* Macros to gather statistical data. If REISER4_STATS is disabled, they
82548 +   expand to noop (reiser4_stat() expands to a NULL pointer instead).
82549 +*/
82550 +
82551 +#define        reiser4_stat(sb, cnt) (&get_super_private_nocheck(sb)->stats->cnt)
82552 +
82553 +#define        reiser4_stat_inc_at(sb, counter)                                        \
82554 +       statcnt_inc(&get_super_private_nocheck(sb)->stats->counter)
82555 +
82556 +#define        reiser4_stat_inc(counter)                               \
82557 +       ON_CONTEXT(statcnt_inc(&get_current_stat()->counter))
82558 +
82559 +#define reiser4_stat_add(counter, delta)                       \
82560 +       ON_CONTEXT(statcnt_add(&get_current_stat()->counter, delta))
82561 +
82562 +#define        reiser4_stat_inc_at_level(lev, stat)                                    \
82563 +({                                                                             \
82564 +       /* evaluate (lev) exactly once */                                       \
82565 +       const int __level = (lev);                                              \
82566 +                                                                               \
82567 +       /* levels outside of [0, REISER4_MAX_ZTREE_HEIGHT) are ignored */       \
82568 +       if (__level >= 0 && __level < REISER4_MAX_ZTREE_HEIGHT) {               \
82569 +               reiser4_stat_inc(level[__level]. stat);                         \
82570 +               /* every hit also bumps the per-level total */                  \
82571 +               reiser4_stat_inc(level[__level]. total_hits_at_level);          \
82572 +       }                                                                       \
82573 +})
82574 +
82575 +#define        reiser4_stat_add_at_level(lev, stat, value)                             \
82576 +({                                                                             \
82577 +       /* add (value) to counter (stat) of tree level (lev), if valid */       \
82578 +       /* __level, not "level": must not capture a caller's "level" */         \
82579 +       int __level;                                                            \
82580 +                                                                               \
82581 +       __level = (lev);                                                        \
82582 +       if (__level >= 0 && __level < REISER4_MAX_ZTREE_HEIGHT) {               \
82583 +               reiser4_stat_add(level[__level]. stat , value );                \
82584 +               reiser4_stat_inc(level[__level]. total_hits_at_level);          \
82585 +       }                                                                       \
82586 +})
82587 +
82588 +#define        reiser4_stat_level_inc(l, stat)                 \
82589 +       reiser4_stat_inc_at_level((l)->level_no, stat)
82590 +
82591 +
82592 +struct kobject; /* forward declaration: only a pointer to it is used below */
82593 +extern int reiser4_populate_kattr_level_dir(struct kobject * kobj);
82594 +extern int reiser4_stat_init(reiser4_stat ** stats);
82595 +extern void reiser4_stat_done(reiser4_stat ** stats);
82596 +
82597 +/* REISER4_STATS */
82598 +#else
82599 +
82600 +#define ON_STATS(e) noop /* statistics compiled out: all below are stubs */
82601 +
82602 +#define        reiser4_stat(sb, cnt) ((void *)NULL)
82603 +#define        reiser4_stat_inc(counter)  noop
82604 +#define reiser4_stat_add(counter, delta) noop
82605 +
82606 +#define        reiser4_stat_inc_at(sb, counter) noop
82607 +#define        reiser4_stat_inc_at_level(lev, stat) noop
82608 +#define reiser4_stat_add_at_level(lev, stat, cnt) noop
82609 +#define        reiser4_stat_level_inc(l, stat) noop
82610 +
82611 +typedef struct {
82612 +} reiser4_stat; /* empty placeholder so that the type name still exists */
82613 +
82614 +typedef struct tshash_stat {
82615 +} tshash_stat; /* empty placeholder, see above */
82616 +
82617 +#define TSHASH_LOOKUP(stat) noop
82618 +#define TSHASH_INSERT(stat) noop
82619 +#define TSHASH_REMOVE(stat) noop
82620 +#define TSHASH_SCANNED(stat) noop
82621 +
82622 +#define reiser4_populate_kattr_level_dir(kobj) (0) /* one arg, as the real one */
82623 +#define reiser4_stat_init(s) (0)
82624 +#define reiser4_stat_done(s) noop
82625 +
82626 +#endif
82627 +
82628 +extern int reiser4_populate_kattr_dir(struct kobject * kobj);
82629 +
82630 +
82631 +/* __FS_REISER4_STATS_H__ */
82632 +#endif
82633 +
82634 +/* Make Linus happy.
82635 +   Local variables:
82636 +   c-indentation-style: "K&R"
82637 +   mode-name: "LC"
82638 +   c-basic-offset: 8
82639 +   tab-width: 8
82640 +   fill-column: 120
82641 +   End:
82642 +*/
82643 diff -rupN linux-2.6.8-rc3/fs/reiser4/status_flags.c linux-2.6.8-rc3-a/fs/reiser4/status_flags.c
82644 --- linux-2.6.8-rc3/fs/reiser4/status_flags.c   1970-01-01 03:00:00.000000000 +0300
82645 +++ linux-2.6.8-rc3-a/fs/reiser4/status_flags.c 2004-08-05 21:20:53.434586410 +0400
82646 @@ -0,0 +1,193 @@
82647 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
82648 + * reiser4/README */
82649 +
82650 +/* Functions that deal with reiser4 status block, query status and update it, if needed */
82651 +
82652 +#include <linux/page-flags.h>
82653 +#include <linux/bio.h>
82654 +#include <linux/highmem.h>
82655 +#include <linux/fs.h>
82656 +#include <linux/blkdev.h>
82657 +#include "debug.h"
82658 +#include "dformat.h"
82659 +#include "status_flags.h"
82660 +#include "super.h"
82661 +
82662 +/* Completion handler for the status block bio. Once the whole request is finished it records the
82663 +   outcome on the page (uptodate on success, error otherwise) and then unlocks the page in either
82664 +   case, so that whoever waits on the page lock can proceed. The bio is kept for later reuse. */
82665 +static int reiser4_status_endio(struct bio *bio, unsigned int bytes_done, int err)
82666 +{
82667 +       struct page *pg;
82668 +
82669 +       if (bio->bi_size != 0)
82670 +               return 1;       /* ->bi_size is not yet zero: nothing to do on this call */
82671 +       pg = bio->bi_io_vec->bv_page;
82672 +       if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
82673 +               ClearPageUptodate(pg);
82674 +               SetPageError(pg);
82675 +       } else
82676 +               SetPageUptodate(pg);
82677 +       unlock_page(pg);
82678 +       return 0;
82679 +}
82680 +
82681 +/* Initialise status code. This is expected to be called from the disk format code. @block is where
82682 +   the status block lives. Returns 0 on success, -ENOMEM, -EIO or -EINVAL (wrong magic) on failure. */
82683 +reiser4_internal int reiser4_status_init(reiser4_block_nr block)
82684 +{
82685 +       struct super_block *sb = reiser4_get_current_sb();
82686 +       struct reiser4_status *statuspage;
82687 +       struct bio *bio;
82688 +       struct page *page;
82689 +
82690 +       get_super_private(sb)->status_page = NULL;
82691 +       get_super_private(sb)->status_bio = NULL;
82692 +
82693 +       page = alloc_pages(GFP_KERNEL, 0);
82694 +       if (!page)
82695 +               return -ENOMEM;
82696 +
82697 +       bio = bio_alloc(GFP_KERNEL, 1);
82698 +       if (bio == NULL) {
82699 +               __free_pages(page, 0);
82700 +               return -ENOMEM;
82701 +       }
82702 +       bio->bi_sector = block * (sb->s_blocksize >> 9);
82703 +       bio->bi_bdev = sb->s_bdev;
82704 +       bio->bi_io_vec[0].bv_page = page;
82705 +       bio->bi_io_vec[0].bv_len = sb->s_blocksize;
82706 +       bio->bi_io_vec[0].bv_offset = 0;
82707 +       bio->bi_vcnt = 1;
82708 +       bio->bi_size = sb->s_blocksize;
82709 +       bio->bi_end_io = reiser4_status_endio;
82710 +       lock_page(page);
82711 +       submit_bio(READ, bio);
82712 +       blk_run_address_space(get_super_fake(sb)->i_mapping);
82713 +       wait_on_page_locked(page);      /* reiser4_status_endio() unlocks the page */
82714 +       if ( !PageUptodate(page) ) {
82715 +               warning("green-2007", "I/O error while tried to read status page\n");
82716 +               __free_pages(page, 0);  /* the read is over: release page and bio instead of leaking them */
82717 +               bio_put(bio);
82718 +               return -EIO;
82719 +       }
82720 +
82721 +       statuspage = kmap_atomic(page, KM_USER0);
82722 +       if ( memcmp( statuspage->magic, REISER4_STATUS_MAGIC, sizeof(REISER4_STATUS_MAGIC)) ) {
82723 +               /* Magic does not match. */
82724 +               kunmap_atomic(statuspage, KM_USER0);    /* unmap by the mapped address, not by the page */
82725 +               warning("green-2008", "Wrong magic in status block\n");
82726 +               __free_pages(page, 0);
82727 +               bio_put(bio);
82728 +               return -EINVAL;
82729 +       }
82730 +       kunmap_atomic(statuspage, KM_USER0);
82731 +
82732 +       get_super_private(sb)->status_page = page;
82733 +       get_super_private(sb)->status_bio = bio;
82734 +       return 0;
82735 +}
82736 +
82737 +/* Query the status of fs. Returns one of REISER4_STATUS_MOUNT_* telling if the FS can be safely
82738 +   mounted (MOUNT_UNKNOWN if there is no status page). Also if "status" and "extended" parameters
82739 +   are given, it will fill actual parts of status from disk there. */
82740 +reiser4_internal int reiser4_status_query(u64 *status, u64 *extended)
82741 +{
82742 +       struct super_block *sb = reiser4_get_current_sb();
82743 +       struct reiser4_status *statuspage;
82744 +       int retval;
82745 +
82746 +       if ( !get_super_private(sb)->status_page ) { // No status page?
82747 +               return REISER4_STATUS_MOUNT_UNKNOWN;
82748 +       }
82749 +       statuspage = kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
82750 +       switch ( (long)d64tocpu(&statuspage->status) ) { // FIXME: this cast is a hack for 32 bit arches to work.
82751 +       case REISER4_STATUS_OK:
82752 +               retval = REISER4_STATUS_MOUNT_OK;
82753 +               break;
82754 +       case REISER4_STATUS_CORRUPTED:
82755 +               retval = REISER4_STATUS_MOUNT_WARN;
82756 +               break;
82757 +       case REISER4_STATUS_DAMAGED:
82758 +       case REISER4_STATUS_DESTROYED:
82759 +       case REISER4_STATUS_IOERROR:
82760 +               retval = REISER4_STATUS_MOUNT_RO;
82761 +               break;
82762 +       default:
82763 +               retval = REISER4_STATUS_MOUNT_UNKNOWN;
82764 +               break;
82765 +       }
82766 +
82767 +       if ( status )
82768 +               *status = d64tocpu(&statuspage->status);
82769 +       if ( extended )
82770 +               *extended = d64tocpu(&statuspage->extended_status);
82771 +
82772 +       kunmap_atomic(statuspage, KM_USER0);    /* unmap by the mapped address, not by the page */
82773 +       return retval;
82774 +}
82775 +
82776 +/* This function should be called when something bad happens (e.g. from reiser4_panic).
82777 +   It fills the status structure and tries to push it to disk. */
82778 +reiser4_internal int
82779 +reiser4_status_write(u64 status, u64 extended_status, char *message)
82780 +{
82781 +       struct super_block *sb = reiser4_get_current_sb();
82782 +       struct reiser4_status *statuspage;
82783 +       struct bio *bio = get_super_private(sb)->status_bio;
82784 +
82785 +       if ( !get_super_private(sb)->status_page ) { // No status page?
82786 +               return -1;
82787 +       }
82788 +       statuspage = kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
82789 +
82790 +       cputod64(status, &statuspage->status);
82791 +       cputod64(extended_status, &statuspage->extended_status);
82792 +       strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN);
82793 +
82794 +#ifdef CONFIG_FRAME_POINTER
82795 +#define GETFRAME(no)                                           \
82796 +       cputod64((unsigned long)__builtin_return_address(no),   \
82797 +                &statuspage->stacktrace[no])
82798 +
82799 +       GETFRAME(0);
82800 +       GETFRAME(1);
82801 +       GETFRAME(2);
82802 +       GETFRAME(3);
82803 +       GETFRAME(4);
82804 +       GETFRAME(5);
82805 +       GETFRAME(6);
82806 +       GETFRAME(7);
82807 +       GETFRAME(8);
82808 +       GETFRAME(9);
82809 +
82810 +#undef GETFRAME
82811 +#endif
82812 +       kunmap_atomic(get_super_private(sb)->status_page, KM_USER0);
82813 +       bio->bi_bdev = sb->s_bdev;
82814 +       bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page;
82815 +       bio->bi_io_vec[0].bv_len = sb->s_blocksize;
82816 +       bio->bi_io_vec[0].bv_offset = 0;
82817 +       bio->bi_vcnt = 1;
82818 +       bio->bi_size = sb->s_blocksize;
82819 +       bio->bi_end_io = reiser4_status_endio;
82820 +       lock_page(get_super_private(sb)->status_page); // Safe as nobody should touch our page.
82821 +       /* We can block now, but we have no other choice anyway */
82822 +       submit_bio(WRITE, bio);
82823 +       blk_run_address_space(get_super_fake(sb)->i_mapping);
82824 +       /*blk_run_queues();*/ // Now start the i/o.
82825 +       return 0; // We do not wait for io to finish.
82826 +}
82827 +
82828 +/* Frees the page with status and bio structure. Should be called by disk format at umount time */
82829 +reiser4_internal int reiser4_status_finish(void)
82830 +{
82831 +       struct super_block *sb = reiser4_get_current_sb();
82832 +       if (get_super_private(sb)->status_page == NULL)
82833 +               return 0;       /* no status page (e.g. reiser4_status_init() failed): nothing to release */
82834 +       __free_pages(get_super_private(sb)->status_page, 0);
82835 +       get_super_private(sb)->status_page = NULL;
82836 +       bio_put(get_super_private(sb)->status_bio);
82837 +       get_super_private(sb)->status_bio = NULL;
82838 +       return 0;
82839 +}
82840 diff -rupN linux-2.6.8-rc3/fs/reiser4/status_flags.h linux-2.6.8-rc3-a/fs/reiser4/status_flags.h
82841 --- linux-2.6.8-rc3/fs/reiser4/status_flags.h   1970-01-01 03:00:00.000000000 +0300
82842 +++ linux-2.6.8-rc3-a/fs/reiser4/status_flags.h 2004-08-05 21:20:53.095657898 +0400
82843 @@ -0,0 +1,43 @@
82844 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
82845 + * reiser4/README */
82846 +
82847 +/* Here we declare structures and flags that store reiser4 status on disk.
82848 +   The status helps us to find out whether the filesystem is valid or whether it
82849 +   contains some critical, or not so critical, errors */
82850 +
82851 +#if !defined( __REISER4_STATUS_FLAGS_H__ )
82852 +#define __REISER4_STATUS_FLAGS_H__
82853 +
82854 +#include "dformat.h"
82855 +/* These are major status flags */
82856 +#define REISER4_STATUS_OK 0
82857 +#define REISER4_STATUS_CORRUPTED 0x1
82858 +#define REISER4_STATUS_DAMAGED 0x2
82859 +#define REISER4_STATUS_DESTROYED 0x4
82860 +#define REISER4_STATUS_IOERROR 0x8
82861 +
82862 +/* Return values for reiser4_status_query() */
82863 +#define REISER4_STATUS_MOUNT_OK 0
82864 +#define REISER4_STATUS_MOUNT_WARN 1
82865 +#define REISER4_STATUS_MOUNT_RO 2
82866 +#define REISER4_STATUS_MOUNT_UNKNOWN -1
82867 +
82868 +#define REISER4_TEXTERROR_LEN 256
82869 +
82870 +#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
82871 +/* Layout of the status block. We probably need to keep its size under sector size which is 512 bytes */
82872 +struct reiser4_status {
82873 +       char magic[16];      /* REISER4_STATUS_MAGIC: 15 characters plus the terminating NUL */
82874 +       d64 status;   /* Current FS state: compared against the REISER4_STATUS_* values by reiser4_status_query() */
82875 +       d64 extended_status; /* Any additional info that makes sense in addition to "status". E.g.
82876 +                               last sector where io error happened if status is "io error encountered" */
82877 +       d64 stacktrace[10];  /* Return addresses of the last ten function calls (filled in under CONFIG_FRAME_POINTER) */
82878 +       char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if appropriate, otherwise filled with zeroes */
82879 +};
82880 +
82881 +int reiser4_status_init(reiser4_block_nr block);
82882 +int reiser4_status_query(u64 *status, u64 *extended);
82883 +int reiser4_status_write(u64 status, u64 extended_status, char *message);
82884 +int reiser4_status_finish(void);
82885 +
82886 +#endif
82887 diff -rupN linux-2.6.8-rc3/fs/reiser4/super.c linux-2.6.8-rc3-a/fs/reiser4/super.c
82888 --- linux-2.6.8-rc3/fs/reiser4/super.c  1970-01-01 03:00:00.000000000 +0300
82889 +++ linux-2.6.8-rc3-a/fs/reiser4/super.c        2004-08-05 21:20:52.848709986 +0400
82890 @@ -0,0 +1,552 @@
82891 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
82892 + * reiser4/README */
82893 +
82894 +/* Super-block manipulations. */
82895 +
82896 +#include "debug.h"
82897 +#include "dformat.h"
82898 +#include "key.h"
82899 +#include "plugin/security/perm.h"
82900 +#include "plugin/space/space_allocator.h"
82901 +#include "plugin/plugin.h"
82902 +#include "tree.h"
82903 +#include "vfs_ops.h"
82904 +#include "super.h"
82905 +#include "reiser4.h"
82906 +
82907 +#include <linux/types.h>       /* for __u??  */
82908 +#include <linux/fs.h>          /* for struct super_block  */
82909 +
82910 +/*const __u32 REISER4_SUPER_MAGIC = 0x52345362;*/      /* (*(__u32 *)"R4Sb"); */
82911 +
82912 +static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
82913 +static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
82914 +static __u64 reserved_for_root(const struct super_block *super);
82915 +
82916 +/* Hand back whatever is stored in ->s_fs_info of @super as the reiser4-specific part of the
82917 +   super block. */
82918 +reiser4_internal reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super)
82919 +{
82920 +       /* plain cast: as the name says, nothing is checked here */
82921 +       return (reiser4_super_info_data *) super->s_fs_info;
82922 +}
82923 +
82924 +
82925 +/* File system type reported by statfs() in the ->f_type field: this is always
82926 +   REISER4_SUPER_MAGIC, @super is only looked at by the assertions. */
82927 +reiser4_internal long statfs_type(const struct super_block *super UNUSED_ARG)
82928 +{
82929 +       assert("nikita-448", super != NULL);
82930 +       assert("nikita-449", is_reiser4_super(super));
82931 +
82932 +       return (long) REISER4_SUPER_MAGIC;
82933 +}
82934 +
82935 +/* Block size (->s_blocksize) of the file system corresponding to @super. */
82936 +reiser4_internal int reiser4_blksize(const struct super_block *super)
82937 +{
82938 +       assert("nikita-450", super != NULL);
82939 +       assert("nikita-451", is_reiser4_super(super));
82940 +       /* FIXME-VS: blocksize has to be 512, 1024, 2048, etc */
82941 +       assert("zam-391", super->s_blocksize > 0);
82942 +
82943 +       return super->s_blocksize;
82944 +}
82945 +
82946 +/* functions to read/modify fields of reiser4_super_info_data */
82947 +
82948 +/* Getter for the ->block_count field: number of blocks in the file system
82949 +   at @super. */
82950 +reiser4_internal __u64 reiser4_block_count(const struct super_block * super)
82951 +{
82952 +       assert("vs-494", super != NULL);
82953 +       assert("vs-495", is_reiser4_super(super));
82954 +
82955 +       return get_super_private(super)->block_count;
82956 +}
82957 +
82958 +/* number of blocks in the current file system */
82959 +reiser4_internal __u64
82960 +reiser4_current_block_count(void)
82961 +{
82962 +       /* takes no super block: ->block_count is read via get_current_super_private() */
82963 +       return get_current_super_private()->block_count;
82964 +}
82965 +
82966 +
82967 +/* set number of blocks in filesystem and derive the reserved block counter from it */
82968 +reiser4_internal void reiser4_set_block_count(const struct super_block *super, __u64 nr)
82969 +{
82970 +       reiser4_super_info_data *sbinfo;
82971 +
82972 +       assert("vs-501", super != NULL);
82973 +       assert("vs-502", is_reiser4_super(super));
82974 +       sbinfo = get_super_private(super);
82975 +       sbinfo->block_count = nr;
82976 +       /* Reserve roughly 5% of the device. A proper calculation needs a 64 bit division, which is
82977 +          missing in Linux on the i386 platform. Precision does not matter here, so the division
82978 +          is replaced by a multiplication and a shift: 51 / 2^10 == 0.0498. */
82979 +       sbinfo->blocks_reserved = ((nr * 51) >> 10);
82980 +}
82981 +
82982 +/* Getter for the ->blocks_used counter: amount of blocks used (allocated for data)
82983 +   in the file system at @super. */
82984 +reiser4_internal __u64 reiser4_data_blocks(const struct super_block *super)
82985 +{
82986 +       assert("nikita-452", super != NULL);
82987 +       assert("nikita-453", is_reiser4_super(super));
82988 +
82989 +       return get_super_private(super)->blocks_used;
82990 +}
82991 +
82992 +/* Setter for the ->blocks_used counter: number of blocks used in the file system at @super. */
82993 +reiser4_internal void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
82994 +{
82995 +       assert("vs-503", super != NULL);
82996 +       assert("vs-504", is_reiser4_super(super));
82997 +
82998 +       get_super_private(super)->blocks_used = nr;
82999 +}
83000 +
83001 +/* Getter for the ->blocks_free counter: amount of free blocks in the file system
83002 +   at @super. */
83003 +reiser4_internal __u64 reiser4_free_blocks(const struct super_block *super)
83004 +{
83005 +       assert("nikita-454", super != NULL);
83006 +       assert("nikita-455", is_reiser4_super(super));
83007 +
83008 +       return get_super_private(super)->blocks_free;
83009 +}
83010 +
83011 +/* Setter for the ->blocks_free counter: number of free blocks in the file system at @super. */
83012 +reiser4_internal void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
83013 +{
83014 +       assert("vs-505", super != NULL);
83015 +       assert("vs-506", is_reiser4_super(super));
83016 +
83017 +       get_super_private(super)->blocks_free = nr;
83018 +}
83019 +
83020 +/* Bump the ->blocks_free counter of @super by one; it has to be below the block count beforehand. */
83021 +reiser4_internal void reiser4_inc_free_blocks(const struct super_block *super)
83022 +{
83023 +       assert("vs-496", reiser4_free_blocks(super) < reiser4_block_count(super));
83024 +
83025 +       get_super_private(super)->blocks_free += 1;
83026 +}
83027 +
83028 +/* Getter for the ->mkfs_id field: mkfs unique identifier of the file system
83029 +   at @super. */
83030 +reiser4_internal __u32 reiser4_mkfs_id(const struct super_block *super)
83031 +{
83032 +       assert("vpf-221", super != NULL);
83033 +       assert("vpf-222", is_reiser4_super(super));
83034 +
83035 +       return get_super_private(super)->mkfs_id;
83036 +}
83037 +
83038 +/* Setter for the ->mkfs_id field: mkfs unique identifier of the file system at @super. */
83039 +reiser4_internal void reiser4_set_mkfs_id(const struct super_block *super, __u32 id)
83040 +{
83041 +       assert("vpf-223", super != NULL);
83042 +       assert("vpf-224", is_reiser4_super(super));
83043 +
83044 +       get_super_private(super)->mkfs_id = id;
83045 +}
83046 +
83047 +/* Getter for the ->blocks_free_committed counter of the file system at @super. */
83048 +reiser4_internal __u64 reiser4_free_committed_blocks(const struct super_block *super)
83049 +{
83050 +       assert("vs-497", super != NULL);
83051 +       assert("vs-498", is_reiser4_super(super));
83052 +
83053 +       return get_super_private(super)->blocks_free_committed;
83054 +}
83055 +
83056 +/* Setter for the ->blocks_free_committed counter. This is only used once, at mount time, to set
83057 +   the number of free blocks in the filesystem. */
83058 +reiser4_internal void reiser4_set_free_committed_blocks(const struct super_block *super, __u64 nr)
83059 +{
83060 +       assert("vs-507", super != NULL);
83061 +       assert("vs-508", is_reiser4_super(super));
83062 +
83063 +       get_super_private(super)->blocks_free_committed = nr;
83064 +}
83065 +
83066 +/* amount of blocks in the file system at @super reserved for @uid and @gid: the sum of the
83067 +   per-group, per-user and (for uid 0 only) super user reservations, each of which is counted
83068 +   only when the corresponding REISER4_SUPPORT_*_SPACE_RESERVATION switch is on */
83069 +reiser4_internal long
83070 +reiser4_reserved_blocks(const struct super_block *super, uid_t uid, gid_t gid)
83071 +{
83072 +       long total = 0;
83073 +
83074 +       assert("nikita-456", super != NULL);
83075 +       assert("nikita-457", is_reiser4_super(super));
83076 +
83077 +       if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
83078 +               total += reserved_for_gid(super, gid);
83079 +       if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
83080 +               total += reserved_for_uid(super, uid);
83081 +       if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
83082 +               total += reserved_for_root(super);
83083 +
83084 +       return total;
83085 +}
83086 +
83087 +/* Getter for the grabbed blocks counter (->blocks_grabbed) of the file system at @super. */
83088 +reiser4_internal __u64
83089 +reiser4_grabbed_blocks(const struct super_block * super)
83090 +{
83091 +       assert("zam-512", super != NULL);
83092 +       assert("zam-513", is_reiser4_super(super));
83093 +       return get_super_private(super)->blocks_grabbed;
83094 +}
83095 +
83096 +/* Setter for the grabbed blocks counter (->blocks_grabbed) of the file system at @super. */
83097 +reiser4_internal void reiser4_set_grabbed_blocks(const struct super_block *super, __u64 nr)
83098 +{
83099 +       assert("zam-514", super != NULL);
83100 +       assert("zam-515", is_reiser4_super(super));
83101 +
83102 +       get_super_private(super)->blocks_grabbed = nr;
83103 +}
83104 +
83105 +/* Getter for the ->blocks_flush_reserved counter of the file system at @super. */
83106 +reiser4_internal __u64 flush_reserved(const struct super_block *super)
83107 +{
83108 +       assert("vpf-285", super != NULL);
83109 +       assert("vpf-286", is_reiser4_super(super));
83110 +       return get_super_private(super)->blocks_flush_reserved;
83111 +}
83112 +
83113 +/* Setter for the ->blocks_flush_reserved counter of the file system at @super. */
83114 +reiser4_internal void set_flush_reserved(const struct super_block *super, __u64 nr)
83115 +{
83116 +       assert("vpf-282", super != NULL);
83117 +       assert("vpf-283", is_reiser4_super(super));
83118 +
83119 +       get_super_private(super)->blocks_flush_reserved = nr;
83120 +}
83121 +
83122 +/* Getter for the counter of fake allocated formatted blocks (->blocks_fake_allocated). */
83123 +reiser4_internal __u64
83124 +reiser4_fake_allocated(const struct super_block *super)
83125 +{
83126 +       assert("zam-516", super != NULL);
83127 +       assert("zam-517", is_reiser4_super(super));
83128 +       return get_super_private(super)->blocks_fake_allocated;
83129 +}
83130 +
83131 +/* Setter for the counter of fake allocated formatted blocks (->blocks_fake_allocated). */
83132 +reiser4_internal void reiser4_set_fake_allocated(const struct super_block *super, __u64 nr)
83133 +{
83134 +       assert("zam-518", super != NULL);
83135 +       assert("zam-519", is_reiser4_super(super));
83136 +
83137 +       get_super_private(super)->blocks_fake_allocated = nr;
83138 +}
83139 +
83140 +/* Getter for the counter of fake allocated unformatted blocks
83141 +   (->blocks_fake_allocated_unformatted) of the file system at @super. */
83142 +reiser4_internal __u64 reiser4_fake_allocated_unformatted(const struct super_block *super)
83143 +{
83144 +       assert("zam-516", super != NULL);
83145 +       assert("zam-517", is_reiser4_super(super));
83146 +
83147 +       return get_super_private(super)->blocks_fake_allocated_unformatted;
83148 +}
83149 +
83150 +/* Setter for the counter of fake allocated unformatted blocks of the file system at @super. */
83151 +reiser4_internal void reiser4_set_fake_allocated_unformatted(const struct super_block *super, __u64 nr)
83152 +{
83153 +       assert("zam-518", super != NULL);
83154 +       assert("zam-519", is_reiser4_super(super));
83155 +
83156 +       get_super_private(super)->blocks_fake_allocated_unformatted = nr;
83157 +}
83158 +
83159 +
83160 +/* Getter for the counter of clustered blocks (->blocks_clustered) of the file system at @super. */
83161 +reiser4_internal __u64
83162 +reiser4_clustered_blocks(const struct super_block *super)
83163 +{
83164 +       assert("edward-601", super != NULL);
83165 +       assert("edward-602", is_reiser4_super(super));
83166 +       return get_super_private(super)->blocks_clustered;
83167 +}
83168 +
83169 +/* Setter for the counter of clustered blocks (->blocks_clustered) of the file system at @super. */
83170 +reiser4_internal void reiser4_set_clustered_blocks(const struct super_block *super, __u64 nr)
83171 +{
83172 +       assert("edward-603", super != NULL);
83173 +       assert("edward-604", is_reiser4_super(super));
83174 +
83175 +       get_super_private(super)->blocks_clustered = nr;
83176 +}
83177 +
83178 +/* Address of the space allocator embedded into the reiser4 private data of @super. */
83179 +reiser4_internal reiser4_space_allocator *get_space_allocator(const struct super_block * super)
83180 +{
83181 +       assert("nikita-1965", super != NULL);
83182 +       assert("nikita-1966", is_reiser4_super(super));
83183 +
83184 +       return &get_super_private(super)->space_allocator;
83185 +}
83186 +
83187 +/* Getter for ->fake: the fake inode used to bind formatted nodes in the page cache
83188 +   of the file system at @super. */
83189 +reiser4_internal struct inode *get_super_fake(const struct super_block *super)
83190 +{
83191 +       assert("nikita-1757", super != NULL);
83192 +
83193 +       return get_super_private(super)->fake;
83194 +}
83195 +
83196 +/* Getter for ->cc: the fake inode used to bind copied on capture nodes in the page cache
83197 +   of the file system at @super. */
83198 +reiser4_internal struct inode *get_cc_fake(const struct super_block *super)
83199 +{
83200 +       assert("nikita-1757", super != NULL);
83201 +
83202 +       return get_super_private(super)->cc;
83203 +}
83204 +
83205 +/* Address of the tree embedded into the reiser4 private data of the file system
83206 +   at @super. */
83207 +reiser4_internal reiser4_tree *get_tree(const struct super_block * super)
83208 +{
83209 +       assert("nikita-460", super != NULL);
83210 +       assert("nikita-461", is_reiser4_super(super));
83211 +
83212 +       return &get_super_private(super)->tree;
83213 +}
83214 +
83215 +/* Check that @super is (looks like) reiser4 super block: it is not NULL, has private data and its
83216 +   ->s_op points at the super operations kept in that private data. Mainly for use in assertions. */
83217 +reiser4_internal int is_reiser4_super(const struct super_block *super)
83218 +{
83219 +       if (super == NULL)
83220 +               return 0;
83221 +       if (get_super_private(super) == NULL)
83222 +               return 0;
83223 +
83224 +       return super->s_op == &get_super_private(super)->ops.super;
83225 +}
83226 +
83227 +/* Test bit @f in the ->fs_flags word of the reiser4 private data of @super. */
83228 +reiser4_internal int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
83229 +{
83230 +       return test_bit((int) f, &get_super_private(super)->fs_flags);
83231 +}
83232 +
83233 +/* amount of blocks reserved for group @gid in the file system at @super;
83234 +   neither argument is used */
83235 +static __u64
83236 +reserved_for_gid(const struct super_block *super UNUSED_ARG,
83237 +                gid_t gid UNUSED_ARG)
83238 +{
83239 +       /* placeholder: nothing is reserved */
83240 +       return 0;
83241 +}
83242 +
83243 +/* amount of blocks reserved for user @uid in the file system at @super;
83244 +   neither argument is used */
83245 +static __u64
83246 +reserved_for_uid(const struct super_block *super UNUSED_ARG,
83247 +                uid_t uid UNUSED_ARG)
83248 +{
83249 +       /* placeholder: nothing is reserved */
83250 +       return 0;
83251 +}
83252 +
83253 +/* amount of blocks reserved for super user in the file system at @super;
83254 +   the argument is not used */
83255 +static __u64
83256 +reserved_for_root(const struct super_block *super UNUSED_ARG)
83257 +{
83258 +       /* placeholder: nothing is reserved */
83259 +       return 0;
83260 +}
83261 +
83262 +/*
83263 + * Check whether block number @blk makes sense for the file system at @super: fake block numbers are
83264 + * always accepted, any other one has to be below the block count of the file system.
83265 + */
83266 +reiser4_internal int
83267 +reiser4_blocknr_is_sane_for(const struct super_block *super, const reiser4_block_nr *blk)
83268 +{
83269 +       int sane;
83270 +
83271 +       assert("nikita-2957", super != NULL);
83272 +       assert("nikita-2958", blk != NULL);
83273 +
83274 +       if (!blocknr_is_fake(blk))
83275 +               sane = (*blk < get_super_private(super)->block_count);
83276 +       else
83277 +               sane = 1;
83278 +       return sane;
83279 +}
83280 +
83281 +/* true, if block number @blk makes sense for the current file system (see reiser4_blocknr_is_sane_for()) */
83282 +reiser4_internal int
83283 +reiser4_blocknr_is_sane(const reiser4_block_nr *blk)
83284 +{
83285 +       struct super_block *current_sb = reiser4_get_current_sb();
83286 +
83287 +       return reiser4_blocknr_is_sane_for(current_sb, blk);
83288 +}
83289 +
83290 +/*
83291 + * Build the per-super-block copies of the various VFS related operation vectors
83292 + * that are embedded into @ops, and make @super use the super and export vectors
83293 + * stored there.
83294 + */
83295 +reiser4_internal void
83296 +build_object_ops(struct super_block *super, object_ops *ops)
83297 +{
83298 +       struct inode_operations *regular;
83299 +
83300 +       assert("nikita-3248", super != NULL);
83301 +       assert("nikita-3249", ops   != NULL);
83302 +       regular = &ops->regular;
83303 +
83304 +       /* super_operations and export operations for NFS are copied into @ops... */
83305 +       ops->super  = reiser4_super_operations;
83306 +       ops->export = reiser4_export_operations;
83307 +
83308 +       /* ...and the super block is pointed at these per-super-block copies */
83309 +       super->s_op        = &ops->super;
83310 +       super->s_export_op = &ops->export;
83311 +
83312 +       /* inode operations of regular files: the generic vector with the XATTR related
83313 +        * methods cleaned up---we don't support Linux xattr API... */
83314 +       *regular = reiser4_inode_operations;
83315 +       regular->setxattr = NULL;
83316 +       regular->getxattr = NULL;
83317 +       regular->listxattr = NULL;
83318 +       regular->removexattr = NULL;
83319 +
83320 +       /* ...->clear_inode would not be needed either, because its only user was
83321 +        * xattrs; ops->super.clear_inode is nevertheless left as it is */
83322 +
83323 +       /* directories get a copy of the vector just built (before ->lookup is touched below) */
83324 +       ops->dir     = *regular;
83325 +
83326 +       /* the remaining vectors are taken verbatim */
83327 +       ops->file    = reiser4_file_operations;
83328 +       ops->symlink = reiser4_symlink_inode_operations;
83329 +       ops->special = reiser4_special_inode_operations;
83330 +       ops->dentry  = reiser4_dentry_operations;
83331 +       ops->as      = reiser4_as_operations;
83332 +
83333 +       if (!reiser4_is_set(super, REISER4_NO_PSEUDO))
83334 +               return;
83335 +
83336 +       /* if we don't support pseudo files, we need neither ->open,
83337 +        * nor ->lookup on regular files */
83338 +       regular->lookup = NULL;
83339 +       ops->file.open = NULL;
83340 +
83341 +}
83342 +
83343 +#if REISER4_DEBUG_OUTPUT
83344 +/*
83345 + * debugging function: print, via printk(), human readable information about the parameters of
83346 + * file system @s under a banner containing @prefix. Only built when REISER4_DEBUG_OUTPUT is on.
83347 + */
83348 +reiser4_internal void
83349 +print_fs_info(const char *prefix, const struct super_block *s)
83350 +{
83351 +       reiser4_super_info_data *sbinfo;
83352 +
83353 +       sbinfo = get_super_private(s);
83354 +       /* tree location and space allocator state */
83355 +       printk("================ fs info (%s) =================\n", prefix);
83356 +       printk("root block: %lli\ntree height: %i\n", sbinfo->tree.root_block, sbinfo->tree.height);
83357 +       sa_print_info("", get_space_allocator(s));
83358 +       /* object id and block counters, then the root directory key supplied by the disk format plugin */
83359 +       printk("Oids: next to use %llu, in use %llu\n", sbinfo->next_to_use, sbinfo->oids_in_use);
83360 +       printk("Block counters:\n\tblock count\t%llu\n\tfree blocks\t%llu\n"
83361 +              "\tused blocks\t%llu\n\tgrabbed\t%llu\n\tfake allocated formatted\t%llu\n"
83362 +              "\tfake allocated unformatted\t%llu\n",
83363 +              reiser4_block_count(s), reiser4_free_blocks(s),
83364 +              reiser4_data_blocks(s), reiser4_grabbed_blocks(s),
83365 +              reiser4_fake_allocated(s), reiser4_fake_allocated_unformatted(s));
83366 +       print_key("Root directory key", sbinfo->df_plug->root_dir_key(s));
83367 +       /* a zero ->diskmap_block is reported as "no diskmap" */
83368 +       if ( sbinfo->diskmap_block)
83369 +               printk("Diskmap is present in %llu block\n", sbinfo->diskmap_block);
83370 +       else
83371 +               printk("Diskmap is not present\n");
83372 +       /* ->print_info is optional: let the disk format plugin add its own details if it has the method */
83373 +       if (sbinfo->df_plug->print_info) {
83374 +               printk("=========== disk format info (%s) =============\n", sbinfo->df_plug->h.label);
83375 +               sbinfo->df_plug->print_info(s);
83376 +       }
83377 +
83378 +}
83379 +#endif
83380 +
83381 +
83382 +#if REISER4_DEBUG
83383 +
83384 +/* this is called when unallocated extent pointer is added: increments ->unalloc_extent_pointers
83385 +   of the current super block between reiser4_spin_lock_sb() and reiser4_spin_unlock_sb() */
83386 +void inc_unalloc_unfm_ptr(void)
83387 +{
83388 +       reiser4_super_info_data *sbinfo =
83389 +               get_super_private(get_current_context()->super);
83390 +
83391 +       reiser4_spin_lock_sb(sbinfo);
83392 +       sbinfo->unalloc_extent_pointers += 1;
83393 +       reiser4_spin_unlock_sb(sbinfo);
83394 +}
83395 +
83396 +/* this is called when unallocated extent is converted to allocated: takes @nr off
83397 +   ->unalloc_extent_pointers of the current super block, BUG()ing if the counter is smaller than @nr */
83398 +void dec_unalloc_unfm_ptrs(int nr)
83399 +{
83400 +       reiser4_super_info_data *sbinfo =
83401 +               get_super_private(get_current_context()->super);
83402 +
83403 +       reiser4_spin_lock_sb(sbinfo);
83404 +       BUG_ON(sbinfo->unalloc_extent_pointers < nr);
83405 +       sbinfo->unalloc_extent_pointers -= nr;
83406 +       reiser4_spin_unlock_sb(sbinfo);
83407 +}
83408 +
83409 +/* increments the ->eflushed_unformatted counter of the current super block */
83410 +void inc_unfm_ef(void)
83411 +{
83412 +       reiser4_super_info_data *sbinfo =
83413 +               get_super_private(get_current_context()->super);
83414 +
83415 +       reiser4_spin_lock_sb(sbinfo);
83416 +       sbinfo->eflushed_unformatted += 1;
83417 +       reiser4_spin_unlock_sb(sbinfo);
83418 +}
83419 +
83420 +/* decrements the ->eflushed_unformatted counter of the current super block; BUG()s if it is zero */
83421 +void dec_unfm_ef(void)
83422 +{
83423 +       reiser4_super_info_data *sbinfo =
83424 +               get_super_private(get_current_context()->super);
83425 +
83426 +       reiser4_spin_lock_sb(sbinfo);
83427 +       BUG_ON(sbinfo->eflushed_unformatted == 0);
83428 +       sbinfo->eflushed_unformatted -= 1;
83429 +       reiser4_spin_unlock_sb(sbinfo);
83430 +}
83431 +
83432 +#endif
83433 +
83434 +/* Make Linus happy.
83435 +   Local variables:
83436 +   c-indentation-style: "K&R"
83437 +   mode-name: "LC"
83438 +   c-basic-offset: 8
83439 +   tab-width: 8
83440 +   fill-column: 120
83441 +   End:
83442 +*/
83443 diff -rupN linux-2.6.8-rc3/fs/reiser4/super.h linux-2.6.8-rc3-a/fs/reiser4/super.h
83444 --- linux-2.6.8-rc3/fs/reiser4/super.h  1970-01-01 03:00:00.000000000 +0300
83445 +++ linux-2.6.8-rc3-a/fs/reiser4/super.h        2004-08-05 21:20:52.909697122 +0400
83446 @@ -0,0 +1,591 @@
83447 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
83448 + * reiser4/README */
83449 +
83450 +/* Super-block functions. See super.c for details. */
83451 +
83452 +#if !defined( __REISER4_SUPER_H__ )
83453 +#define __REISER4_SUPER_H__
83454 +
83455 +#include "forward.h"
83456 +#include "debug.h"
83457 +#include "tree.h"
83458 +#include "context.h"
83459 +#include "log.h"
83460 +#include "lnode.h"
83461 +#include "entd.h"
83462 +#include "plugin/plugin.h"
83463 +#include "prof.h"
83464 +#include "wander.h"
83465 +
83466 +#include "plugin/space/space_allocator.h"
83467 +
83468 +#include "plugin/disk_format/disk_format40.h"
83469 +#include "plugin/security/perm.h"
83470 +#include "plugin/dir/dir.h"
83471 +
83472 +#include "emergency_flush.h"
83473 +
83474 +#include <linux/spinlock.h>
83475 +#include <linux/types.h>       /* for __u??, etc.  */
83476 +#include <linux/fs.h>          /* for struct super_block, etc.  */
83477 +#include <linux/list.h>                /* for struct list_head */
83478 +#include <linux/kobject.h>      /* for kobject */
83479 +
83480 +/*
83481 + * Flush algorithm parameters.
83482 + */
83483 +typedef struct {
83484 +       unsigned relocate_threshold;
83485 +       unsigned relocate_distance;
83486 +       unsigned written_threshold;
83487 +       unsigned scan_maxnodes;
83488 +} flush_params;
83489 +
83490 +typedef enum {
83491 +       /* True if this file system doesn't support hard-links (multiple
83492 +          names) for directories: this is default UNIX behavior.
83493 +
83494 +          If hard-links on directories are not allowed, file system is
83495 +          Acyclic Directed Graph (modulo dot, and dotdot, of course).
83496 +
83497 +          This is used by reiser4_link().
83498 +       */
83499 +       REISER4_ADG = 0,
83500 +       /* set if all nodes in internal tree have the same node layout plugin.
83501 +          If so, znode_guess_plugin() will return tree->node_plugin instead
83502 +          of guessing plugin by plugin id stored in the node.
83503 +       */
83504 +       REISER4_ONE_NODE_PLUGIN = 1,
83505 +       /* if set, bsd gid assignment is supported. */
83506 +       REISER4_BSD_GID = 2,
83507 +       /* [mac]_time are 32 bit in inode */
83508 +       REISER4_32_BIT_TIMES = 3,
83509 +       /* allow concurrent flushes */
83510 +       REISER4_MTFLUSH = 4,
83511 +       /* disable support for pseudo files. Don't treat regular files as
83512 +        * directories. */
83513 +       REISER4_NO_PSEUDO = 5,
83514 +       /* bitmap blocks at mount time; NOTE(review): name implies they are NOT all loaded -- confirm */
83515 +       REISER4_DONT_LOAD_BITMAP = 6
83516 +} reiser4_fs_flag;
83517 +
83518 +#if REISER4_STATS
83519 +
83520 +/*
83521 + * Object to export per-level stat-counters through sysfs. See stats.[ch] and
83522 + * kattr.[ch]
83523 + */
83524 +typedef struct reiser4_level_stats_kobj {
83525 +       struct fs_kobject kobj;   /* file system kobject exported for each
83526 +                                  * level */
83527 +       int level;                /* tree level */
83528 +} reiser4_level_stats_kobj;
83529 +
83530 +#endif
83531 +
83532 +/*
83533 + * VFS related operation vectors.
83534 + *
83535 + * Usually file system has one instance of those, but in reiser4 we sometimes
83536 + * want to be able to modify vectors on per-mount basis. For example, reiser4
83537 + * needs ->open method to handle pseudo files correctly, but if file system is
83538 + * mounted with "nopseudo" mount option, it's better to have ->open set to
83539 + * NULL, as this makes sys_open() a little bit more efficient.
83540 + *
83541 + */
83542 +typedef struct object_ops {
83543 +       struct super_operations         super;
83544 +       struct file_operations          file;
83545 +       struct dentry_operations        dentry;
83546 +       struct address_space_operations as;
83547 +
83548 +       struct inode_operations         regular;
83549 +       struct inode_operations         dir;
83550 +       struct inode_operations         symlink;
83551 +       struct inode_operations         special;
83552 +
83553 +       struct export_operations        export;
83554 +} object_ops;
83555 +
83556 +/* reiser4-specific part of super block
83557 +
83558 +   Locking
83559 +
83560 +   Fields immutable after mount:
83561 +
83562 +    ->oid*
83563 +    ->space*
83564 +    ->default_[ug]id
83565 +    ->mkfs_id
83566 +    ->trace_flags
83567 +    ->debug_flags
83568 +    ->fs_flags
83569 +    ->df_plug
83570 +    ->optimal_io_size
83571 +    ->plug
83572 +    ->flush
83573 +    ->u (bad name)
83574 +    ->txnmgr
83575 +    ->ra_params
83576 +    ->fsuid
83577 +    ->journal_header
83578 +    ->journal_footer
83579 +
83580 +   Fields protected by ->lnode_guard
83581 +
83582 +    ->lnode_htable
83583 +
83584 +   Fields protected by per-super block spin lock
83585 +
83586 +    ->block_count
83587 +    ->blocks_used
83588 +    ->blocks_free
83589 +    ->blocks_free_committed
83590 +    ->blocks_grabbed
83591 +    ->blocks_fake_allocated_unformatted
83592 +    ->blocks_fake_allocated
83593 +    ->blocks_flush_reserved
83594 +    ->eflushed
83595 +    ->blocknr_hint_default
83596 +
83597 +   After journal replaying during mount,
83598 +
83599 +    ->last_committed_tx
83600 +
83601 +   is protected by ->tmgr.commit_semaphore
83602 +
83603 +   Invariants involving this data-type:
83604 +
83605 +      [sb-block-counts]
83606 +      [sb-grabbed]
83607 +      [sb-fake-allocated]
83608 +*/
83609 +struct reiser4_super_info_data {
83610 +       /* guard spinlock which protects reiser4 super
83611 +          block fields (currently blocks_free,
83612 +          blocks_free_committed)
83613 +       */
83614 +       reiser4_spin_data guard;
83615 +
83616 +       /*
83617 +        * object id manager
83618 +        */
83619 +       /* next oid that will be returned by oid_allocate() */
83620 +       oid_t next_to_use;
83621 +       /* total number of used oids */
83622 +       oid_t oids_in_use;
83623 +
83624 +       /* space manager plugin */
83625 +       reiser4_space_allocator space_allocator;
83626 +
83627 +       /* reiser4 internal tree */
83628 +       reiser4_tree tree;
83629 +
83630 +       /* default user id used for light-weight files without their own
83631 +          stat-data. */
83632 +       uid_t default_uid;
83633 +
83634 +       /* default group id used for light-weight files without their own
83635 +          stat-data. */
83636 +       gid_t default_gid;
83637 +
83638 +       /* mkfs identifier generated at mkfs time. */
83639 +       __u32 mkfs_id;
83640 +       /* amount of blocks in a file system */
83641 +       __u64 block_count;
83642 +
83643 +       /* inviolable reserve */
83644 +       __u64 blocks_reserved;
83645 +
83646 +       /* amount of blocks used by file system data and meta-data. */
83647 +       __u64 blocks_used;
83648 +
83649 +       /* amount of free blocks. This is "working" free blocks counter. It is
83650 +          like "working" bitmap, please see block_alloc.c for description. */
83651 +       __u64 blocks_free;
83652 +
83653 +       /* free block count for fs committed state. This is "commit" version
83654 +          of free block counter. */
83655 +       __u64 blocks_free_committed;
83656 +
83657 +       /* number of blocks reserved for further allocation, for all threads. */
83658 +       __u64 blocks_grabbed;
83659 +
83660 +       /* number of fake allocated unformatted blocks in tree. */
83661 +       __u64 blocks_fake_allocated_unformatted;
83662 +
83663 +       /* number of fake allocated formatted blocks in tree. */
83664 +       __u64 blocks_fake_allocated;
83665 +
83666 +       /* number of blocks reserved for flush operations. */
83667 +       __u64 blocks_flush_reserved;
83668 +
83669 +        /* number of blocks reserved for cluster operations. */
83670 +       __u64 blocks_clustered;
83671 +
83672 +       /* unique file-system identifier */
83673 +       /* does this conform to Andreas Dilger UUID stuff? */
83674 +       __u32 fsuid;
83675 +
83676 +       /* per-fs tracing flags. Use reiser4_trace_flags enum to set
83677 +          bits in it. */
83678 +       __u32 trace_flags;
83679 +
83680 +       /* per-fs log flags. Use reiser4_log_flags enum to set
83681 +          bits in it. */
83682 +       __u32 log_flags;
83683 +       __u32 oid_to_log;
83684 +
83685 +       /* file where tracing goes (if enabled). */
83686 +       reiser4_log_file log_file;
83687 +
83688 +       /* per-fs debugging flags. This is bitmask populated from
83689 +          reiser4_debug_flags enum. */
83690 +       __u32 debug_flags;
83691 +
83692 +       /* super block flags */
83693 +
83694 +       /* file-system wide flags. See reiser4_fs_flag enum */
83695 +       unsigned long fs_flags;
83696 +
83697 +       /* transaction manager */
83698 +       txn_mgr tmgr;
83699 +
83700 +       /* ent thread */
83701 +       entd_context entd;
83702 +
83703 +       /* fake inode used to bind formatted nodes */
83704 +       struct inode *fake;
83705 +       /* inode used to bind bitmaps (and journal heads) */
83706 +       struct inode *bitmap;
83707 +       /* inode used to bind copied on capture nodes */
83708 +       struct inode *cc;
83709 +
83710 +       /* disk layout plugin */
83711 +       disk_format_plugin *df_plug;
83712 +
83713 +       /* disk layout specific part of reiser4 super info data */
83714 +       union {
83715 +               format40_super_info format40;
83716 +       } u;
83717 +
83718 +       /*
83719 +        * value we return in st_blksize on stat(2).
83720 +        */
83721 +       unsigned long optimal_io_size;
83722 +
83723 +       /* parameters for the flush algorithm */
83724 +       flush_params flush;
83725 +
83726 +       /* see emergency_flush.c for details */
83727 +       reiser4_spin_data eflush_guard;
83728 +       /* number of emergency flushed nodes */
83729 +       int               eflushed;
83730 +#if REISER4_USE_EFLUSH
83731 +       /* hash table used by emergency flush. Protected by ->eflush_guard */
83732 +       ef_hash_table     efhash_table;
83733 +#endif
83734 +       /* pointers to jnodes for journal header and footer */
83735 +       jnode *journal_header;
83736 +       jnode *journal_footer;
83737 +
83738 +       journal_location jloc;
83739 +
83740 +       /* head block number of last committed transaction */
83741 +       __u64 last_committed_tx;
83742 +
83743 +       /* we remember last written location for using as a hint for
83744 +          new block allocation */
83745 +       __u64 blocknr_hint_default;
83746 +
83747 +       /* committed number of files (oid allocator state variable ) */
83748 +       __u64 nr_files_committed;
83749 +
83750 +       ra_params_t ra_params;
83751 +
83752 +       /* A semaphore for serializing cut tree operations when out of
83753 +          free space: only one cut_tree thread at a time is allowed to grab
83754 +          space from the reserved area (it is 5% of disk space) */
83755 +       struct semaphore delete_sema;
83756 +       /* task owning ->delete_sema */
83757 +       struct task_struct *delete_sema_owner;
83758 +
83759 +       /* serialize semaphore */
83760 +       struct semaphore flush_sema;
83761 +
83762 +       /* Diskmap's blocknumber */
83763 +       __u64 diskmap_block;
83764 +
83765 +       /* What to do in case of error */
83766 +       int onerror;
83767 +
83768 +       /* operations for objects on this file system */
83769 +       object_ops ops;
83770 +
83771 +       /* dir_cursor_info see plugin/dir/dir.[ch] for more details */
83772 +       d_cursor_info d_info;
83773 +
83774 +#if REISER4_USE_SYSFS
83775 +       /* kobject representing this file system. It is visible as
83776 +        * /sys/fs/reiser4/<devname>. All other kobjects for this file system
83777 +        * (statistical counters, tunables, etc.) are below it in sysfs
83778 +        * hierarchy. */
83779 +       struct fs_kobject kobj;
83780 +#endif
83781 +#if REISER4_STATS
83782 +       /* Statistical counters. reiser4_stat is empty data-type unless
83783 +          REISER4_STATS is set. See stats.[ch] for details. */
83784 +       reiser4_stat *stats;
83785 +       /* kobject for statistical counters. Visible as
83786 +        * /sys/fs/reiser4/<devname>/stats */
83787 +       struct fs_kobject stats_kobj;
83788 +       /* kobjects for per-level statistical counters. Each level is visible
83789 +          as /sys/fs/reiser4/<devname>/stats-NN */
83790 +       reiser4_level_stats_kobj level[REISER4_MAX_ZTREE_HEIGHT];
83791 +#endif
83792 +#ifdef CONFIG_REISER4_BADBLOCKS
83793 +       /* Alternative master superblock offset (in bytes) */
83794 +       unsigned long altsuper;
83795 +#endif
83796 +#if REISER4_LOG
83797 +       /* last disk block that this file system performed IO against. Used
83798 +        * by tree tracing code to track seeks. */
83799 +       reiser4_block_nr last_touched;
83800 +#endif
83801 +#if REISER4_DEBUG
83802 +       /* minimum used blocks value (includes super blocks, bitmap blocks and
83803 +        * other fs reserved areas), depends on fs format and fs size. */
83804 +       __u64 min_blocks_used;
83805 +       /* amount of space allocated by kmalloc. For debugging. */
83806 +       int kmalloc_allocated;
83807 +
83808 +       /*
83809 +        * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
83810 +        * are kept on a list anchored at sbinfo->all_jnodes. This list is
83811 +        * protected by sbinfo->all_guard spin lock. This lock should be taken
83812 +        * with _irq modifier, because it is also modified from interrupt
83813 +        * contexts (by RCU).
83814 +        */
83815 +
83816 +       spinlock_t all_guard;
83817 +       /* list of all jnodes */
83818 +       struct list_head all_jnodes;
83819 +
83820 +       /*XXX debugging code */
83821 +       __u64 eflushed_unformatted; /* number of eflushed unformatted nodes */
83822 +       __u64 unalloc_extent_pointers; /* number of unallocated extent pointers in the tree */
83823 +#endif
83824 +       struct repacker * repacker;
83825 +       struct page * status_page;
83826 +       struct bio * status_bio;
83827 +};
83828 +
83829 +
83830 +
83831 +extern reiser4_super_info_data *get_super_private_nocheck(const struct
83832 +                                                         super_block *super);
83833 +
83834 +extern struct super_operations reiser4_super_operations;
83835 +
83836 +/* Return reiser4-specific part of super block (kept in ->s_fs_info). @super is asserted to be non-NULL. */
83837 +static inline reiser4_super_info_data *
83838 +get_super_private(const struct super_block * super)
83839 +{
83840 +       assert("nikita-447", super != NULL);
83841 +
83842 +       return (reiser4_super_info_data *) super->s_fs_info;
83843 +}
83844 +
83845 +/* "Current" super-block: main super block used during current system
83846 +   call. Reference to this super block is stored in reiser4_context. */
83847 +static inline struct super_block *
83848 +reiser4_get_current_sb(void)
83849 +{
83850 +       return get_current_context()->super;
83851 +}
83852 +
83853 +/* Reiser4-specific part of "current" super-block: main super block used
83854 +   during current system call. Reference to this super block is stored in
83855 +   reiser4_context. */
83856 +static inline reiser4_super_info_data *
83857 +get_current_super_private(void)
83858 +{
83859 +       return get_super_private(reiser4_get_current_sb());
83860 +}
83861 +
83862 +static inline ra_params_t *
83863 +get_current_super_ra_params(void)
83864 +{
83865 +       return &(get_current_super_private()->ra_params); /* ->ra_params of the "current" super block */
83866 +}
83867 +
83868 +/*
83869 + * non-zero (MS_RDONLY bit of ->s_flags), if file system on @super is read-only
83870 + */
83871 +static inline int rofs_super(struct super_block *super)
83872 +{
83873 +       return super->s_flags & MS_RDONLY;
83874 +}
83875 +
83876 +/*
83877 + * true, if @tree represents read-only file system
83878 + */
83879 +static inline int rofs_tree(reiser4_tree *tree)
83880 +{
83881 +       return rofs_super(tree->super);
83882 +}
83883 +
83884 +/*
83885 + * true, if file system where @inode lives on, is read-only
83886 + */
83887 +static inline int rofs_inode(struct inode *inode)
83888 +{
83889 +       return rofs_super(inode->i_sb);
83890 +}
83891 +
83892 +/*
83893 + * true, if file system where @node lives on, is read-only
83894 + */
83895 +static inline int rofs_jnode(jnode *node)
83896 +{
83897 +       return rofs_tree(jnode_get_tree(node));
83898 +}
83899 +
83900 +extern __u64 reiser4_current_block_count(void);
83901 +
83902 +extern void build_object_ops(struct super_block *super, object_ops *ops);
83903 +
83904 +#define REISER4_SUPER_MAGIC 0x52345362         /* (*(__u32 *)"R4Sb"); */
83905 +
83906 +#define spin_ordering_pred_super(private) (1)
83907 +SPIN_LOCK_FUNCTIONS(super, reiser4_super_info_data, guard);
83908 +
83909 +#define spin_ordering_pred_super_eflush(private) (1)
83910 +SPIN_LOCK_FUNCTIONS(super_eflush, reiser4_super_info_data, eflush_guard);
83911 +
83912 +/*
83913 + * lock reiser4-specific part of super block
83914 + */
83915 +static inline void reiser4_spin_lock_sb(reiser4_super_info_data *sbinfo)
83916 +{
83917 +       spin_lock_super(sbinfo);
83918 +}
83919 +
83920 +/*
83921 + * unlock reiser4-specific part of super block
83922 + */
83923 +static inline void reiser4_spin_unlock_sb(reiser4_super_info_data *sbinfo)
83924 +{
83925 +       spin_unlock_super(sbinfo);
83926 +}
83927 +
83928 +/*
83929 + * lock emergency flush data-structures for super block @s
83930 + */
83931 +static inline void spin_lock_eflush(const struct super_block * s)
83932 +{
83933 +       reiser4_super_info_data * sbinfo = get_super_private (s);
83934 +       spin_lock_super_eflush(sbinfo);
83935 +}
83936 +
83937 +/*
83938 + * unlock emergency flush data-structures for super block @s
83939 + */
83940 +static inline void spin_unlock_eflush(const struct super_block * s)
83941 +{
83942 +       reiser4_super_info_data * sbinfo = get_super_private (s);
83943 +       spin_unlock_super_eflush(sbinfo);
83944 +}
83945 +
83946 +
83947 +extern __u64 flush_reserved        ( const struct super_block*);
83948 +extern void  set_flush_reserved    ( const struct super_block*, __u64 nr );
83949 +extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f);
83950 +extern long statfs_type(const struct super_block *super);
83951 +extern int reiser4_blksize(const struct super_block *super);
83952 +extern __u64 reiser4_block_count(const struct super_block *super);
83953 +extern void reiser4_set_block_count(const struct super_block *super, __u64 nr);
83954 +extern __u64 reiser4_data_blocks(const struct super_block *super);
83955 +extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr);
83956 +extern __u64 reiser4_free_blocks(const struct super_block *super);
83957 +extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr);
83958 +extern void reiser4_inc_free_blocks(const struct super_block *super);
83959 +extern __u32 reiser4_mkfs_id(const struct super_block *super);
83960 +extern void reiser4_set_mkfs_id(const struct super_block *super, __u32 id);
83961 +
83962 +extern __u64 reiser4_free_committed_blocks(const struct super_block *super);
83963 +extern void reiser4_set_free_committed_blocks(const struct super_block *super, __u64 nr);
83964 +
83965 +extern __u64 reiser4_grabbed_blocks(const struct super_block *);
83966 +extern void reiser4_set_grabbed_blocks(const struct super_block *, __u64 nr);
83967 +extern __u64 reiser4_fake_allocated(const struct super_block *);
83968 +extern void reiser4_set_fake_allocated(const struct super_block *, __u64 nr);
83969 +extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *);
83970 +extern void reiser4_set_fake_allocated_unformatted(const struct super_block *, __u64 nr);
83971 +extern __u64 reiser4_clustered_blocks(const struct super_block *);
83972 +extern void reiser4_set_clustered_blocks(const struct super_block *, __u64 nr);
83973 +
83974 +extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid, gid_t gid);
83975 +
83976 +extern reiser4_space_allocator *get_space_allocator(const struct super_block
83977 +                                                   *super);
83978 +extern reiser4_oid_allocator *get_oid_allocator(const struct super_block
83979 +                                               *super);
83980 +extern struct inode *get_super_fake(const struct super_block *super);
83981 +extern struct inode *get_cc_fake(const struct super_block *super);
83982 +extern reiser4_tree *get_tree(const struct super_block *super);
83983 +extern int is_reiser4_super(const struct super_block *super);
83984 +
83985 +extern int reiser4_blocknr_is_sane(const reiser4_block_nr *blk);
83986 +extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
83987 +                                      const reiser4_block_nr *blk);
83988 +
83989 +/* Maximal possible object id. */
83990 +#define  ABSOLUTE_MAX_OID ((oid_t)~0)
83991 +
83992 +#define OIDS_RESERVED  ( 1 << 16 )
83993 +int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
83994 +oid_t oid_allocate(struct super_block *);
83995 +int oid_release(struct super_block *, oid_t);
83996 +oid_t oid_next(const struct super_block *);
83997 +void oid_count_allocated(void);
83998 +void oid_count_released(void);
83999 +long oids_used(const struct super_block *);
84000 +long oids_free(const struct super_block *);
84001 +
84002 +
84003 +#if REISER4_DEBUG_OUTPUT
84004 +void print_fs_info(const char *prefix, const struct super_block *);
84005 +#else
84006 +#define print_fs_info(p,s) noop
84007 +#endif
84008 +
84009 +#if REISER4_DEBUG
84010 +
84011 +void inc_unalloc_unfm_ptr(void);
84012 +void dec_unalloc_unfm_ptrs(int nr);
84013 +void inc_unfm_ef(void);
84014 +void dec_unfm_ef(void);
84015 +
84016 +#else
84017 +
84018 +#define inc_unalloc_unfm_ptr() noop
84019 +#define dec_unalloc_unfm_ptrs(nr) noop
84020 +#define inc_unfm_ef() noop
84021 +#define dec_unfm_ef() noop
84022 +
84023 +#endif
84024 +
84025 +
84026 +/* __REISER4_SUPER_H__ */
84027 +#endif
84028 +
84029 +/* Make Linus happy.
84030 +   Local variables:
84031 +   c-indentation-style: "K&R"
84032 +   mode-name: "LC"
84033 +   c-basic-offset: 8
84034 +   tab-width: 8
84035 +   fill-column: 120
84036 +   End:
84037 +*/
84038 diff -rupN linux-2.6.8-rc3/fs/reiser4/syntax.alg linux-2.6.8-rc3-a/fs/reiser4/syntax.alg
84039 --- linux-2.6.8-rc3/fs/reiser4/syntax.alg       1970-01-01 03:00:00.000000000 +0300
84040 +++ linux-2.6.8-rc3-a/fs/reiser4/syntax.alg     2004-08-05 21:20:53.184639130 +0400
84041 @@ -0,0 +1,152 @@
84042 +<TITLE>Reiser4() Syntax In All Its Obscurities</TITLE>
84043 +
84044 +<H1>Introduction</H1>
84045 +
84046 +<p> We define a file operations API, that allows accessing files and
84047 +performing operations on them.  These same files are also accessible
84048 +by the usual Linux conventional VFS API.
84049 +
84050 +<p>The existing VFS API is appropriate for accessing streams of data
84051 +that are larger than the buffers used to hold them, by using simple
84052 +hierarchical names.
84053 +
84054 +<p>
84055 +There exist other needs.
84056 +
84057 +<p> The new API provides a very few of them, but more significantly,
84058 +makes it easy to add more of them later.  Much of what we do in v4 to
84059 +expand the existing semantics can be done by accessing special
84060 +pseudo-files, and so we implement that.  Some of it, particularly
84061 +efficiently accessing multiple small files (or "attributes" which
84062 +should be implemented as files) with transactions in a single system
84063 +call, cannot be done via existing system calls, and requires a new
84064 +API.
84065 +
84066 +
84067 +<H2>Why Not Change The VFS All Filesystems Use?</H2>
84068 +
84069 +<p> It is usual to ask, why don't we implement all of the new
84070 +functionality in VFS, changing all of the other filesystems, and
84071 +getting all of the authors of all of the other filesystems to agree
84072 +with the changes and participate in making them.
84073 +
84074 +<p>The programming community has a different customary process, for
84075 +several reasons.  Too many cooks spoil the soup, regardless of how
84076 +good they are.  Only after the soup is made, can others know whether
84077 +our recipe is desirable or to be avoided.  Then, using the GPL license
84078 +with its restrictions, or obtaining another license from us if
84079 +proprietary, they can take what was shown to work well from our code,
84080 +and improve it from there.  Also, stability is very important to VFS,
84081 +and by making our experimental changes to separate code accessed by a
84082 +separate system call, we make change less disruptive.  Standards are
84083 +for when consensus has emerged, not for the tumult of active research.
84084 +It is not realistic to think that such a consensus will be quickly
84085 +reached, and a working implementation will propel such a consensus
84086 +faster than any other form of persuasion.
84087 +
84088 +
84089 +<h2>Library vs. Kernel</h2>
84090 +
84091 +<p> We have no deep reason for not adopting an exo-kernel style
84092 +approach.  We developed for Linux, and Linux is not an exo-kernel.  It
84093 +is not appropriate for us to differ from the rest of Linux in this
84094 +aspect.  I am far from religious about the micro/exo/macro kernel
84095 +debates.  Micro kernels (e.g. HURD) seem to have substantially lower
84096 +performance while being easier to debug.  It won't surprise me if
84097 +somebody cures the lowered performance someday by being clever.  Many
84098 +of the developers on our team think we should put as much of our
84099 +functionality into the libraries as possible.  The danger in doing
84100 +that blindly is the danger of experiencing micro-kernel performance
84101 +problems.  Putting the functionality either all in the kernel
84102 +(monolithic kernel (Linux)), or all out of the kernel (exo-kernel)
84103 +seems likely to result in the best performance.  Selecting certain
84104 +functionality which lends itself naturally to being layered above our
84105 +interface, and putting it into libraries, is also reasonable, though
84106 +there might not be much of it frankly. Functionality that initially
84107 +seems likely to someday evolve to access functions not exposed outside
84108 +the kernel should be inside the kernel for optimal efficiency, and for
84109 +simplicity of kernel interface. If the applications don't see whether
84110 +it is in the kernel or in the library, then one can change the
84111 +location later.  For now our first pass at the problem is to put the
84112 +functionality into the kernel part of Reiser4.  Perhaps during the
84113 +debugging phase we will even pull certain pieces out of the kernel....
84114 +
84115 +<h2>Creating Objects</h2>
84116 +
84117 +<p> Every object must have at least one name, though that name might
84118 +be just its key.  Every object has a key.
84119 +
84120 +<h3>Objects With Only Keys For Names</h3>
84121 +
84122 +Objects with only keys for names can be accessed by the name
84123 +"..key/KEY" where KEY is of type key.  "..key" is not implemented as a
84124 +traditional directory: it has no directory entries, and performs lookups
84125 +by looking in the tree for them.  Objects that may have only keys for
84126 +names must have a flag so indicating in their stat data, or else fsck
84127 +will assume they are lost files and put them in lost+found.
84128 +Permission to access by key must be explicitly granted, and by default
84129 +is not granted to any user even root.
84130 +
84131 +<H3>Object Plugin Determined At Object Creation Time</H3>
84132 +
84133 +<p> There are various non-symmetries of object creation.  Plugins can
84134 +only be specified at object creation time (this might change in the
84135 +future for some plugins, but it is universally true in v4.0), and
84136 +changing plugins requires creating a new file and copying the old file
84137 +to the new file.
84138 +
84139 +<p> The implication of these non-symmetries is that the syntax for
84140 +object creation necessarily must allow for multi-value assignment.
84141 +Each plugin must specify a set of assignments to look for, and provide
84142 +defaults for the event that they are omitted.  There is even a
84143 +default packing locality for objects created without specified names
84144 +or packing localities.
84145 +
84146 +<H3>The object creation method is determined by the name.</H3>
84147 +
84148 +<H3>Packing Locality Is Determined At Object Creation Time</H3>
84149 +
84150 +<p> Keys determine what an object is to be packed near. If the packing
84151 +locality for an object is specified at creation time, then we use the
84152 +specified packing locality.  If not, then if a name for the object is
84153 +specified at object creation time, then we let the parent directory
84154 +present in the name determine its packing locality, which by default
84155 +means to set it equal to the packing locality of its parent directory.
84156 +<p>
84157 +The implication of this is that specifying a name for an object at
84158 +creation time has an effect in addition to that of specifying a name
84159 +at some later time if the packing locality will be inherited from the
84160 +parent directory in the name.  Also, objects whose packing localities
84161 +are equal to the parent directory for their name can have their keys
84162 +better compressed.
84163 +
84164 +<p> If an object is created without name and without specified packing
84165 +locality, then it is assigned to the default packing locality.  Since
84166 +the next field after packing locality within a key is the objectid,
84167 +and object creation time proximity correlates strongly with objectid
84168 +proximity due to the algorithms used, this has the effect of causing
84169 +objects with creation time proximity to be created near each other in
84170 +tree order and thus near each other in disk geometry.  This is an
84171 +intended effect.  Interleaved deletes may complicate this, but some
84172 +correlation will remain.  There is to be a special directory "/..keys"
84173 +which allows listing of all objects in the FS by key.
84174 +
84175 +<h3>We Offer An Object, Not Block, Storage Interface</h3>
84176 +
84177 +<p> Allowing applications to reserve and control specific blocks ala
84178 +Franz's exo-kernel is a feature not yet supported, though it would
84179 +have advantages for boot loaders and other applications.  Our current
84180 +mechanism of allowing tails to be turned off, and then allowing
84181 +querying of what blocks a file occupies, can be thought of as a poorly
84182 +abstracted version of this (and the linux boot loaders currently
84183 +support this poor abstraction we offer.)
84184 +
84185 +<p>
84186 +ascii_command =
84187 +<p>                '/home/teletubbies/(..new/(..name<-glove_location,..object_t<-(audit|encrypted), ..perm_t<-acl );
84188 +<p>                 glove_location/..acl/(uid=357, access<-denied));
84189 +<p>                /home/teletubbies/glove_location/..audit/..new/(mailto<-"(teletubbies@pbs.org));
84190 +<p>                /home/teletubbies/glove_location<-"(we stole it quite some number of years ago,and put it in the very first loot pile (in the hole near the purple flower).);'
84191 +
84192 +
84193 +
84194 diff -rupN linux-2.6.8-rc3/fs/reiser4/sys_reiser4.c linux-2.6.8-rc3-a/fs/reiser4/sys_reiser4.c
84195 --- linux-2.6.8-rc3/fs/reiser4/sys_reiser4.c    1970-01-01 03:00:00.000000000 +0300
84196 +++ linux-2.6.8-rc3-a/fs/reiser4/sys_reiser4.c  2004-08-05 21:20:53.384596954 +0400
84197 @@ -0,0 +1,101 @@
84198 +/* System call for accessing enhanced semantics of the Reiser Filesystem Version 4 (reiser4). */
84199 +
84200 +/* This system call feeds a string to parser.c, parser.c converts the
84201 +   string into a set of commands which are executed, and then this
84202 +   system call returns after the completion of those commands. */
84203 +
84204 +
84205 +#include <linux/types.h>
84206 +#include <linux/fs.h>
84207 +#include <linux/mm.h>
84208 +#include <linux/buffer_head.h>
84209 +#include <linux/dcache.h>
84210 +#include <linux/namei.h>
84211 +#include <linux/list.h>
84212 +#include <linux/pagemap.h>
84213 +#include <linux/slab.h>
84214 +#include <linux/seq_file.h>
84215 +#include <linux/init.h>
84216 +#include <linux/module.h>
84217 +#include <linux/writeback.h>
84218 +#include <linux/backing-dev.h>
84219 +#include <asm-generic/errno.h>
84220 +
84221 +#if defined(CONFIG_REISER4_FS_SYSCALL)
84222 +
84223 +#include "forward.h"
84224 +#include "debug.h"
84225 +#include "key.h"
84226 +#include "kassign.h"
84227 +#include "coord.h"
84228 +#include "seal.h"
84229 +#include "plugin/item/item.h"
84230 +#include "plugin/security/perm.h"
84231 +#include "plugin/plugin.h"
84232 +#include "plugin/object.h"
84233 +#include "znode.h"
84234 +#include "vfs_ops.h"
84235 +#include "inode.h"
84236 +#include "super.h"
84237 +#include "reiser4.h"
84238 +
84239 +#include "lnode.h"
84240 +
84241 +#include "parser/parser.h"
84242 +
84243 +#define YYREISER4_DEF
84244 +
84245 +#include "parser/parser.code.c"
84246 +
84247 +
84248 +/* sys_reiser4() - parse and execute the command string @p_string.
84249 +   The string is copied in with getname(), a work area is allocated for
84250 +   yacc and pointed at the string, yyparse() is run, and then
84251 +   reiser4_pars_free() and putname() are called.
84252 +   Returns the result of yyparse(), or a negative errno if set-up failed. */
84253 +asmlinkage long
84254 +sys_reiser4(char *p_string)
84255 +{
84256 +       long ret;
84257 +       char *str;
84258 +       struct reiser4_syscall_w_space *work_space;
84259 +
84260 +       str = getname(p_string);
84261 +       if (IS_ERR(str))
84262 +               return PTR_ERR(str);
84263 +
84264 +       /* work space for the parser's working variables, attached to this call */
84265 +       work_space = reiser4_pars_init();
84266 +       if (work_space == NULL) {
84267 +               /* the name obtained from getname() must not leak here */
84268 +               putname(str);
84269 +               return -ENOMEM;
84270 +       }
84271 +       /* initialize fields: both cursors start at the beginning of the
84272 +          command string */
84273 +       work_space->ws_pline  = str;
84274 +       work_space->ws_inline = work_space->ws_pline;
84275 +       PTRACE(work_space, "%s", "begin parsing");
84276 +       ret = yyparse(work_space);      /* parse command */
84277 +       reiser4_pars_free(work_space);
84278 +       putname(str);
84279 +       return ret;
84280 +}
84281 +
84282 +#else
84283 +asmlinkage long
84284 +sys_reiser4(void *p_string)     /* stub used when CONFIG_REISER4_FS_SYSCALL is not defined */
84285 +{
84286 +       return -ENOSYS;         /* @p_string is ignored: the call is not implemented */
84287 +}
84288 +#endif
84289 +
84290 +/* Make Linus happy.
84291 +   Local variables:
84292 +   c-indentation-style: "K&R"
84293 +   mode-name: "LC"
84294 +   c-basic-offset: 8
84295 +   tab-width: 8
84296 +   fill-column: 120
84297 +   End:
84298 +*/
84299 diff -rupN linux-2.6.8-rc3/fs/reiser4/sys_reiser4_2.c linux-2.6.8-rc3-a/fs/reiser4/sys_reiser4_2.c
84300 --- linux-2.6.8-rc3/fs/reiser4/sys_reiser4_2.c  1970-01-01 03:00:00.000000000 +0300
84301 +++ linux-2.6.8-rc3-a/fs/reiser4/sys_reiser4_2.c        2004-08-05 21:20:53.083660428 +0400
84302 @@ -0,0 +1,1174 @@
84303 +
84304 +/*
84305 + * Below is an implementation of hand crafted "recursive descent" parser. It
84306 + * is not based on the yacc toolkit.
84307 + *
84308 + * This parser was initially supposed to be a prototype implementation created
84309 + * to develop and test back end methods to be used by the yacc parser. For
84310 + * this reason, parser's functions are placed into proto_* name space.
84311 + *
84312 + * Parser is actually almost non-recursive. That is, all information
84313 + * required during the parsing is kept in a special stack, allocated by
84314 + * kmalloc, rather than on the native C stack. Parser uses C recursion, but
84315 + * only for the simplicity of prototyping---the only information communicated
84316 + * via C stack is integer return codes. It should be trivial to maintain it in
84317 + * the allocated stack and to re-write parser in the iterative manner.
84318 + *
84319 + * Grammar:
84320 + *
84321 + *     expression ::= binary_exp { ; binary_exp }
84322 + *
84323 + *     binary_exp ::= path | path binop binary_exp
84324 + *
84325 + *     path       ::= literal | rel_path | / rel_path
84326 + *
84327 + *     literal    ::= "string" | #number
84328 + *
84329 + *     rel_path   ::= name { / name }
84330 + *
84331 + *     name       ::= name_token | ( expression )
84332 + *
84333 + *     binop      ::= <-
84334 + *
84335 + *
84336 + * Examples:
84337 + *
84338 + *     (1) a/b <- /etc/passwd
84339 + *
84340 + *     (2) "foo"
84341 + *
84342 + *     (3) #3
84343 + *
84344 + *
84345 + * Implementation:
84346 + *
84347 + * parsing is done in the context represented by data type proto_ctx_t.
84348 + *
84349 + * Types of terminal tokens are represented by values of proto_token_type_t
84350 + * enumeration. Particular tokens, met during parsing are represented by
84351 + * instances of proto_token_t.
84352 + *
84353 + * proto_ctx_t contains a stack used to recursively parse
84354 + * sub-expressions. Each subexpression has a value represented by instance of
84355 + * proto_val_t. Values have types, types are represented by proto_val_type_t
84356 + * enumeration.
84357 + *
84358 + * Each non-terminal token is parsed by special function, and all terminals
84359 + * are parsed by next_token().
84360 + *
84361 + * TO BE DONE:
84362 + *
84363 + *     1. more sophisticated fs_point_t with support for lnodes
84364 + *
84365 + *     2. hub-based assignment
84366 + *
84367 + *     3. locking during name resolutions
84368 + *
84369 + *
84370 + *
84371 + *
84372 + */
84373 +
84374 +
84375 +#include "debug.h"
84376 +#include "lnode.h"
84377 +
84378 +#include <linux/ctype.h>
84379 +#include <linux/mount.h> /* mnt{get,put}() */
84380 +
84381 +/* maximal recursion depth */
84382 +#define PROTO_LEVELS (100)
84383 +
84384 +/* types of terminal tokens, as returned by next_token() */
84385 +typedef enum proto_token_type {
84386 +       TOKEN_NAME,             /* file name, part of a pathname; also #n<width>:bytes */
84387 +       TOKEN_SLASH,            /* / */
84388 +       TOKEN_ASSIGNMENT,       /* <- */
84389 +       TOKEN_LPAREN,           /* ( */
84390 +       TOKEN_RPAREN,           /* ) */
84391 +       TOKEN_STRING,           /* "foo" string literal; also #s<width>:bytes */
84392 +       TOKEN_NUMBER,           /* #100  number; a leading 0 selects another base */
84393 +       TOKEN_LESS_THAN,        /* < (when not followed by -) */
84394 +       TOKEN_GREATER_THAN,     /* > */
84395 +       TOKEN_EQUAL_TO,         /* = */
84396 +       TOKEN_SEMICOLON,        /* ; */
84397 +       TOKEN_COMMA,            /* , */
84398 +       TOKEN_EOF,              /* end of the command reached */
84399 +       TOKEN_INVALID           /* syntax-error */
84400 +} proto_token_type_t;
84401 +
84402 +/* terminal token, as filled in by next_token() */
84403 +typedef struct proto_token {
84404 +       /* type of the token */
84405 +       proto_token_type_t  type;
84406 +       /* position within command, where this token starts (back_token()
84407 +        * rewinds the parsing position to it) */
84408 +       int                 pos;
84409 +       /* union of data associated with this token; ->type tells which member applies */
84410 +       union {
84411 +               struct {
84412 +                       /* for name and string literal: token length, delimiters excluded */
84413 +                       int len;
84414 +                       /* offset from ->pos to position where actual token
84415 +                        * content starts: 1 for "quoted" strings, 0 for plain names */
84416 +                       int delta;
84417 +               } name, string;
84418 +               struct {
84419 +                       /* for number---its value, sign already applied */
84420 +                       long val;
84421 +               } number;
84422 +       } u;
84423 +} proto_token_t;
84423 +
84424 +/* types of values that expressions can result in; selects the member of proto_val_t.u */
84425 +typedef enum proto_val_type {
84426 +       /* file system object---pathname results in this (first enumerator, so its value is 0) */
84427 +       VAL_FSOBJ,
84428 +       /* number---number literal and assignment result in this */
84429 +       VAL_NUMBER,
84430 +       /* string---string literal results in this */
84431 +       VAL_STRING,
84432 +       /* error---ill-formed expression, and execution error result in
84433 +        * this; post_error() stores values of this type */
84434 +       VAL_ERROR,
84435 +       /* no value; what proto_val_put() and proto_val_move() leave behind */
84436 +       VAL_VOID
84437 +} proto_val_type_t;
84438 +
84439 +/* file system object representation. This is needed to interface with VFS. Managed with fsget()/fsput()/fscpy(). */
84440 +typedef struct fs_point {
84441 +       struct dentry   *dentry;        /* may be NULL; fsput() resets it to NULL */
84442 +       struct vfsmount *mnt;           /* may be NULL; fsput() resets it to NULL */
84443 +} fs_point_t;
84444 +
84445 +/* value of expression; released with proto_val_put() */
84446 +typedef struct proto_val {
84447 +       /* value type */
84448 +       proto_val_type_t type;
84449 +       /* value itself. Union by various value types; ->type says which one is live. */
84450 +       union {
84451 +               /* VAL_FSOBJ: released through fsput() */
84452 +               fs_point_t fsobj;
84453 +               /* VAL_NUMBER */
84454 +               long       number;
84455 +               /* VAL_STRING: ->string is kmalloc()ed and NUL-terminated (see
84456 +                * build_string_val()), ->len excludes the terminator */
84457 +               struct {
84458 +                       char *string;
84459 +                       int   len;
84460 +               } string;
84461 +               /* VAL_ERROR */
84462 +               struct {
84463 +                       /* error message; not freed by proto_val_put() */
84464 +                       char  *error;
84465 +                       /* position in a command, where error occurred */
84466 +                       int    error_pos;
84467 +               } error;
84468 +       } u;
84469 +} proto_val_t;
84469 +
84470 +/* data maintained for each recursion level; zeroed by inlevel() on entry */
84471 +typedef struct proto_level {
84472 +       /* error message, if error occurred at this level (set by post_error()) */
84473 +       const char    *error;
84474 +       /* error position within command, if error occurred at this level */
84475 +       int            error_pos;
84476 +       /* value of expression, calculated at this level */
84477 +       proto_val_t    val;
84478 +       /* point in a file system from which relative names are resolved at
84479 +        * this level; get_cur() skips levels where its ->dentry is NULL */
84480 +       fs_point_t     cur;
84481 +} proto_level_t;
84482 +
84483 +/* global parsing flags, kept as bits of proto_ctx_t.flags */
84484 +typedef enum proto_flags {
84485 +       /* set whenever syntax error is detected (post_error() sets it) */
84486 +       CTX_PARSE_ERROR = (1 << 0)
84487 +} proto_flags_t;
84488 +
84489 +/* parsing context; set up by ctx_init(), torn down by ctx_done() */
84490 +typedef struct proto_ctx {
84491 +       /* global flags (proto_flags_t bits) */
84492 +       __u32          flags;
84493 +       /* command being parsed and executed */
84494 +       const char    *command;
84495 +       /* length of ->command, as computed by strlen() */
84496 +       int            len;
84497 +       /* current parsing position within ->command */
84498 +       int            pos;
84499 +       /* recursion depth: index of the current entry in ->level */
84500 +       int            depth;
84501 +       /* array of levels: PROTO_LEVELS entries allocated with kmalloc() */
84502 +       proto_level_t *level;
84503 +       /* where to resolve relative pathnames from (see get_cur()) */
84504 +       fs_point_t     cwd;
84505 +       /* where to resolve absolute pathnames from */
84506 +       fs_point_t     root;
84507 +} proto_ctx_t;
84508 +
84509 +static int parse_exp(proto_ctx_t *ctx);
84510 +
84511 +#define PTRACE(ctx, format, ... )                                              \
84512 +({                                                                             \
84513 +       ON_TRACE(TRACE_PARSE, "parse: %02i at %i[%c]: %s: " format "\n",        \
84514 +                ctx->depth,                                                    \
84515 +                ctx->pos, char_at(ctx, ctx->pos) ? : '.',                      \
84516 +                __FUNCTION__ , __VA_ARGS__);                                   \
84517 +})
84518 +
84519 +/* methods to manipulate fs_point_t objects */
84520 +
84521 +/* take a reference on both the mount and the dentry of @fsobj; returns @fsobj */
84522 +static fs_point_t *fsget(fs_point_t *fsobj)
84523 +{
84524 +       mntget(fsobj->mnt);
84525 +       dget(fsobj->dentry);
84526 +       return fsobj;
84527 +}
84528 +
84529 +/* drop the references held in @fsobj and clear it; NULL fields are skipped */
84530 +static void fsput(fs_point_t *fsobj)
84531 +{
84532 +       struct dentry   *dentry = fsobj->dentry;
84533 +       struct vfsmount *mnt    = fsobj->mnt;
84534 +       fsobj->dentry = NULL;
84535 +       fsobj->mnt    = NULL;
84536 +       if (dentry != NULL)
84537 +               dput(dentry);
84538 +       if (mnt != NULL)
84539 +               mntput(mnt);
84540 +}
84541 +
84542 +/* duplicate a reference to @src in @dst; returns @dst */
84543 +static fs_point_t *fscpy(fs_point_t *dst, fs_point_t *src)
84544 +{
84545 +       *dst = *src;            /* previous contents of @dst are overwritten, not released */
84546 +       return fsget(dst);      /* the copy gets references of its own */
84547 +}
84548 +
84549 +/* character of the command at @pos, or 0 when @pos is outside the command;
84550 + * the tokenizer treats 0 as the end of input */
84551 +static char char_at(proto_ctx_t *ctx, int pos)
84552 +{
84553 +       if (pos < 0 || pos >= ctx->len)
84554 +               return 0;
84555 +       return ctx->command[pos];
84556 +}
84557 +
84558 +/* the level record for the current recursion depth */
84559 +static proto_level_t *get_level(proto_ctx_t *ctx)
84560 +{
84561 +       assert("nikita-3233", ctx->depth < PROTO_LEVELS);
84562 +       return ctx->level + ctx->depth;
84563 +}
84564 +
84565 +/* current value---value stored in the current level, i.e. ->val of what get_level() returns */
84566 +static proto_val_t *get_val(proto_ctx_t *ctx)
84567 +{
84568 +       return &get_level(ctx)->val;
84569 +}
84570 +
84571 +/* base for relative names: ->cur of the nearest level, current one first, that has a dentry; else ctx->cwd */
84572 +static fs_point_t *get_cur(proto_ctx_t *ctx)
84573 +{
84574 +       int depth = ctx->depth;
84575 +       for (; depth >= 0; -- depth) {
84576 +               fs_point_t *cur = &ctx->level[depth].cur;
84577 +               if (cur->dentry != NULL)
84578 +                       return cur;
84579 +       }
84580 +       return &ctx->cwd;
84581 +}
84582 +
84583 +/* transfer the typed value in @src to @dst; @src is left as VAL_VOID */
84584 +static void proto_val_move(proto_val_t *dst, proto_val_t *src)
84585 +{
84586 +       *dst = *src;
84587 +       src->type = VAL_VOID;
84588 +}
84589 +
84590 +/* finish with value: release whatever @val owns and leave it VAL_VOID.
84591 + *
84592 + *  - VAL_FSOBJ: the references are dropped through fsput();
84593 + *  - VAL_STRING: the string buffer, if there is one, is kfree()d and the
84594 + *    pointer cleared;
84595 + *  - VAL_NUMBER, VAL_ERROR, VAL_VOID: nothing to release.
84596 + *
84597 + * Calling it again on the same value is harmless. */
84598 +static void proto_val_put(proto_val_t *val)
84599 +{
84600 +       const proto_val_type_t type = val->type;
84601 +
84602 +       if (type == VAL_FSOBJ) {
84603 +               fsput(&val->u.fsobj);
84604 +       } else if (type == VAL_STRING && val->u.string.string != NULL) {
84605 +               kfree(val->u.string.string);
84606 +               val->u.string.string = NULL;
84607 +       }
84608 +       val->type = VAL_VOID;
84609 +}
84610 +
84611 +/* move value one level up: the parent level takes over the current level's
84612 + * value. Useful when an expression's value is that of its sub-expression. */
84613 +static void proto_val_up(proto_ctx_t *ctx)
84614 +{
84615 +       assert("nikita-3236", ctx->depth > 0);
84616 +       proto_val_move(&(get_level(ctx) - 1)->val, &get_level(ctx)->val);
84617 +}
84618 +
84619 +/* signal an error */
84620 +static void post_error(proto_ctx_t *ctx, char *error)
84621 +{
84622 +       proto_val_t *val;
84623 +
84624 +       PTRACE(ctx, "%s", error);
84625 +
84626 +       get_level(ctx)->error = error;
84627 +       get_level(ctx)->error_pos = ctx->pos;
84628 +       ctx->flags |= CTX_PARSE_ERROR;
84629 +       val = get_val(ctx);
84630 +       proto_val_put(val);
84631 +       val->type = VAL_ERROR;
84632 +       val->u.error.error = error;
84633 +       val->u.error.error_pos = ctx->pos;
84634 +}
84635 +
84636 +/* parse string literal. On entry *@outpos is just past the opening quote;
84637 + * on success it is moved past the closing quote. There is no escape
84638 + * handling, so the literal ends at the first '"'. Running into the end of
84639 + * the command first posts an error and yields TOKEN_INVALID. */
84640 +static proto_token_type_t extract_string(proto_ctx_t *ctx, int *outpos,
84641 +                                        proto_token_t *token)
84642 +{
84643 +       const int start = *outpos;
84644 +       int end;
84645 +       int len;
84646 +       char ch;
84647 +
84648 +       /* find the closing quote or the end of the command, whichever is first */
84649 +       end = start;
84650 +       while ((ch = char_at(ctx, end)) != '"' && ch != 0)
84651 +               ++ end;
84652 +       if (ch == 0) {
84653 +               token->type = TOKEN_INVALID;
84654 +               post_error(ctx, "eof in string");
84655 +               return token->type;
84656 +       }
84657 +
84658 +       len = end - start;
84659 +       token->type = TOKEN_STRING;
84660 +       token->u.string.len = len;
84661 +       /* string literal starts with a quote that should be skipped */
84662 +       token->u.string.delta = 1;
84663 +       *outpos = end + 1;
84664 +       PTRACE(ctx, "%i", len);
84665 +       return token->type;
84666 +}
84667 +
84668 +static int unhex(char ch)      /* value of hex digit @ch, either case; 0xff if it is not one */
84669 +{
84670 +       const char lower = tolower(ch);
84671 +
84672 +       if ('0' <= lower && lower <= '9')
84673 +               return lower - '0';
84674 +       if ('a' <= lower && lower <= 'f')
84675 +               return 0xa + (lower - 'a');
84676 +       return 0xff;
84677 +}
84678 +
84679 +/* make @token the number literal 0; returns TOKEN_NUMBER */
84680 +static proto_token_type_t number_zero(proto_token_t *token)
84681 +{
84682 +       token->u.number.val = 0;
84683 +       token->type = TOKEN_NUMBER;
84684 +       return token->type;
84685 +}
84686 +
84687 +/* parse number literal: [+-]digits; a leading 0 means octal unless followed by x/o/d/t (base 16/8/10/2)
84688 + * and a digit. On success *@pos is left after the last digit; on failure it is rewound to token->pos. */
84689 +static proto_token_type_t extract_number(proto_ctx_t *ctx, int *pos,
84690 +                                        proto_token_t *token)
84691 +{
84692 +       const long max = (long) (~0UL >> 1);    /* largest value of a long */
84693 +       char ch;
84694 +       int  sign;
84695 +       int  base;
84696 +       long val;
84697 +
84698 +       ch = char_at(ctx, *pos);
84699 +       sign = +1;
84700 +       if (ch == '+' || ch == '-') {
84701 +               if (ch == '-')
84702 +                       sign = -1;
84703 +               /* the sign is not a digit: step over it before reading one */
84704 +               ++ *pos;
84705 +               ch = char_at(ctx, *pos);
84706 +       }
84707 +       if (!isdigit(ch)) {
84708 +               token->type = TOKEN_INVALID;
84709 +               *pos = token->pos;
84710 +               return TOKEN_INVALID;
84711 +       }
84712 +       val = (ch - '0');
84713 +       base = 10;
84714 +       ++ *pos;
84715 +       if (val == 0) {
84716 +               int prefixed = 0;
84717 +               /* 0[xXoOdDtT]<digits> */
84718 +               base = 010;
84719 +               switch (char_at(ctx, *pos)) {
84720 +               case 'x':
84721 +               case 'X':
84722 +                       prefixed = 0x10;
84723 +                       break;
84724 +               case 'o':
84725 +               case 'O':
84726 +                       prefixed = 010;
84727 +                       break;
84728 +               case 'd':
84729 +               case 'D':
84730 +                       prefixed = 10;
84731 +                       break;
84732 +               case 't':
84733 +               case 'T':
84734 +                       prefixed = 2;
84735 +                       break;
84736 +               default:
84737 +                       break;
84738 +               }
84739 +               /* the letter is a base prefix only if a digit of that base
84740 +                * follows; otherwise the literal is a plain "0" */
84741 +               if (prefixed != 0 &&
84742 +                   unhex(char_at(ctx, *pos + 1)) < prefixed) {
84743 +                       base = prefixed;
84744 +                       ++ *pos;
84745 +               }
84746 +       }
84747 +       for (;; ++ *pos) {
84748 +               int digit;
84749 +               ch = char_at(ctx, *pos);
84750 +               if (!isxdigit(ch))
84751 +                       break;
84752 +               digit = unhex(ch);
84753 +               if (digit >= base)
84754 +                       break;
84755 +               /* test before multiplying: signed overflow is undefined */
84756 +               if (val > (max - digit) / base) {
84757 +                       token->type = TOKEN_INVALID;
84758 +                       post_error(ctx, "integer overflow");
84759 +                       *pos = token->pos;
84760 +                       return TOKEN_INVALID;
84761 +               }
84762 +               val = val * base + digit;
84763 +       }
84764 +       token->type = TOKEN_NUMBER;
84765 +       PTRACE(ctx, "%li", val);
84766 +       token->u.number.val = sign * val;
84767 +       return token->type;
84768 +}
84769 +
84770 +/* parse name token: the longest run of characters up to white space, the end
84771 + * of the command or one of "/+-=()[]<>;,". An empty run is TOKEN_INVALID. */
84772 +static proto_token_type_t extract_name(proto_ctx_t *ctx, int *pos,
84773 +                                      proto_token_t *token)
84774 +{
84775 +       const int start = *pos;
84776 +       int len;
84777 +       char ch;
84778 +       /* advance *pos to the first character that cannot be part of a name */
84779 +       for (;;) {
84780 +               ch = char_at(ctx, *pos);
84781 +               if (ch == 0 || isspace(ch))
84782 +                       break;
84783 +               if (strchr("/+-=()[]<>;,", ch) != NULL)
84784 +                       break;
84785 +               ++ *pos;
84786 +       }
84787 +       len = *pos - start;
84788 +       if (len == 0) {
84789 +               token->type = TOKEN_INVALID;
84790 +               return token->type;
84791 +       }
84792 +       token->type = TOKEN_NAME;
84793 +       token->u.name.len = len;
84794 +       token->u.name.delta = 0;
84795 +       PTRACE(ctx, "%i", len);
84796 +       return token->type;
84797 +}
84798 +
84799 +static proto_token_type_t extract_extended_string(proto_ctx_t *ctx, int *pos,
84800 +                                                 proto_token_t *token,
84801 +                                                 proto_token_type_t ttype)
84802 +{
84803 +       proto_token_t width;
84804 +       /* s<width>:bytes --- counted string of type @ttype; *pos is at the letter */
84805 +       token->type = TOKEN_INVALID;
84806 +       width.pos = token->pos; /* extract_number() rewinds to ->pos on failure */
84807 +       ++ *pos;
84808 +       if (extract_number(ctx, pos, &width) == TOKEN_NUMBER &&
84809 +           char_at(ctx, *pos) == ':') {
84810 +               ++ *pos;
84811 +               /* bytes: the width must fit in what is left of the command */
84812 +               if (width.u.number.val >= 0 &&
84813 +                   width.u.number.val <= ctx->len - *pos) {
84814 +                       token->type = ttype;
84815 +                       token->u.string.len = width.u.number.val;
84816 +                       token->u.string.delta = *pos - token->pos;
84817 +                       *pos += token->u.string.len;
84818 +               }
84819 +       }
84820 +       if (token->type == TOKEN_INVALID)
84821 +               *pos = token->pos;
84822 +       return token->type;
84823 +}
84824 +
84825 +/* parse #-literal; *@pos is just past the '#', which is put back if nothing matches */
84826 +static proto_token_type_t extract_extended_literal(proto_ctx_t *ctx, int *pos,
84827 +                                                  proto_token_t *token)
84828 +{
84829 +       const char ch = char_at(ctx, *pos);
84830 +       switch (ch) {
84831 +       case 's':       /* "#s<width>:bytes" */
84832 +               return extract_extended_string(ctx, pos, token, TOKEN_STRING);
84833 +       case 'n':
84834 +               return extract_extended_string(ctx, pos, token, TOKEN_NAME);
84835 +       default:
84836 +               if (isdigit(ch))
84837 +                       return extract_number(ctx, pos, token);
84838 +               break;
84839 +       }
84840 +       /* put "#" back */
84841 +       -- *pos;
84842 +       token->type = TOKEN_INVALID;
84843 +       return TOKEN_INVALID;
84844 +}
84845 +
84846 +/* return next token: skip white space, classify what starts at ctx->pos, fill @token, advance ctx->pos */
84847 +static proto_token_type_t next_token(proto_ctx_t *ctx,
84848 +                                    proto_token_t *token)
84849 +{
84850 +       proto_token_type_t ttype;
84851 +       int pos;
84852 +       /* skip white spaces */
84853 +       while (isspace(char_at(ctx, ctx->pos)))
84854 +               ++ ctx->pos;
84855 +       token->pos = ctx->pos;
84856 +       /* one past the first character: right for every one-character token */
84857 +       pos = token->pos + 1;
84858 +       switch (char_at(ctx, token->pos)) {
84859 +       case 0:
84860 +               ttype = TOKEN_EOF;
84861 +               pos = token->pos;
84862 +               break;
84863 +       case '/':
84864 +               ttype = TOKEN_SLASH;
84865 +               break;
84866 +       case '(':
84867 +               ttype = TOKEN_LPAREN;
84868 +               break;
84869 +       case ')':
84870 +               ttype = TOKEN_RPAREN;
84871 +               break;
84872 +       case ';':
84873 +               ttype = TOKEN_SEMICOLON;
84874 +               break;
84875 +       case ',':
84876 +               ttype = TOKEN_COMMA;
84877 +               break;
84878 +       case '<':
84879 +               ttype = TOKEN_LESS_THAN;
84880 +               if (char_at(ctx, pos) == '-') {
84881 +                       ttype = TOKEN_ASSIGNMENT;
84882 +                       ++ pos;
84883 +               }
84884 +               break;
84885 +       case '"':
84886 +               ttype = extract_string(ctx, &pos, token);
84887 +               break;
84888 +       case '#':
84889 +               ttype = extract_extended_literal(ctx, &pos, token);
84890 +               break;
84891 +       default:
84892 +               pos = token->pos;
84893 +               ttype = extract_name(ctx, &pos, token);
84894 +               break;
84895 +       }
84896 +       token->type = ttype;
84897 +       ctx->pos = pos;
84898 +       PTRACE(ctx, "%i", ttype);
84899 +       return ttype;
84900 +}
84901 +
84902 +/* push token back into command, so that next_token() will return @token
84903 + * again: rewinding ctx->pos to the token's start is all it takes */
84904 +static void back_token(proto_ctx_t *ctx, proto_token_t *token)
84905 +{
84906 +       const int start = token->pos;
84907 +       assert("nikita-3237", ctx->pos >= start);
84908 +       ctx->pos = start;
84909 +}
84910 +
84911 +/* finish with context, release all resources: the references on the working
84912 + * and root directories and the array of levels */
84913 +static void ctx_done(proto_ctx_t *ctx)
84914 +{
84915 +       fsput(&ctx->cwd);
84916 +       fsput(&ctx->root);
84917 +       /* kfree() ignores NULL, so a never-allocated array is fine here */
84918 +       kfree(ctx->level);
84919 +       ctx->level = NULL;
84920 +}
84921 +
84922 +/* initialize context for parsing and executing @command: zeroes @ctx, records
84923 + * the command and its length, allocates the zero-filled array of levels and
84924 + * takes references on the current task's working and root directories.
84925 + * Returns 0 on success, -ENOMEM if the array cannot be allocated. */
84926 +static int ctx_init(proto_ctx_t *ctx, const char *command)
84927 +{
84928 +       xmemset(ctx, 0, sizeof *ctx);
84929 +       ctx->command = command;
84930 +       ctx->len = strlen(command);
84931 +       ctx->level = kmalloc(sizeof (ctx->level[0]) * PROTO_LEVELS,
84932 +                            GFP_KERNEL);
84933 +       if (ctx->level == NULL) {
84934 +               /* nothing has been acquired yet, so there is nothing to undo */
84935 +               return -ENOMEM;
84936 +       }
84937 +       /* touch the array only once the allocation is known to have worked */
84938 +       xmemset(ctx->level, 0, sizeof (ctx->level[0]) * PROTO_LEVELS);
84939 +
84940 +       read_lock(&current->fs->lock);
84941 +       ctx->cwd.dentry  = dget(current->fs->pwd);
84942 +       ctx->cwd.mnt     = mntget(current->fs->pwdmnt);
84943 +       ctx->root.dentry = dget(current->fs->root);
84944 +       ctx->root.mnt    = mntget(current->fs->rootmnt);
84945 +       read_unlock(&current->fs->lock);
84946 +
84947 +       return 0;
84948 +}
84949 +
84950 +/* go one level deeper to parse and execute sub-expression. Returns 0, or
84951 + * -EOVERFLOW (after posting an error) when all PROTO_LEVELS are in use. */
84952 +static int inlevel(proto_ctx_t *ctx)
84953 +{
84954 +       if (ctx->depth + 1 >= PROTO_LEVELS) {
84955 +               post_error(ctx, "stack overflow");
84956 +               return -EOVERFLOW;
84957 +       }
84958 +       ctx->depth += 1;
84959 +       xmemset(get_level(ctx), 0, sizeof (proto_level_t));
84960 +       get_level(ctx)->val.type = VAL_VOID;
84961 +       return 0;
84962 +}
84963 +
84964 +/* go one level up, releasing the value and ->cur of the level being left */
84965 +static void exlevel(proto_ctx_t *ctx)
84966 +{
84967 +       assert("nikita-3235", ctx->depth > 0);
84968 +       proto_val_put(&get_level(ctx)->val);
84969 +       fsput(&get_level(ctx)->cur);
84970 +       ctx->depth -= 1;
84971 +}
84972 +
84973 +/* given @token which should be token for string literal (or name), produce
84974 + * string value in @val: a NUL-terminated copy of the token's bytes. If the
84975 + * copy cannot be made, ->string is NULL and ->len is 0. */
84976 +static void build_string_val(proto_ctx_t *ctx,
84977 +                            proto_token_t *token, proto_val_t *val)
84978 +{
84979 +       const int len = token->u.string.len;
84980 +       assert("nikita-3238",
84981 +              token->type == TOKEN_STRING || token->type == TOKEN_NAME);
84982 +       val->type = VAL_STRING;
84983 +       val->u.string.len = 0;
84984 +       /* a negative length would under-allocate and write before the buffer */
84985 +       val->u.string.string = len < 0 ? NULL : kmalloc(len + 1, GFP_KERNEL);
84986 +       if (val->u.string.string == NULL)
84987 +               return;
84988 +       strncpy(val->u.string.string,
84989 +               ctx->command + token->pos + token->u.string.delta, len);
84990 +       val->u.string.string[len] = 0;
84991 +       val->u.string.len = len;
84992 +}
84993 +
84994 +/* given @token which should be token for a number literal, produce number
84995 + * value in @val. @ctx is not used. */
84996 +static void build_number_val(proto_ctx_t *ctx,
84997 +                            proto_token_t *token, proto_val_t *val)
84998 +{
84999 +       const long number = token->u.number.val;
85000 +       assert("nikita-3245", token->type == TOKEN_NUMBER);
85001 +       val->u.number = number;
85002 +       val->type = VAL_NUMBER;
85003 +}
85004 +
85005 +/* follow mount points: while @fsobj is a mount point, replace it in place by the root of what is mounted there. COPIED from fs/namei.c */
85006 +static void follow_mount(fs_point_t * fsobj)
85007 +{
85008 +       while (d_mountpoint(fsobj->dentry)) {
85009 +               struct vfsmount *mounted;
85010 +
85011 +               spin_lock(&dcache_lock);
85012 +               mounted = lookup_mnt(fsobj->mnt, fsobj->dentry);
85013 +               if (!mounted) {
85014 +                       spin_unlock(&dcache_lock);
85015 +                       break;
85016 +               }
85017 +               fsobj->mnt = mntget(mounted);   /* reference taken while dcache_lock is still held */
85018 +               spin_unlock(&dcache_lock);
85019 +               dput(fsobj->dentry);            /* done with the mount point dentry */
85020 +               mntput(mounted->mnt_parent);    /* NOTE(review): presumes a reference was held on the old ->mnt; lookup() stores parent->mnt without mntget() --- confirm */
85021 +               fsobj->dentry = dget(mounted->mnt_root);
85022 +       }
85023 +}
85024 +
85025 +/* resolve @name within @parent and return resulting object in @fsobj.
85026 + * COPIED from fs/namei.c, fs/dcache.c. Returns 0 on success; otherwise an
85027 + * error code, and fsobj->dentry is left NULL. */
85028 +static int lookup(fs_point_t * parent, const char * name, fs_point_t * fsobj)
85029 +{
85030 +       unsigned long hash;
85031 +       struct qstr qname;
85032 +       const char *scan;
85033 +       int result;
85034 +       /* unlike a do-while, this loop copes with an empty @name */
85035 +       hash = init_name_hash();
85036 +       for (scan = name; *scan != 0; ++ scan)
85037 +               hash = partial_name_hash(*(const unsigned char *)scan, hash);
85038 +       qname.name = name;
85039 +       qname.len = scan - name;
85040 +       qname.hash = end_name_hash(hash);
85041 +
85042 +       result = 0;
85043 +       fsobj->dentry = __d_lookup(parent->dentry, &qname);
85044 +       if (fsobj->dentry == NULL) {
85045 +               struct inode *dir;
85046 +               dir = parent->dentry->d_inode;
85047 +               down(&dir->i_sem);
85048 +               fsobj->dentry = d_lookup(parent->dentry, &qname);
85049 +               if (fsobj->dentry == NULL) {
85050 +                       struct dentry * new;
85051 +                       struct dentry * found;
85052 +                       new = d_alloc(parent->dentry, &qname);
85053 +                       if (new != NULL) {
85054 +                               found = dir->i_op->lookup(dir, new);
85055 +                               /* NULL: @new was used; else an error or a dentry to use instead */
85056 +                               if (found == NULL)
85057 +                                       found = new;
85058 +                               else
85059 +                                       dput(new);
85060 +                               if (IS_ERR(found))
85061 +                                       result = PTR_ERR(found);
85062 +                               else if (found->d_inode != NULL)
85063 +                                       fsobj->dentry = found;
85064 +                               else {
85065 +                                       dput(found);
85066 +                                       result = RETERR(-ENOENT);
85067 +                               }
85068 +                       } else
85069 +                               result = RETERR(-ENOMEM);
85070 +               }
85071 +               up(&dir->i_sem);
85072 +       }
85073 +       if (result == 0) {
85074 +               fsobj->mnt = parent->mnt;
85075 +               follow_mount(fsobj);
85076 +       } else
85077 +               fsobj->dentry = NULL;
85078 +       return result;
85079 +}
85080 +
85081 +#define START_KERNEL_IO /* opens a block; must be paired with END_KERNEL_IO */ \
85082 +        {                                      \
85083 +               mm_segment_t __ski_old_fs;      \
85084 +                                               \
85085 +               __ski_old_fs = get_fs();        \
85086 +               set_fs(KERNEL_DS)
85087 +
85088 +#define END_KERNEL_IO /* restores the get_fs() value saved by START_KERNEL_IO, closes its block */ \
85089 +               set_fs(__ski_old_fs);           \
85090 +       }
85091 +
85092 +#define PUMP_BUF_SIZE (PAGE_CACHE_SIZE) /* size of the intermediate buffer allocated by pump() */
85093 +
85094 +/* copy the content of @righthand into @lefthand; returns number of bytes written or negative errno */
85095 +static int pump(fs_point_t *lefthand, fs_point_t *righthand)
85096 +{
85097 +       int result;
85098 +       char *buf;
85099 +       loff_t readoff;
85100 +       loff_t writeoff;
85101 +       struct file *dst;
85102 +       struct file *src;
85103 +
85104 +       buf = kmalloc(PUMP_BUF_SIZE, GFP_KERNEL);
85105 +       if (buf == NULL)
85106 +               return RETERR(-ENOMEM);
85107 +       /* dentry_open() consumes one reference on dentry and mount, also when it fails */
85108 +       mntget(righthand->mnt); /* simulate open_namei() */
85109 +       dget(righthand->dentry);
85110 +       src = dentry_open(righthand->dentry, righthand->mnt, O_RDONLY);
85111 +       if (!IS_ERR(src)) {
85112 +               mntget(lefthand->mnt); /* simulate open_namei() */
85113 +               dget(lefthand->dentry);
85114 +               dst = dentry_open(lefthand->dentry, lefthand->mnt, O_WRONLY);
85115 +               if (!IS_ERR(dst)) {
85116 +                       readoff = writeoff = 0;
85117 +                       result = 0;
85118 +                       START_KERNEL_IO;
85119 +                       while (result >= 0) {
85120 +                               result = vfs_read(src,
85121 +                                                 buf, PUMP_BUF_SIZE, &readoff);
85122 +                               if (result <= 0)
85123 +                                       break;
85124 +                               /* give other threads chance to run */
85125 +                               preempt_point();
85126 +                               result = vfs_write(dst, buf, result, &writeoff);
85127 +                       }
85128 +                       END_KERNEL_IO;
85129 +                       if (result == 0)
85130 +                               result = writeoff;
85131 +                       filp_close(dst, current->files);
85132 +               } else
85133 +                       result = PTR_ERR(dst);
85134 +               filp_close(src, current->files);
85135 +       } else
85136 +               result = PTR_ERR(src);
85137 +
85138 +       kfree(buf);
85139 +       return result;
85140 +}
85141 +
85142 +/* write @len bytes of @buf to the start of @lefthand; returns number of bytes written or negative errno */
85143 +static int pump_buf(fs_point_t *lefthand, char *buf, int len)
85144 +{
85145 +       int result = 0;
85146 +       loff_t writeoff = 0;
85147 +       struct file *dst;
85148 +
85149 +       mntget(lefthand->mnt); /* dentry_open() consumes both references, even on failure */
85150 +       dget(lefthand->dentry);
85151 +       dst = dentry_open(lefthand->dentry, lefthand->mnt, O_WRONLY);
85152 +       if (!IS_ERR(dst)) {
85153 +               START_KERNEL_IO;
85154 +               while (len > writeoff && result >= 0)
85155 +                       result = vfs_write(dst, buf + writeoff,
85156 +                                          len - writeoff, &writeoff);
85157 +               END_KERNEL_IO;
85158 +               if (result >= 0) /* report the total, not the size of the last chunk */
85159 +                       result = writeoff;
85160 +               filp_close(dst, current->files);
85161 +       } else
85162 +               result = PTR_ERR(dst);
85163 +
85164 +       return result;
85165 +}
85166 +
85167 +/* assign @rhs to file system object @lhs; on success the current value becomes the VAL_NUMBER byte count */
85168 +static int proto_assign(proto_ctx_t *ctx, proto_val_t *lhs, proto_val_t *rhs)
85169 +{
85170 +       int result;
85171 +       fs_point_t dst;
85172 +
85173 +       if (lhs->type != VAL_FSOBJ) {
85174 +               post_error(ctx, "cannot assign");
85175 +               return -EINVAL;
85176 +       }
85177 +
85178 +       fscpy(&dst, &lhs->u.fsobj);
85179 +       switch (rhs->type) {
85180 +       case VAL_FSOBJ: {
85181 +               result = pump(&dst, &rhs->u.fsobj);
85182 +               break;
85183 +       }
85184 +       case VAL_NUMBER: {
85185 +               char buf[24]; /* a 64-bit long needs up to 20 characters plus the terminating NUL */
85186 +
85187 +               snprintf(buf, sizeof buf, "%li", rhs->u.number);
85188 +               result = pump_buf(&dst, buf, strlen(buf));
85189 +               break;
85190 +       }
85191 +       case VAL_STRING: {
85192 +               result = pump_buf(&dst, rhs->u.string.string, rhs->u.string.len);
85193 +               break;
85194 +       }
85195 +       default:
85196 +               post_error(ctx, "lnode expected");
85197 +               result = -EINVAL;
85198 +       }
85199 +       fsput(&dst);
85200 +       if (result >= 0) {
85201 +               proto_val_t *ret;
85202 +
85203 +               ret = get_val(ctx);
85204 +               proto_val_put(ret);
85205 +               ret->type = VAL_NUMBER;
85206 +               ret->u.number = result;
85207 +               result = 0;
85208 +       }
85209 +       return result;
85210 +}
85211 +
85212 +/* parse "name": a name token is looked up in the current object, a parenthesised expression is evaluated */
85213 +static int parse_name(proto_ctx_t *ctx)
85214 +{
85215 +       int result;
85216 +       proto_token_t token;
85217 +
85218 +       /* name ::= name_token | ( expression ) */
85219 +
85220 +       next_token(ctx, &token);
85221 +       PTRACE(ctx, "%i", token.type);
85222 +
85223 +       result = 0;
85224 +       switch (token.type) {
85225 +       case TOKEN_NAME: {
85226 +               proto_val_t name;
85227 +               fs_point_t  child; /* only meaningful when lookup() returned 0 */
85228 +
85229 +               build_string_val(ctx, &token, &name);
85230 +               result = lookup(get_cur(ctx), name.u.string.string, &child);
85231 +               if (result == -ENOENT || (result == 0 && child.dentry->d_inode == NULL)) {
85232 +                       post_error(ctx, "not found");
85233 +                       result = -ENOENT;
85234 +               } else if (result == 0) {
85235 +                       proto_val_put(get_val(ctx));
85236 +                       get_val(ctx)->type = VAL_FSOBJ;
85237 +                       fscpy(&get_val(ctx)->u.fsobj, &child);
85238 +               } else
85239 +                       post_error(ctx, "lookup failure");
85240 +               proto_val_put(&name);
85241 +               break;
85242 +       }
85243 +       case TOKEN_LPAREN: {
85244 +               proto_token_t rparen;
85245 +
85246 +               result = inlevel(ctx);
85247 +               if (result == 0) {
85248 +                       result = parse_exp(ctx);
85249 +                       proto_val_up(ctx);
85250 +                       exlevel(ctx);
85251 +                       if (next_token(ctx, &rparen) != TOKEN_RPAREN) {
85252 +                               post_error(ctx, "expecting `)'");
85253 +                               result = -EINVAL;
85254 +                       }
85255 +               }
85256 +               break;
85257 +       }
85258 +       case TOKEN_INVALID:
85259 +               post_error(ctx, "huh");
85260 +               result = -EINVAL; /* fall through: the token is pushed back below */
85261 +       default:
85262 +               back_token(ctx, &token);
85263 +               break;
85264 +       }
85265 +       return result;
85266 +}
85267 +
85268 +/* parse "rel_path": resolve a '/'-separated chain of names, starting from @start, in a new level */
85269 +static int parse_rel_path(proto_ctx_t *ctx, fs_point_t *start)
85270 +{
85271 +       int result;
85272 +
85273 +       /* rel_path ::= name { / name } */
85274 +
85275 +       result = inlevel(ctx);
85276 +       if (result != 0)
85277 +               return result;
85278 +
85279 +       fscpy(&get_level(ctx)->cur, start);
85280 +
85281 +       while (1) {
85282 +               proto_token_t token;
85283 +               proto_val_t  *val;
85284 +
85285 +               result = parse_name(ctx);
85286 +               if (result != 0)
85287 +                       break;
85288 +
85289 +               val = get_val(ctx);
85290 +               if (val->type != VAL_FSOBJ) {
85291 +                       post_error(ctx, "name is not an file system object");
85292 +                       break; /* NOTE(review): result is still 0 here although an error was posted */
85293 +               }
85294 +
85295 +               fsput(&get_level(ctx)->cur); /* the object just found replaces this level's ->cur */
85296 +               fscpy(&get_level(ctx)->cur, &val->u.fsobj);
85297 +
85298 +               next_token(ctx, &token);
85299 +               PTRACE(ctx, "%i", token.type);
85300 +
85301 +               if (token.type != TOKEN_SLASH) {
85302 +                       back_token(ctx, &token);
85303 +                       break;
85304 +               }
85305 +       }
85306 +       proto_val_up(ctx);
85307 +       exlevel(ctx);
85308 +       return result;
85309 +}
85310 +
85311 +/* parse "path": a string or number literal, or a relative path taken from ctx->root or the current object */
85312 +static int parse_path(proto_ctx_t *ctx)
85313 +{
85314 +       int result;
85315 +       proto_token_t token;
85316 +
85317 +       /* path ::= literal | rel_path | / rel_path */
85318 +
85319 +       next_token(ctx, &token);
85320 +       PTRACE(ctx, "%i", token.type);
85321 +
85322 +       result = 0;
85323 +       switch (token.type) {
85324 +       case TOKEN_STRING:
85325 +               build_string_val(ctx, &token, get_val(ctx));
85326 +               break;
85327 +       case TOKEN_NUMBER:
85328 +               build_number_val(ctx, &token, get_val(ctx));
85329 +               break;
85330 +       case TOKEN_SLASH: /* leading '/': resolve from ctx->root */
85331 +               result = parse_rel_path(ctx, &ctx->root);
85332 +               break;
85333 +       default: /* any other token is pushed back and parsed as the first name of a rel_path */
85334 +               back_token(ctx, &token);
85335 +               result = parse_rel_path(ctx, get_cur(ctx));
85336 +               break;
85337 +       case TOKEN_INVALID:
85338 +               post_error(ctx, "cannot parse path");
85339 +               result = -EINVAL;
85340 +               back_token(ctx, &token);
85341 +               break;
85342 +       }
85343 +       return result;
85344 +}
85345 +
85346 +/* parse "binary_exp": a path, optionally followed by an assignment whose right side is a binary_exp */
85347 +static int parse_binary_exp(proto_ctx_t *ctx)
85348 +{
85349 +       int result;
85350 +       proto_val_t *lhs;
85351 +
85352 +       /* binary_exp ::= path | path binop binary_exp (only TOKEN_ASSIGNMENT is handled as binop) */
85353 +
85354 +       result = inlevel(ctx);
85355 +       if (result != 0)
85356 +               return result;
85357 +
85358 +       result = parse_path(ctx);
85359 +       if (result == 0) {
85360 +               proto_token_t  token;
85361 +
85362 +               lhs = get_val(ctx);
85363 +
85364 +               next_token(ctx, &token);
85365 +               PTRACE(ctx, "%i", token.type);
85366 +
85367 +               if (token.type == TOKEN_ASSIGNMENT) {
85368 +                       result = inlevel(ctx); /* right-hand side is evaluated in a level of its own */
85369 +                       if (result == 0) {
85370 +                               result = parse_binary_exp(ctx);
85371 +                               if (result == 0) {
85372 +                                       proto_val_t *rhs;
85373 +
85374 +                                       rhs = get_val(ctx);
85375 +                                       result = proto_assign(ctx, lhs, rhs);
85376 +                               }
85377 +                               proto_val_up(ctx);
85378 +                               exlevel(ctx);
85379 +                       }
85380 +               } else
85381 +                       back_token(ctx, &token);
85382 +       }
85383 +       proto_val_up(ctx);
85384 +       exlevel(ctx);
85385 +       return result;
85386 +}
85387 +
85388 +/* parse "expression": binary expressions separated by ';'; stops at the first error or other token */
85389 +static int parse_exp(proto_ctx_t *ctx)
85390 +{
85391 +       int result;
85392 +
85393 +       /* expression ::= binary_exp { ; binary_exp } */
85394 +
85395 +       result = inlevel(ctx);
85396 +       if (result != 0)
85397 +               return result;
85398 +
85399 +       while (1) {
85400 +               proto_token_t  token;
85401 +
85402 +               result = parse_binary_exp(ctx);
85403 +               proto_val_up(ctx);
85404 +               if (result != 0)
85405 +                       break;
85406 +
85407 +               next_token(ctx, &token);
85408 +               PTRACE(ctx, "%i", token.type);
85409 +
85410 +               if (token.type != TOKEN_SEMICOLON) {
85411 +                       back_token(ctx, &token);
85412 +                       break;
85413 +               }
85414 +               /* another binary_exp follows the ';': discard value of the one just parsed */
85415 +               proto_val_put(get_val(ctx));
85416 +       }
85417 +       exlevel(ctx);
85418 +       return result;
85419 +}
85420 +
85421 +/* execute the command held in @ctx, starting at ctx->cwd; returns the numeric value of the result or an error */
85422 +static int execute(proto_ctx_t *ctx)
85423 +{
85424 +       int result;
85425 +
85426 +       inlevel(ctx); /* NOTE(review): return value is not checked here, unlike in the parse_*() functions */
85427 +       fscpy(&get_level(ctx)->cur, &ctx->cwd);
85428 +       result = parse_exp(ctx);
85429 +       if (get_val(ctx)->type == VAL_NUMBER)
85430 +               result = get_val(ctx)->u.number;
85431 +       exlevel(ctx);
85432 +       assert("nikita-3234", ctx->depth == 0);
85433 +       if (char_at(ctx, ctx->pos) != 0) {
85434 +               post_error(ctx, "garbage after expression");
85435 +               if (result == 0)
85436 +                       result = -EINVAL;
85437 +       }
85438 +
85439 +       if (ctx->flags & CTX_PARSE_ERROR) { /* report errors recorded per level; overrides result */
85440 +               int i;
85441 +
85442 +               printk("Syntax error in ``%s''\n", ctx->command);
85443 +               for (i = PROTO_LEVELS - 1; i >= 0; --i) {
85444 +                       proto_level_t *level;
85445 +
85446 +                       level = &ctx->level[i];
85447 +                       if (level->error != NULL) {
85448 +                               printk("    %02i: %s at %i\n",
85449 +                                      i, level->error, level->error_pos);
85450 +                       }
85451 +               }
85452 +               result = -EINVAL;
85453 +       }
85454 +       return result;
85455 +}
85456 +
85457 +/* entry point: copy user string @command into the kernel, execute it, return execute() result or an error */
85458 +asmlinkage long sys_reiser4(const char __user * command)
85459 +{
85460 +       int    result;
85461 +       char * inkernel;
85462 +
85463 +       inkernel = getname(command);
85464 +       if (!IS_ERR(inkernel)) {
85465 +               proto_ctx_t ctx;
85466 +
85467 +               result = ctx_init(&ctx, inkernel);
85468 +               if (result == 0) {
85469 +                       result = execute(&ctx);
85470 +                       ctx_done(&ctx); /* paired with the successful ctx_init() above */
85471 +               }
85472 +               putname(inkernel);
85473 +       } else
85474 +               result = PTR_ERR(inkernel);
85475 +       return result;
85476 +}
85477 diff -rupN linux-2.6.8-rc3/fs/reiser4/tap.c linux-2.6.8-rc3-a/fs/reiser4/tap.c
85478 --- linux-2.6.8-rc3/fs/reiser4/tap.c    1970-01-01 03:00:00.000000000 +0300
85479 +++ linux-2.6.8-rc3-a/fs/reiser4/tap.c  2004-08-05 21:20:52.933692060 +0400
85480 @@ -0,0 +1,392 @@
85481 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
85482 + * reiser4/README */
85483 +
85484 +/*
85485 +   Tree Access Pointer (tap).
85486 +
85487 +   A tap is a data structure combining a coord and a lock handle (mostly). It
85488 +   is useful when one has to scan tree nodes (for example, in readdir or flush),
85489 +   because tap functions allow moving the tap in either direction, transparently
85490 +   crossing unit/item/node borders.
85491 +
85492 +   Tap doesn't provide automatic synchronization of its fields as it is
85493 +   supposed to be per-thread object.
85494 +*/
85495 +
85496 +#include "forward.h"
85497 +#include "debug.h"
85498 +#include "coord.h"
85499 +#include "tree.h"
85500 +#include "context.h"
85501 +#include "tap.h"
85502 +#include "znode.h"
85503 +#include "tree_walk.h"
85504 +
85505 +#if REISER4_DEBUG
85506 +static int tap_invariant(const tap_t * tap);
85507 +static void tap_check(const tap_t * tap);
85508 +#else
85509 +#define tap_check(tap) noop
85510 +#endif
85511 +
85512 +/** take one load reference on @tap; the node itself is loaded by the first reference only */
85513 +reiser4_internal int
85514 +tap_load(tap_t * tap)
85515 +{
85516 +       tap_check(tap);
85517 +       if (!tap->loaded) {
85518 +               /* first reference: bring the node in, passing the tap's read-ahead info */
85519 +               int ret = zload_ra(tap->coord->node, &tap->ra_info);
85520 +
85521 +               if (ret)
85522 +                       return ret;
85523 +               coord_clear_iplug(tap->coord);
85524 +       }
85525 +       tap->loaded++;
85526 +       tap_check(tap);
85527 +       return 0;
85528 +}
85529 +
85530 +/** drop one load reference taken by tap_load(); the node is released together with the last one */
85531 +reiser4_internal void
85532 +tap_relse(tap_t * tap)
85533 +{
85534 +       tap_check(tap);
85535 +       if (tap->loaded > 0) {
85536 +               tap->loaded--;
85537 +               /* that was the last reference: let the node go */
85538 +               if (!tap->loaded)
85539 +                       zrelse(tap->coord->node);
85540 +       }
85541 +       tap_check(tap);
85542 +}
85543 +
85544 +/**
85545 + * set up @tap over caller-supplied @coord and @lh (the pointers are stored, nothing is copied).
85546 + * @mode is remembered as the lock mode of the tap; the tap starts out not loaded.
85547 + */
85548 +reiser4_internal void
85549 +tap_init(tap_t * tap, coord_t * coord, lock_handle * lh, znode_lock_mode mode)
85550 +{
85551 +       tap->loaded = 0;
85552 +       tap->mode = mode;
85553 +       tap->lh = lh;
85554 +       tap->coord = coord;
85555 +       tap_list_clean(tap);
85556 +       init_ra_info(&tap->ra_info);
85557 +}
85558 +
85559 +/** add @tap to the front of the list returned by taps_list() (taps of the current context) */
85560 +reiser4_internal void
85561 +tap_monitor(tap_t * tap)
85562 +{
85563 +       assert("nikita-2623", tap != NULL);
85564 +       tap_check(tap);
85565 +       tap_list_push_front(taps_list(), tap);
85566 +       tap_check(tap);
85567 +}
85568 +
85569 +/* duplicate @src into @dst: coord, mode and read-ahead info are copied, lock handle is copied if @src holds
85570 + * one. @dst->coord and @dst->lh must already point to storage. @dst is not initially loaded. */
85571 +reiser4_internal void
85572 +tap_copy(tap_t * dst, tap_t * src)
85573 +{
85574 +       assert("nikita-3193", src != NULL);
85575 +       assert("nikita-3194", dst != NULL);
85576 +
85577 +       *dst->coord  = *src->coord;
85578 +       if (src->lh->node) /* @dst->lh is left untouched when @src has no locked node */
85579 +               copy_lh(dst->lh, src->lh);
85580 +       dst->mode    = src->mode;
85581 +       dst->loaded  = 0;
85582 +       tap_list_clean(dst);
85583 +       dst->ra_info = src->ra_info;
85584 +}
85585 +
85586 +/** finish with @tap: release the node if loaded, drop the lock handle, unlink the tap, clear coord's node */
85587 +reiser4_internal void
85588 +tap_done(tap_t * tap)
85589 +{
85590 +       assert("nikita-2565", tap != NULL);
85591 +       tap_check(tap);
85592 +       if (tap->loaded > 0) /* one zrelse() whatever the count: tap_load() loads the node only once */
85593 +               zrelse(tap->coord->node);
85594 +       done_lh(tap->lh);
85595 +       tap->loaded = 0;
85596 +       tap_list_remove_clean(tap);
85597 +       tap->coord->node = NULL;
85598 +}
85599 +
85600 +/**
85601 + * move @tap to the node locked by @target: @tap's lock handle is replaced by a copy of @target. If @tap
85602 + * was loaded, the new node is loaded first; on failure @tap is left as it was and the error is returned.
85603 + */
85604 +reiser4_internal int
85605 +tap_move(tap_t * tap, lock_handle * target)
85606 +{
85607 +       int result = 0;
85608 +
85609 +       assert("nikita-2567", tap != NULL);
85610 +       assert("nikita-2568", target != NULL);
85611 +       assert("nikita-2570", target->node != NULL);
85612 +       assert("nikita-2569", tap->coord->node == tap->lh->node);
85613 +
85614 +       tap_check(tap);
85615 +       if (tap->loaded > 0) /* load the new node before giving up the old one */
85616 +               result = zload_ra(target->node, &tap->ra_info);
85617 +
85618 +       if (result == 0) {
85619 +               if (tap->loaded > 0)
85620 +                       zrelse(tap->coord->node);
85621 +               done_lh(tap->lh);
85622 +               copy_lh(tap->lh, target);
85623 +               tap->coord->node = target->node;
85624 +               coord_clear_iplug(tap->coord);
85625 +       }
85626 +       tap_check(tap);
85627 +       return result;
85628 +}
85629 +
85630 +/**
85631 + * move @tap to @target. Unless @tap is already there, @target is long-term locked in tap->mode and
85632 + * handed to tap_move(), which also loads it if @tap was loaded. Returns 0 or an error code.
85633 + */
85634 +reiser4_internal int
85635 +tap_to(tap_t * tap, znode * target)
85636 +{
85637 +       int result;
85638 +
85639 +       assert("nikita-2624", tap != NULL);
85640 +       assert("nikita-2625", target != NULL);
85641 +
85642 +       tap_check(tap);
85643 +       result = 0;
85644 +       if (tap->coord->node != target) {
85645 +               lock_handle here;
85646 +
85647 +               init_lh(&here);
85648 +               result = longterm_lock_znode(&here, target,
85649 +                                            tap->mode, ZNODE_LOCK_HIPRI);
85650 +               if (result == 0) {
85651 +                       result = tap_move(tap, &here);
85652 +                       done_lh(&here); /* tap_move() copied the handle; the local one is dropped either way */
85653 +               }
85654 +       }
85655 +       tap_check(tap);
85656 +       return result;
85657 +}
85658 +
85659 +/**
85660 + * move @tap to given @target: tap_to() @target->node, then duplicate @target into the tap's coord.
85661 + * Returns 0 on success or the error from tap_to(), in which case the coord is not changed here.
85662 + */
85663 +reiser4_internal int
85664 +tap_to_coord(tap_t * tap, coord_t * target)
85665 +{
85666 +       int result;
85667 +
85668 +       tap_check(tap);
85669 +       result = tap_to(tap, target->node);
85670 +       if (result == 0)
85671 +               coord_dup(tap->coord, target);
85672 +       tap_check(tap);
85673 +       return result;
85674 +}
85675 +
85676 +/** return the list of taps kept in the current reiser4 context (get_current_context()->taps) */
85677 +reiser4_internal tap_list_head *
85678 +taps_list(void)
85679 +{
85680 +       return &get_current_context()->taps;
85681 +}
85682 +
85683 +/** helper for go_{next,prev}_{item,unit,node}(): step @tap one unit (@units_p != 0) or item towards @dir */
85684 +reiser4_internal int
85685 +go_dir_el(tap_t * tap, sideof dir, int units_p)
85686 +{
85687 +       coord_t dup;
85688 +       coord_t *coord;
85689 +       int result;
85690 +
85691 +       int (*coord_dir) (coord_t *);
85692 +       int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
85693 +       void (*coord_init) (coord_t *, const znode *);
85694 +       ON_DEBUG(int (*coord_check) (const coord_t *));
85695 +
85696 +       assert("nikita-2556", tap != NULL);
85697 +       assert("nikita-2557", tap->coord != NULL);
85698 +       assert("nikita-2558", tap->lh != NULL);
85699 +       assert("nikita-2559", tap->coord->node != NULL);
85700 +
85701 +       tap_check(tap);
85702 +       if (dir == LEFT_SIDE) {
85703 +               coord_dir = units_p ? coord_prev_unit : coord_prev_item;
85704 +               get_dir_neighbor = reiser4_get_left_neighbor;
85705 +               coord_init = coord_init_last_unit;
85706 +       } else {
85707 +               coord_dir = units_p ? coord_next_unit : coord_next_item;
85708 +               get_dir_neighbor = reiser4_get_right_neighbor;
85709 +               coord_init = coord_init_first_unit;
85710 +       }
85711 +       ON_DEBUG(coord_check = units_p ? coord_is_existing_unit : coord_is_existing_item);
85712 +       assert("nikita-2560", coord_check(tap->coord));
85713 +
85714 +       coord = tap->coord;
85715 +       coord_dup(&dup, coord); /* try the step on a copy first */
85716 +       if (coord_dir(&dup) != 0) {
85717 +               do {
85718 +                       /* step did not stay within this node: move to the neighboring node in @dir */
85719 +                       lock_handle next_lh;
85720 +
85721 +                       init_lh(&next_lh);
85722 +                       result = get_dir_neighbor(
85723 +                               &next_lh, coord->node, (int) tap->mode, GN_CAN_USE_UPPER_LEVELS);
85724 +                       if (result == 0) {
85725 +                               result = tap_move(tap, &next_lh);
85726 +                               if (result == 0)
85727 +                                       coord_init(tap->coord, next_lh.node);
85728 +                               done_lh(&next_lh);
85729 +                       }
85730 +                       /* skip empty nodes */
85731 +               } while ((result == 0) && node_is_empty(coord->node));
85732 +       } else {
85733 +               result = 0;
85734 +               coord_dup(coord, &dup);
85735 +       }
85736 +       assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
85737 +       tap_check(tap);
85738 +       return result;
85739 +}
85740 +
85741 +/**
85742 + * move @tap to the next unit, transparently crossing item and node
85743 + * boundaries. Returns the result of go_dir_el(@tap, RIGHT_SIDE, 1).
85744 + */
85745 +reiser4_internal int
85746 +go_next_unit(tap_t * tap)
85747 +{
85748 +       return go_dir_el(tap, RIGHT_SIDE, 1);
85749 +}
85750 +
85751 +/**
85752 + * move @tap to the previous unit, transparently crossing item and node
85753 + * boundaries. Returns the result of go_dir_el(@tap, LEFT_SIDE, 1).
85754 + */
85755 +reiser4_internal int
85756 +go_prev_unit(tap_t * tap)
85757 +{
85758 +       return go_dir_el(tap, LEFT_SIDE, 1);
85759 +}
85760 +
85761 +/**
85762 + * @shift times apply @actor to the @tap. This is used to move @tap by @shift units (or items, or nodes)
85763 + * in either direction. @tap is kept loaded meanwhile. Stops at and returns the first non-zero result.
85764 + */
85765 +reiser4_internal int
85766 +rewind_to(tap_t * tap, go_actor_t actor, int shift)
85767 +{
85768 +       int result;
85769 +
85770 +       assert("nikita-2555", shift >= 0);
85771 +       assert("nikita-2562", tap->coord->node == tap->lh->node);
85772 +
85773 +       tap_check(tap);
85774 +       result = tap_load(tap);
85775 +       if (result != 0)
85776 +               return result;
85777 +
85778 +       for (; shift > 0; --shift) {
85779 +               result = actor(tap);
85780 +               assert("nikita-2563", tap->coord->node == tap->lh->node);
85781 +               if (result != 0)
85782 +                       break;
85783 +       }
85784 +       tap_relse(tap); /* pairs with tap_load() above */
85785 +       tap_check(tap);
85786 +       return result;
85787 +}
85788 +
85789 +/** move @tap @shift units rightward by applying go_next_unit() @shift times; see rewind_to() */
85790 +reiser4_internal int
85791 +rewind_right(tap_t * tap, int shift)
85792 +{
85793 +       return rewind_to(tap, go_next_unit, shift);
85794 +}
85795 +
85796 +/** move @tap @shift units leftward by applying go_prev_unit() @shift times; see rewind_to() */
85797 +reiser4_internal int
85798 +rewind_left(tap_t * tap, int shift)
85799 +{
85800 +       return rewind_to(tap, go_prev_unit, shift);
85801 +}
85802 +
85803 +#if REISER4_DEBUG_OUTPUT
85804 +/** debugging function: print @tap content in human readable form, each line tagged with @prefix */
85805 +reiser4_internal void print_tap(const char * prefix, const tap_t * tap)
85806 +{
85807 +       if (tap == NULL) {
85808 +               printk("%s: null tap\n", prefix);
85809 +               return;
85810 +       }
85811 +       /* NOTE(review): "in-list" prints tap_list_is_clean(); confirm this is not the inverse of in-list */
85812 +       printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix,
85813 +              tap->loaded, tap_list_is_clean(tap), tap->lh->node,
85814 +              lock_mode_name(tap->mode));
85815 +       print_coord("\tcoord", tap->coord, 0);
85816 +}
85817 +#else
85818 +#define print_tap(prefix, tap) noop
85819 +#endif
85819 +
85820 +#if REISER4_DEBUG
85821 +/** check [tap-sane] invariant: returns 0 if @tap is sane, else the number (1..6) of the failed check */
85822 +static int tap_invariant(const tap_t * tap)
85823 +{
85824 +       /* [tap-sane] invariant */
85825 +
85826 +       if (tap == NULL)
85827 +               return 1;
85828 +       /* tap->mode is one of
85829 +        *
85830 +        * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
85831 +        */
85832 +       if (tap->mode != ZNODE_NO_LOCK &&
85833 +           tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
85834 +               return 2;
85835 +       /* tap->coord != NULL, and */
85836 +       if (tap->coord == NULL)
85837 +               return 3;
85838 +       /* tap->lh != NULL, and */
85839 +       if (tap->lh == NULL)
85840 +               return 4;
85841 +       /* tap->loaded != 0 => znode_is_loaded(tap->coord->node), and */
85842 +       if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
85843 +               return 5;
85844 +       /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
85845 +       if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
85846 +               return 6;
85847 +       return 0;
85848 +}
85849 +
85850 +/** debugging function: check internal @tap consistency; print the tap and panic if tap_invariant() fails */
85851 +static void tap_check(const tap_t * tap)
85852 +{
85853 +       int result;
85854 +
85855 +       result = tap_invariant(tap);
85856 +       if (result != 0) {
85857 +               print_tap("broken", tap);
85858 +               reiser4_panic("nikita-2831", "tap broken: %i\n", result); /* %i: code from tap_invariant() */
85859 +       }
85860 +}
85861 +#endif
85862 +
85863 +/* Make Linus happy.
85864 +   Local variables:
85865 +   c-indentation-style: "K&R"
85866 +   mode-name: "LC"
85867 +   c-basic-offset: 8
85868 +   tab-width: 8
85869 +   fill-column: 120
85870 +   scroll-step: 1
85871 +   End:
85872 +*/
85873 diff -rupN linux-2.6.8-rc3/fs/reiser4/tap.h linux-2.6.8-rc3-a/fs/reiser4/tap.h
85874 --- linux-2.6.8-rc3/fs/reiser4/tap.h    1970-01-01 03:00:00.000000000 +0300
85875 +++ linux-2.6.8-rc3-a/fs/reiser4/tap.h  2004-08-05 21:20:53.029671816 +0400
85876 @@ -0,0 +1,75 @@
85877 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
85878 +
85879 +/* Tree Access Pointers. See tap.c for more details. */
85880 +
85881 +#if !defined( __REISER4_TAP_H__ )
85882 +#define __REISER4_TAP_H__
85883 +
85884 +#include "forward.h"
85885 +#include "type_safe_list.h"
85886 +#include "readahead.h"
85887 +
85888 +TYPE_SAFE_LIST_DECLARE(tap);
85889 +
85890 +/**
85891 +    tree_access_pointer aka tap. Data structure combining coord_t and lock
85892 +    handle; both are referenced through pointers, the tap does not embed them.
85893 +    Invariants involving this data-type, see doc/lock-ordering for details:
85894 +
85895 +      [tap-sane]
85896 + */
85897 +struct tree_access_pointer {
85898 +       /* coord tap is at */
85899 +       coord_t *coord;
85900 +       /* lock handle on ->coord->node */
85901 +       lock_handle *lh;
85902 +       /* mode of lock acquired by this tap */
85903 +       znode_lock_mode mode;
85904 +       /* load count: incremented by tap_load(), decremented by tap_relse(); 0 means not loaded */
85905 +       int loaded;
85906 +       /* linkage into the list of taps (see taps_list()) */
85907 +       tap_list_link linkage;
85908 +       /* read-ahead hint */
85909 +       ra_info_t ra_info;
85910 +};
85911 +
85912 +TYPE_SAFE_LIST_DEFINE(tap, tap_t, linkage);
85913 +
85914 +typedef int (*go_actor_t) (tap_t * tap);
85915 +
85916 +extern int tap_load(tap_t * tap);
85917 +extern void tap_relse(tap_t * tap);
85918 +extern void tap_init(tap_t * tap, coord_t * coord, lock_handle * lh, znode_lock_mode mode);
85919 +extern void tap_monitor(tap_t * tap);
85920 +extern void tap_copy(tap_t * dst, tap_t * src);
85921 +extern void tap_done(tap_t * tap);
85922 +extern int tap_move(tap_t * tap, lock_handle * target);
85923 +extern int tap_to(tap_t * tap, znode * target);
85924 +extern int tap_to_coord(tap_t * tap, coord_t * target);
85925 +
85926 +extern int go_dir_el(tap_t * tap, sideof dir, int units_p);
85927 +extern int go_next_unit(tap_t * tap);
85928 +extern int go_prev_unit(tap_t * tap);
85929 +extern int rewind_to(tap_t * tap, go_actor_t actor, int shift);
85930 +extern int rewind_right(tap_t * tap, int shift);
85931 +extern int rewind_left(tap_t * tap, int shift);
85932 +
85933 +extern tap_list_head *taps_list(void);
85934 +/* iterate @tap over every element of taps_list(), front to end */
85935 +#define for_all_taps( tap )                            \
85936 +       for (tap = tap_list_front ( taps_list() );      \
85937 +                ! tap_list_end   ( taps_list(), tap ); \
85938 +            tap = tap_list_next  ( tap ) )
85939 +
85940 +/* __REISER4_TAP_H__ */
85941 +#endif
85942 +/* Make Linus happy.
85943 +   Local variables:
85944 +   c-indentation-style: "K&R"
85945 +   mode-name: "LC"
85946 +   c-basic-offset: 8
85947 +   tab-width: 8
85948 +   fill-column: 120
85949 +   scroll-step: 1
85950 +   End:
85951 +*/
85952 diff -rupN linux-2.6.8-rc3/fs/reiser4/tree.c linux-2.6.8-rc3-a/fs/reiser4/tree.c
85953 --- linux-2.6.8-rc3/fs/reiser4/tree.c   1970-01-01 03:00:00.000000000 +0300
85954 +++ linux-2.6.8-rc3-a/fs/reiser4/tree.c 2004-08-05 21:20:53.266621837 +0400
85955 @@ -0,0 +1,1829 @@
85956 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
85957 + * reiser4/README */
85958 +
85959 +/*
85960 + * KEYS IN A TREE.
85961 + *
85962 + * The tree consists of nodes located on the disk. Node in the tree is either
85963 + * formatted or unformatted. Formatted node is one that has structure
85964 + * understood by the tree balancing and traversal code. Formatted nodes are
85965 + * further classified into leaf and internal nodes. The latter distinction is
85966 + * (almost) of only historical importance: general structure of leaves and
85967 + * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
85968 + * that are part of bodies of ordinary files and attributes.
85969 + *
85970 + * Each node in the tree spans some interval in the key space. Key ranges for
85971 + * all nodes in the tree are disjoint. Actually, this only holds in some weak
85972 + * sense, because of the non-unique keys: intersection of key ranges for
85973 + * different nodes is either empty, or consists of exactly one key.
85974 + *
85975 + * Formatted node consists of a sequence of items. Each item spans some
85976 + * interval in key space. Key ranges for all items in a tree are disjoint,
85977 + * modulo non-unique keys again. Items within nodes are ordered in the key
85978 + * order of the smallest key in an item.
85979 + *
85980 + * Particular type of item can be further split into units. Unit is piece of
85981 + * item that can be cut from item and moved into another item of the same
85982 + * type. Units are used by balancing code to repack data during balancing.
85983 + *
85984 + * Unit can be further split into smaller entities (for example, extent unit
85985 + * represents several pages, and it is natural for extent code to operate on
85986 + * particular pages and even bytes within one unit), but this is of no
85987 + * relevance to the generic balancing and lookup code.
85988 + *
85989 + * Although item is said to "span" range or interval of keys, it is not
85990 + * necessary that item contains piece of data addressable by each and every
85991 + * key in this range. For example, compound directory item, consisting of
85992 + * units corresponding to directory entries and keyed by hashes of file names,
85993 + * looks more as having "discrete spectrum": only some disjoint keys inside
85994 + * range occupied by this item really address data.
85995 + *
85996 + * Nonetheless, each item always has well-defined least (minimal) key, that
85997 + * is recorded in item header, stored in the node this item is in. Also, item
85998 + * plugin can optionally define method ->max_key_inside() returning maximal
85999 + * key that can _possibly_ be located within this item. This method is used
86000 + * (mainly) to determine when given piece of data should be merged into
86001 + * existing item, instead of creating new one. Because of this, even though
86002 + * ->max_key_inside() can be larger than any key actually located in the item,
86003 + * intervals
86004 + *
86005 + * [ min_key( item ), ->max_key_inside( item ) ]
86006 + *
86007 + * are still disjoint for all items within the _same_ node.
86008 + *
86009 + * In memory node is represented by znode. It plays several roles:
86010 + *
86011 + *  . something locks are taken on
86012 + *
86013 + *  . something tracked by transaction manager (this is going to change)
86014 + *
86015 + *  . something used to access node data
86016 + *
86017 + *  . something used to maintain tree structure in memory: sibling and
86018 + *  parental linkage.
86019 + *
86020 + *  . something used to organize nodes into "slums"
86021 + *
86022 + * More on znodes see in znode.[ch]
86023 + *
86024 + * DELIMITING KEYS
86025 + *
86026 + *   To simplify balancing, allow some flexibility in locking and speed up
86027 + *   important coord cache optimization, we keep delimiting keys of nodes in
86028 + *   memory. Depending on disk format (implemented by appropriate node plugin)
86029 + *   node on disk can record both left and right delimiting key, only one of
86030 + *   them, or none. Still, our balancing and tree traversal code keep both
86031 + *   delimiting keys for a node that is in memory stored in the znode. When
86032 + *   node is first brought into memory during tree traversal, its left
86033 + *   delimiting key is taken from its parent, and its right delimiting key is
86034 + *   either next key in its parent, or is right delimiting key of parent if
86035 + *   node is the rightmost child of parent.
86036 + *
86037 + *   Physical consistency of delimiting key is protected by special dk
86038 + *   read-write lock. That is, delimiting keys can only be inspected or
86039 + *   modified under this lock. But dk lock is only sufficient for fast
86040 + *   "pessimistic" check, because to simplify code and to decrease lock
86041 + *   contention, balancing (carry) only updates delimiting keys right before
86042 + *   unlocking all locked nodes on the given tree level. For example,
86043 + *   coord-by-key cache scans LRU list of recently accessed znodes. For each
86044 + *   node it first does fast check under dk read-write lock. If key looked for is
86045 + *   not between delimiting keys for this node, next node is inspected and so
86046 + *   on. If key is inside of the key range, long term lock is taken on node
86047 + *   and key range is rechecked.
86048 + *
86049 + * COORDINATES
86050 + *
86051 + *   To find something in the tree, you supply a key, and the key is resolved
86052 + *   by coord_by_key() into a coord (coordinate) that is valid as long as the
86053 + *   node the coord points to remains locked.  As mentioned above trees
86054 + *   consist of nodes that consist of items that consist of units. A unit is
86055 + *   the smallest and indivisible piece of tree as far as balancing and tree
86056 + *   search are concerned. Each node, item, and unit can be addressed by
86057 + *   giving its level in the tree and the key occupied by this entity.  A node
86058 + *   knows what the key ranges are of the items within it, and how to find its
86059 + *   items and invoke their item handlers, but it does not know how to access
86060 + *   individual units within its items except through the item handlers.
86061 + *   coord is a structure containing a pointer to the node, the ordinal number
86062 + *   of the item within this node (a sort of item offset), and the ordinal
86063 + *   number of the unit within this item.
86064 + *
86065 + * TREE LOOKUP
86066 + *
86067 + *   There are two types of access to the tree: lookup and modification.
86068 + *
86069 + *   Lookup is a search for the key in the tree. Search can look for either
86070 + *   exactly the key given to it, or for the largest key that is not greater
86071 + *   than the key given to it. This distinction is determined by "bias"
86072 + *   parameter of search routine (coord_by_key()). coord_by_key() either
86073 + *   returns error (key is not in the tree, or some kind of external error
86074 + *   occurred), or successfully resolves key into coord.
86075 + *
86076 + *   This resolution is done by traversing tree top-to-bottom from root level
86077 + *   to the desired level. On levels above twig level (level one above the
86078 + *   leaf level) nodes consist exclusively of internal items. Internal item is
86079 + *   nothing more than pointer to the tree node on the child level. On twig
86080 + *   level nodes consist of internal items intermixed with extent
86081 + *   items. Internal items form normal search tree structure used by traversal
86082 + *   to descend through the tree.
86083 + *
86084 + * TREE LOOKUP OPTIMIZATIONS
86085 + *
86086 + * Tree lookup described above is expensive even if all nodes traversed are
86087 + * already in the memory: for each node binary search within it has to be
86088 + * performed and binary searches are CPU consuming and tend to destroy CPU
86089 + * caches.
86090 + *
86091 + * Several optimizations are used to work around this:
86092 + *
86093 + *   . cbk_cache (look-aside cache for tree traversals, see search.c for
86094 + *   details)
86095 + *
86096 + *   . seals (see seal.[ch])
86097 + *
86098 + *   . vroot (see search.c)
86099 + *
86100 + * General search-by-key is layered thusly:
86101 + *
86102 + *                   [check seal, if any]   --ok--> done
86103 + *                           |
86104 + *                         failed
86105 + *                           |
86106 + *                           V
86107 + *                     [vroot defined] --no--> node = tree_root
86108 + *                           |                   |
86109 + *                          yes                  |
86110 + *                           |                   |
86111 + *                           V                   |
86112 + *                       node = vroot            |
86113 + *                                 |             |
86114 + *                                 |             |
86115 + *                                 |             |
86116 + *                                 V             V
86117 + *                            [check cbk_cache for key]  --ok--> done
86118 + *                                        |
86119 + *                                      failed
86120 + *                                        |
86121 + *                                        V
86122 + *                       [start tree traversal from node]
86123 + *
86124 + */
86125 +
86126 +#include "forward.h"
86127 +#include "debug.h"
86128 +#include "dformat.h"
86129 +#include "key.h"
86130 +#include "coord.h"
86131 +#include "plugin/item/static_stat.h"
86132 +#include "plugin/item/item.h"
86133 +#include "plugin/node/node.h"
86134 +#include "plugin/plugin.h"
86135 +#include "txnmgr.h"
86136 +#include "jnode.h"
86137 +#include "znode.h"
86138 +#include "block_alloc.h"
86139 +#include "tree_walk.h"
86140 +#include "carry.h"
86141 +#include "carry_ops.h"
86142 +#include "tap.h"
86143 +#include "tree.h"
86144 +#include "log.h"
86145 +#include "vfs_ops.h"
86146 +#include "page_cache.h"
86147 +#include "super.h"
86148 +#include "reiser4.h"
86149 +#include "inode.h"
86150 +
86151 +#include <linux/fs.h>          /* for struct super_block  */
86152 +#include <linux/spinlock.h>
86153 +
86154 +/* Disk address (block number) never ever used for any real tree node. This is
86155 +   used as block number of "uber" znode.
86156 +
86157 +   Invalid block addresses are 0 by tradition.
86158 +
86159 +*/
86160 +const reiser4_block_nr UBER_TREE_ADDR = 0ull;
86161 +
86162 +#define CUT_TREE_MIN_ITERATIONS 64
86163 +
86164 +/* return node plugin (->nplug) of coord->node; @coord and coord->node are asserted to be non-NULL */
86165 +reiser4_internal node_plugin *
86166 +node_plugin_by_coord(const coord_t * coord)
86167 +{
86168 +       assert("vs-1", coord != NULL);
86169 +       assert("vs-2", coord->node != NULL);
86170 +
86171 +       return coord->node->nplug;
86172 +}
86173 +
86174 +/* Look up @key in @tree at @stop_level under a write lock and, if no item
86175 + * with that key exists yet, create one from @data at the position found.
86176 + *
86177 + * On return @coord and @lh describe the insertion point, so that they can be
86178 + * used by a subsequent insert operation. Returns IBK_ALREADY_EXISTS when the
86179 + * key is already in the tree, the result of insert_by_coord() when it is
86180 + * not, and otherwise whatever coord_by_key() returned. */
86181 +reiser4_internal insert_result
86182 +insert_by_key(reiser4_tree * tree /* tree to insert new item into */ ,
86183 +             const reiser4_key * key /* key of new item */ ,
86184 +             reiser4_item_data * data /* parameters for item creation */ ,
86185 +             coord_t * coord /* resulting insertion coord */ ,
86186 +             lock_handle * lh /* resulting lock handle */ ,
86187 +             tree_level stop_level /* level where to insert */ ,
86188 +             __u32 flags /* insertion flags */ )
86189 +{
86190 +       int lookup;
86191 +
86192 +       assert("nikita-358", tree != NULL);
86193 +       assert("nikita-360", coord != NULL);
86194 +
86195 +       lookup = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK, FIND_EXACT,
86196 +                             stop_level, stop_level, flags | CBK_FOR_INSERT,
86197 +                             0 /*ra_info */ );
86198 +       if (lookup == CBK_COORD_FOUND)
86199 +               return IBK_ALREADY_EXISTS;
86200 +       if (lookup != CBK_COORD_NOTFOUND)
86201 +               /* neither found nor not-found: pass lookup result back as is */
86202 +               return lookup;
86203 +
86204 +       assert("nikita-2017", coord->node != NULL);
86205 +       /* note: @flags only steer the lookup; insertion is given 0 flags */
86206 +       return insert_by_coord(coord, data, key, lh, 0 /*flags */ );
86207 +}
86208 +
86209 +/* Insert item by running full carry(). Helper for insert_by_coord() (when
86210 +   short-cut insertion is impossible) and insert_extent_by_coord(). @flags of 0
86211 +   selects the tree's carry.insert_flags; non-NULL @lh is tracked by carry.
86212 +   Returns result of carry(), or an error if @cop could not be posted. */
86213 +static insert_result
86214 +insert_with_carry_by_coord(coord_t * coord /* coord where to insert */ ,
86215 +                          lock_handle * lh /* lock handle of insertion node */ ,
86216 +                          reiser4_item_data * data /* parameters of new item */ ,
86217 +                          const reiser4_key * key /* key of new item */ ,
86218 +                          carry_opcode cop /* carry operation to perform */ ,
86219 +                          cop_insert_flag flags /* carry flags */)
86220 +{
86221 +       int result;
86222 +       carry_pool pool;
86223 +       carry_level lowest_level;
86224 +       carry_op *op;
86225 +       carry_insert_data cdata;
86226 +
86227 +       assert("umka-314", coord != NULL);
86228 +
86229 +       init_carry_pool(&pool);
86230 +       init_carry_level(&lowest_level, &pool);
86231 +       op = post_carry(&lowest_level, cop, coord->node, 0);
86232 +       if (IS_ERR(op) || (op == NULL)) {
86233 +               /* pair init_carry_pool() above with done_carry_pool() here too */
86234 +               done_carry_pool(&pool);
86235 +               return RETERR(op ? PTR_ERR(op) : -EIO);
86236 +       }
86237 +       cdata.coord = coord;
86238 +       cdata.data = data;
86239 +       cdata.key = key;
86240 +       op->u.insert.d = &cdata;
86241 +       if (flags == 0)
86242 +               flags = znode_get_tree(coord->node)->carry.insert_flags;
86243 +       op->u.insert.flags = flags;
86244 +       op->u.insert.type = COPT_ITEM_DATA;
86245 +       op->u.insert.child = 0;
86246 +       if (lh != NULL) {
86247 +               assert("nikita-3245", lh->node == coord->node);
86248 +               lowest_level.track_type = CARRY_TRACK_CHANGE;
86249 +               lowest_level.tracked = lh;
86250 +       }
86251 +       ON_STATS(lowest_level.level_no = znode_get_level(coord->node));
86252 +       result = carry(&lowest_level, 0);
86253 +       done_carry_pool(&pool);
86254 +       return result;
86255 +}
86256 +
86257 +/* Form carry queue to perform paste of @data with @key at @coord, and launch
86258 +   its execution by calling carry().
86259 +
86260 +   If @lh is not NULL, carry is instructed (CARRY_TRACK_CHANGE) to update it
86261 +   if, after balancing, insertion coord moves into a different block. If
86262 +   @flags is 0, the tree's default carry.paste_flags are used. Returns result
86263 +   of carry(), or an error if the paste operation could not be posted. */
86264 +static int
86265 +paste_with_carry(coord_t * coord /* coord of paste */ ,
86266 +                lock_handle * lh /* lock handle of node where item is pasted */ ,
86267 +                reiser4_item_data * data /* parameters of new item */ ,
86268 +                const reiser4_key * key /* key of new item */ ,
86269 +                unsigned flags /* paste flags */ )
86270 +{
86271 +       int result;
86272 +       carry_pool pool;
86273 +       carry_level lowest_level;
86274 +       carry_op *op;
86275 +       carry_insert_data cdata;
86276 +
86277 +       assert("umka-315", coord != NULL);
86278 +       assert("umka-316", key != NULL);
86279 +
86280 +       init_carry_pool(&pool);
86281 +       init_carry_level(&lowest_level, &pool);
86282 +
86283 +       op = post_carry(&lowest_level, COP_PASTE, coord->node, 0);
86284 +       if (IS_ERR(op) || (op == NULL)) {
86285 +               /* release the pool initialized above before bailing out */
86286 +               done_carry_pool(&pool);
86287 +               return RETERR(op ? PTR_ERR(op) : -EIO);
86288 +       }
86289 +       cdata.coord = coord;
86290 +       cdata.data = data;
86291 +       cdata.key = key;
86292 +       op->u.paste.d = &cdata;
86293 +       if (flags == 0)
86294 +               flags = znode_get_tree(coord->node)->carry.paste_flags;
86295 +       op->u.paste.flags = flags;
86296 +       op->u.paste.type = COPT_ITEM_DATA;
86297 +       if (lh != NULL) {
86298 +               lowest_level.track_type = CARRY_TRACK_CHANGE;
86299 +               lowest_level.tracked = lh;
86300 +       }
86301 +
86302 +       ON_STATS(lowest_level.level_no = znode_get_level(coord->node));
86303 +       result = carry(&lowest_level, 0);
86304 +       done_carry_pool(&pool);
86305 +
86306 +       return result;
86307 +}
86308 +
86309 +/* insert item at the given coord.
86310 +
86311 +   First try to skip carry by directly calling ->create_item() method of node
86312 +   plugin. If this is impossible (there is not enough free space in the node,
86313 +   or leftmost item in the node is created), call insert_with_carry_by_coord()
86314 +   that will do full carry(). Returns -E_NODE_FULL, without attempting the
86315 +   insertion, when the item does not fit and @flags forbid shifting left,
86316 +   shifting right and allocation; returns zload() error if node can't be loaded. */
86317 +reiser4_internal insert_result
86318 +insert_by_coord(coord_t * coord        /* coord where to
86319 +                                                  * insert. coord->node has
86320 +                                                  * to be write locked by
86321 +                                                  * caller */ ,
86322 +                             reiser4_item_data * data  /* data to be
86323 +                                                        * inserted */ ,
86324 +                             const reiser4_key * key /* key of new item */ ,
86325 +                             lock_handle * lh  /* lock handle of write
86326 +                                                  * lock on node */ ,
86327 +                             __u32 flags /* insertion flags */ )
86328 +{
86329 +       unsigned item_size;
86330 +       int result;
86331 +       znode *node;
86332 +
86333 +       assert("vs-247", coord != NULL);
86334 +       assert("vs-248", data != NULL);
86335 +       assert("vs-249", data->length >= 0);
86336 +       assert("nikita-1191", znode_is_write_locked(coord->node));
86337 +
86338 +       write_tree_log(znode_get_tree(coord->node), tree_insert, key, data, coord, flags);
86339 +
86340 +       node = coord->node;
86341 +       coord_clear_iplug(coord);
86342 +       result = zload(node);
86343 +       if (result != 0)
86344 +               return result;
86345 +
86346 +       item_size = space_needed(node, NULL, data, 1);
86347 +       if (item_size > znode_free_space(node) &&
86348 +           (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT) && (flags & COPI_DONT_ALLOCATE)) {
86349 +               /* we are forced to use free space of coord->node and new item
86350 +                  does not fit into it.
86351 +
86352 +                  Currently we get here only when we allocate and copy units
86353 +                  of extent item from a node to its left neighbor during
86354 +                  "squalloc"-ing.  If @node (this is left neighbor) does not
86355 +                  have enough free space - we do not want to attempt any
86356 +                  shifting and allocations because we are in squeezing and
86357 +                  everything to the left of @node is tightly packed.
86358 +               */
86359 +               result = -E_NODE_FULL;
86360 +       } else if ((item_size <= znode_free_space(node)) &&
86361 +                  !coord_is_before_leftmost(coord) &&
86362 +                  (node_plugin_by_node(node)->fast_insert != NULL) && node_plugin_by_node(node)->fast_insert(coord)) {
86363 +               /* shortcut insertion without carry() overhead.
86364 +
86365 +                  Only possible if:
86366 +
86367 +                  - there is enough free space
86368 +
86369 +                  - insertion is not into the leftmost position in a node
86370 +                    (otherwise it would require updating of delimiting key in a
86371 +                    parent)
86372 +
86373 +                  - node plugin agrees with this
86374 +
86375 +                  NOTE(review): node is dirtied even if ->create_item() fails - confirm. */
86376 +               reiser4_stat_inc(tree.fast_insert);
86377 +               result = node_plugin_by_node(node)->create_item(coord, key, data, NULL);
86378 +               znode_make_dirty(node);
86379 +       } else {
86380 +               /* otherwise do full-fledged carry(). */
86381 +               result = insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT, flags);
86382 +       }
86383 +       zrelse(node);
86384 +       return result;
86385 +}
86386 +
86387 +/* @coord is set to leaf level and @data is to be inserted to twig level */
86388 +reiser4_internal insert_result
86389 +insert_extent_by_coord(coord_t * coord /* coord where to insert. coord->node has to be write locked by caller */ ,
86390 +                      reiser4_item_data * data /* data to be inserted */ ,
86391 +                      const reiser4_key * key /* key of new item */ ,
86392 +                      lock_handle * lh /* lock handle of write lock on node */)
86393 +{
86394 +       assert("vs-405", coord != NULL);
86395 +       assert("vs-406", data != NULL);
86396 +       assert("vs-407", data->length > 0);
86397 +       assert("vs-408", znode_is_write_locked(coord->node));
86398 +       assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
86399 +       /* always goes through carry (COP_EXTENT); flags of 0 select the tree's carry.insert_flags */
86400 +       return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT, 0 /*flags */ );
86401 +}
86402 +
86403 +/* Insert into the item at the given coord.
86404 +
86405 +   First try to skip carry by directly calling ->paste() method of item
86406 +   plugin. If this is impossible (there is not enough free space in the node,
86407 +   or we are pasting into leftmost position in the node), call
86408 +   paste_with_carry() that will do full carry().
86409 +   Returns -E_NODE_FULL if data does not fit into coord->node and @flags
86410 +   forbid shifting left, shifting right and allocation. */
86411 +/* paste_into_item */
86412 +reiser4_internal int
86413 +insert_into_item(coord_t * coord /* coord of pasting */ ,
86414 +                lock_handle * lh /* lock handle on node involved */ ,
86415 +                const reiser4_key * key /* key of unit being pasted */ ,
86416 +                reiser4_item_data * data /* parameters for new unit */ ,
86417 +                unsigned flags /* insert/paste flags */ )
86418 +{
86419 +       int result;
86420 +       int size_change;
86421 +       node_plugin *nplug;
86422 +       item_plugin *iplug;
86423 +
86424 +       assert("umka-317", coord != NULL);
86425 +       assert("umka-318", key != NULL);
86426 +
86427 +       iplug = item_plugin_by_coord(coord);
86428 +       nplug = node_plugin_by_coord(coord);
86429 +
86430 +       assert("nikita-1480", iplug == data->iplug);
86431 +
86432 +       write_tree_log(znode_get_tree(coord->node), tree_paste, key, data, coord, flags);
86433 +
86434 +       size_change = space_needed(coord->node, coord, data, 0);
86435 +       if (size_change > (int) znode_free_space(coord->node) &&
86436 +           (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT) && (flags & COPI_DONT_ALLOCATE)) {
86437 +               /* we are forced to use free space of coord->node and new data
86438 +                  does not fit into it. */
86439 +               return -E_NODE_FULL;
86440 +       }
86441 +
86442 +       /* shortcut paste without carry() overhead.
86443 +
86444 +          Only possible if:
86445 +          - there is enough free space
86446 +          - paste is not into the leftmost unit in a node (otherwise
86447 +            it would require updating of delimiting key in a parent)
86448 +          - node plugin agrees with this
86449 +          - item plugin agrees with us
86450 +
86451 +          NOTE(review): the test below also requires coord->unit_pos != 0 on
86452 +          its own, which makes the preceding (item_pos || unit_pos ||
86453 +          AFTER_UNIT) clause redundant -- confirm which one is intended.
86454 +       */
86455 +       if (size_change <= (int) znode_free_space(coord->node) &&
86456 +           (coord->item_pos != 0 ||
86457 +            coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
86458 +           coord->unit_pos != 0 && nplug->fast_paste != NULL &&
86459 +           nplug->fast_paste(coord) &&
86460 +           iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
86461 +               reiser4_stat_inc(tree.fast_paste);
86462 +               if (size_change > 0)
86463 +                       nplug->change_item_size(coord, size_change);
86464 +               /* NOTE-NIKITA: huh? where @key is used? */
86465 +               result = iplug->b.paste(coord, data, NULL);
86466 +               if (size_change < 0)
86467 +                       nplug->change_item_size(coord, size_change);
86468 +               znode_make_dirty(coord->node);
86469 +       } else
86470 +               /* otherwise do full-fledged carry(). */
86471 +               result = paste_with_carry(coord, lh, data, key, flags);
86472 +       return result;
86473 +}
86474 +
86475 +/* This either appends to or truncates item at @coord.
86476 +
86477 +   Negative data->length goes to ->shrink_item() of the node plugin, positive
86478 +   one is pasted through insert_into_item(). Node is kept loaded around the
86479 +   operation. Returns result of the operation, or zload() error. */
86480 +reiser4_internal int
86481 +resize_item(coord_t * coord /* coord of item being resized */ ,
86482 +           reiser4_item_data * data /* parameters of resize */ ,
86483 +           reiser4_key * key /* key of new unit */ ,
86484 +           lock_handle * lh /* lock handle of node being modified */ ,
86485 +           cop_insert_flag flags /* carry flags */ )
86486 +{
86487 +       int result;
86488 +       znode *node;
86489 +
86490 +       assert("nikita-362", coord != NULL);
86491 +       assert("nikita-363", data != NULL);
86492 +       assert("vs-245", data->length != 0);
86493 +
86494 +       node = coord->node;
86495 +       coord_clear_iplug(coord);
86496 +       result = zload(node);
86497 +       if (result != 0)
86498 +               return result;
86499 +
86500 +       /* no carry pool is set up here: neither branch below uses one
86501 +          directly (paste_with_carry() sets up its own when carry is needed) */
86502 +       if (data->length < 0)
86503 +               result = node_plugin_by_coord(coord)->shrink_item(coord,
86504 +                                                                 data->length);
86505 +       else
86506 +               result = insert_into_item(coord, lh, key, data, flags);
86507 +
86508 +       zrelse(node);
86509 +       return result;
86510 +}
86511 +
86512 +/* Insert flow @f at @coord by posting a single COP_INSERT_FLOW carry
86513 +   operation on coord->node and running carry(). @lh is tracked by carry
86514 +   (CARRY_TRACK_CHANGE). Returns result of carry(), or an error if the
86515 +   operation could not be posted. */
86516 +reiser4_internal int
86517 +insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
86518 +{
86519 +       int result;
86520 +       carry_pool pool;
86521 +       carry_level lowest_level;
86522 +       carry_op *op;
86523 +       reiser4_item_data data;
86524 +
86525 +       init_carry_pool(&pool);
86526 +       init_carry_level(&lowest_level, &pool);
86527 +       op = post_carry(&lowest_level, COP_INSERT_FLOW, coord->node, 0 /* operate directly on coord -> node */ );
86528 +       if (IS_ERR(op) || (op == NULL)) {
86529 +               done_carry_pool(&pool); /* release pool set up above */
86530 +               return RETERR(op ? PTR_ERR(op) : -EIO);
86531 +       }
86532 +       /* these are permanent during insert_flow */
86533 +       data.user = 1;
86534 +       data.iplug = item_plugin_by_id(FORMATTING_ID);
86535 +       data.arg = 0;
86536 +       /* data.length and data.data will be set before calling paste or
86537 +          insert */
86538 +       data.length = 0;
86539 +       data.data = 0;
86540 +
86541 +       op->u.insert_flow.flags = 0;
86542 +       op->u.insert_flow.insert_point = coord;
86543 +       op->u.insert_flow.flow = f;
86544 +       op->u.insert_flow.data = &data;
86545 +       op->u.insert_flow.new_nodes = 0;
86546 +       lowest_level.track_type = CARRY_TRACK_CHANGE;
86547 +       lowest_level.tracked = lh;
86548 +       ON_STATS(lowest_level.level_no = znode_get_level(coord->node));
86549 +       result = carry(&lowest_level, 0);
86550 +       done_carry_pool(&pool);
86551 +       return result;
86552 +}
86553 +
86554 +/* Given a coord in parent node, obtain a znode for the corresponding child; result may be NULL or an ERR_PTR() */
86555 +reiser4_internal znode *
86556 +child_znode(const coord_t * parent_coord       /* coord of pointer to
86557 +                                                * child */ ,
86558 +           znode * parent /* parent of child */ ,
86559 +           int incore_p        /* if !0 only return child if already in
86560 +                                * memory */ ,
86561 +           int setup_dkeys_p   /* if !0 update delimiting keys of
86562 +                                * child */ )
86563 +{
86564 +       znode *child;
86565 +
86566 +       assert("nikita-1374", parent_coord != NULL);
86567 +       assert("nikita-1482", parent != NULL);
86568 +       assert("nikita-1384", ergo(setup_dkeys_p,
86569 +                                  rw_dk_is_not_locked(znode_get_tree(parent))));
86570 +       assert("nikita-2947", znode_is_any_locked(parent));
86571 +       /* ERR_PTR(-EIO) unless @parent is above leaf level and @parent_coord is an internal item */
86572 +       if (znode_get_level(parent) <= LEAF_LEVEL) {
86573 +               /* trying to get child of leaf node */
86574 +               warning("nikita-1217", "Child of maize?");
86575 +               print_znode("node", parent);
86576 +               return ERR_PTR(RETERR(-EIO));
86577 +       }
86578 +       if (item_is_internal(parent_coord)) {
86579 +               reiser4_block_nr addr;
86580 +               item_plugin *iplug;
86581 +               reiser4_tree *tree;
86582 +
86583 +               iplug = item_plugin_by_coord(parent_coord);
86584 +               assert("vs-512", iplug->s.internal.down_link);
86585 +               iplug->s.internal.down_link(parent_coord, NULL, &addr);
86586 +               /* @addr, filled in by ->down_link(), is the block number the child is looked up by */
86587 +               tree = znode_get_tree(parent);
86588 +               if (incore_p)
86589 +                       child = zlook(tree, &addr);
86590 +               else
86591 +                       child = zget(tree, &addr, parent, znode_get_level(parent) - 1, GFP_KERNEL);
86592 +               if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
86593 +                       set_child_delimiting_keys(parent, parent_coord, child);
86594 +       } else {
86595 +               warning("nikita-1483", "Internal item expected");
86596 +               print_znode("node", parent);
86597 +               child = ERR_PTR(RETERR(-EIO));
86598 +       }
86599 +       return child;
86600 +}
86601 +
86602 +/* remove znode from transaction: settle its block accounting, then uncapture its page or, if it has none, the jnode itself */
86603 +static void uncapture_znode (znode * node)
86604 +{
86605 +       struct page * page;
86606 +
86607 +       assert ("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
86608 +
86609 +       /* Get e-flush block allocation back before deallocating node's
86610 +        * block number. */
86611 +       spin_lock_znode(node);
86612 +       if (ZF_ISSET(node, JNODE_EFLUSH))
86613 +               eflush_del(ZJNODE(node), 0);
86614 +       spin_unlock_znode(node);
86615 +
86616 +       if (!blocknr_is_fake(znode_get_block(node))) {
86617 +               int ret;
86618 +
86619 +               /* An already allocated block goes right to the atom's delete set. */
86620 +               ret = reiser4_dealloc_block(
86621 +                       znode_get_block(node), 0, BA_DEFER | BA_FORMATTED);
86622 +               if (ret)
86623 +                       warning("zam-942", "can\'t add a block (%llu) number to atom's delete set\n",
86624 +                                       (unsigned long long)(*znode_get_block(node)));
86625 +               /* a failed deallocation is only reported; uncapture proceeds regardless */
86626 +               spin_lock_znode(node);
86627 +               /* Here we return flush reserved block which was reserved at the
86628 +                * moment when this allocated node was marked dirty and still
86629 +                * not used by flush in node relocation procedure.  */
86630 +               if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
86631 +                       txn_atom * atom ;
86632 +
86633 +                       atom = jnode_get_atom(ZJNODE(node));
86634 +                       assert("zam-939", atom != NULL);
86635 +                       spin_unlock_znode(node);
86636 +                       flush_reserved2grabbed(atom, (__u64)1);
86637 +                       UNLOCK_ATOM(atom);
86638 +               } else
86639 +                       spin_unlock_znode(node);
86640 +       } else {
86641 +               /* znode has assigned block which is counted as "fake
86642 +                  allocated". Return it back to "free blocks" */
86643 +               fake_allocated2free((__u64) 1, BA_FORMATTED);
86644 +       }
86645 +
86646 +       /*
86647 +        * uncapture page from transaction. There is a possibility of a race
86648 +        * with ->releasepage(): reiser4_releasepage() detaches page from this
86649 +        * jnode and we have nothing to uncapture. To avoid this, get
86650 +        * reference of node->pg under jnode spin lock. uncapture_page() will
86651 +        * deal with released page itself.
86652 +        */
86653 +       spin_lock_znode(node);
86654 +       page = znode_page(node);
86655 +       if (likely(page != NULL)) {
86656 +               /*
86657 +                * uncapture_page() can only be called when we are sure that
86658 +                * znode is pinned in memory, which we are, because
86659 +                * forget_znode() is only called from longterm_unlock_znode().
86660 +                */
86661 +               page_cache_get(page);
86662 +               spin_unlock_znode(node);
86663 +               lock_page(page);
86664 +               uncapture_page(page);
86665 +               unlock_page(page);
86666 +               page_cache_release(page);
86667 +       } else {
86668 +               txn_atom * atom;
86669 +
86670 +               /* handle "flush queued" znodes: wait on the atom while the node is flush queued and the atom has running queues */
86671 +               while (1) {
86672 +                       atom = jnode_get_atom(ZJNODE(node));
86673 +                       assert("zam-943", atom != NULL);
86674 +
86675 +                       if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED) || !atom->nr_running_queues)
86676 +                               break;
86677 +                       /* the znode spin lock is dropped around the wait and retaken before the re-check */
86678 +                       spin_unlock_znode(node);
86679 +                       atom_wait_event(atom);
86680 +                       spin_lock_znode(node);
86681 +               }
86682 +               /* NOTE(review): the znode spin lock is not released explicitly on this path - presumably uncapture_block() drops it; confirm */
86683 +               uncapture_block(ZJNODE(node));
86684 +               UNLOCK_ATOM(atom);
86685 +               zput(node);
86686 +       }
86687 +}
86688 +
86689 +/* This is called from longterm_unlock_znode() when last lock is released from
86690 +   the node that has been removed from the tree. Here the node is removed from
86691 +   the sibling list, uncaptured from its atom, and its lock is invalidated. */
86692 +reiser4_internal void
86693 +forget_znode(lock_handle * handle)
86694 +{
86695 +       znode *node;
86696 +       reiser4_tree *tree;
86697 +
86698 +       assert("umka-319", handle != NULL);
86699 +
86700 +       node = handle->node;
86701 +       tree = znode_get_tree(node);
86702 +
86703 +       assert("vs-164", znode_is_write_locked(node));
86704 +       assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
86705 +       assert("nikita-3337", rw_zlock_is_locked(&node->lock));
86706 +
86707 +       /* We assume that this node was detached from its parent before
86708 +        * unlocking, it gives no way to reach this node from parent through a
86709 +        * down link.  The node should have no children and, thereby, can't be
86710 +        * reached from them by their parent pointers.  The only way to obtain a
86711 +        * reference to the node is to use sibling pointers from its left and
86712 +        * right neighbors.  In the next several lines we remove the node from
86713 +        * the sibling list. */
86714 +
86715 +       WLOCK_TREE(tree);
86716 +       sibling_list_remove(node);
86717 +       znode_remove(node, tree);
86718 +       WUNLOCK_TREE(tree);
86719 +
86720 +       /* Here we set JNODE_DYING and cancel all pending lock requests.  It
86721 +        * forces all lock requestor threads to repeat iterations of getting
86722 +        * lock on a child, neighbor or parent node.  But, those threads can't
86723 +        * come to this node again, because this node is no longer a child,
86724 +        * neighbor or parent of any other node.  This order of znode
86725 +        * invalidation does not allow other threads to waste cpu time in a busy
86726 +        * loop, trying to lock dying object.  The exception is in the flush
86727 +        * code when we take node directly from atom's capture list.*/
86728 +       /* NOTE(review): the invalidation described above is presumably what invalidate_lock() at the end does - confirm */
86729 +       write_unlock_zlock(&node->lock);
86730 +       /* and, remove from atom's capture list. */
86731 +       uncapture_znode(node);
86732 +       write_lock_zlock(&node->lock);
86733 +       /* the zlock, dropped around uncapture_znode(), is held again here as it was on entry (see nikita-3337) */
86734 +       invalidate_lock(handle);
86735 +}
86736 +
86737 +/* Check that internal item at @pointer really contains pointer to @child.
86738 +
86739 +   Returns NS_FOUND if @pointer is an existing unit of an internal item whose
86740 +   down link equals the block number of @child, NS_NOT_FOUND otherwise. Note
86741 +   that coord_clear_iplug() is applied to @pointer through a const-dropping cast. */
86742 +reiser4_internal int
86743 +check_tree_pointer(const coord_t * pointer /* would-be pointer to @child */ ,
86744 +                  const znode * child /* child znode */ )
86745 +{
86746 +       item_plugin *iplug;
86747 +       reiser4_block_nr down;
86748 +
86749 +       assert("nikita-1016", pointer != NULL);
86750 +       assert("nikita-1017", child != NULL);
86751 +       assert("nikita-1018", pointer->node != NULL);
86752 +       assert("nikita-1325", znode_is_any_locked(pointer->node));
86753 +       assert("nikita-2985",
86754 +              znode_get_level(pointer->node) == znode_get_level(child) + 1);
86755 +
86756 +       coord_clear_iplug((coord_t *) pointer);
86757 +
86758 +       /* only an existing unit of an internal item can be a down link */
86759 +       if (!coord_is_existing_unit(pointer) || !item_is_internal(pointer))
86760 +               return NS_NOT_FOUND;
86761 +
86762 +       iplug = item_plugin_by_coord(pointer);
86763 +       assert("vs-513", iplug->s.internal.down_link);
86764 +       iplug->s.internal.down_link(pointer, NULL, &down);
86765 +
86766 +       /* check that cached value is correct */
86767 +       if (!disk_addr_eq(&down, znode_get_block(child)))
86768 +               return NS_NOT_FOUND;
86769 +       reiser4_stat_inc(tree.pos_in_parent_hit);
86770 +       return NS_FOUND;
86771 +}
86772 +
86773 +/* find coord of pointer to new @child in @parent.
86774 +
86775 +   Looks up the pointer to @left (left brother of the new node) in @parent and
86776 +   marks @result as positioned after that unit. Returns RETERR(NS_NOT_FOUND) in
86777 +   that case, and -EIO when the pointer to @left cannot be found.
86778 +*/
86779 +reiser4_internal int
86780 +find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
86781 +                  znode * child UNUSED_ARG /* child znode, passed locked */ ,
86782 +                  znode * left /* left brother of new node */ ,
86783 +                  coord_t * result /* where result is stored in */ )
86784 +{
86785 +       int rc;
86786 +
86787 +       assert("nikita-1486", parent != NULL);
86788 +       assert("nikita-1487", child != NULL);
86789 +       assert("nikita-1488", result != NULL);
86790 +
86791 +       rc = find_child_ptr(parent, left, result);
86792 +       if (rc == NS_FOUND) {
86793 +               /* the new pointer goes right after the pointer to @left */
86794 +               result->between = AFTER_UNIT;
86795 +               return RETERR(NS_NOT_FOUND);
86796 +       }
86797 +       warning("nikita-1489", "Cannot find brother position: %i", rc);
86798 +       return RETERR(-EIO);
86799 +}
86800 +
86801 +/* find coord of pointer to @child in @parent.
86802 +
86803 +   Find the &coord_t in the @parent where pointer to a given @child is in.
86804 +   Tries the cached @child->in_parent position first, then ->lookup() by left delimiting key, then find_child_by_addr().
86805 +*/
86806 +reiser4_internal int
86807 +find_child_ptr(znode * parent /* parent znode, passed locked */ ,
86808 +              znode * child /* child znode, passed locked */ ,
86809 +              coord_t * result /* where result is stored in */ )
86810 +{
86811 +       int lookup_res;
86812 +       node_plugin *nplug;
86813 +       /* left delimiting key of a child */
86814 +       reiser4_key ld;
86815 +       reiser4_tree *tree;
86816 +
86817 +       assert("nikita-934", parent != NULL);
86818 +       assert("nikita-935", child != NULL);
86819 +       assert("nikita-936", result != NULL);
86820 +       assert("zam-356", znode_is_loaded(parent));
86821 +
86822 +       coord_init_zero(result);
86823 +       result->node = parent;
86824 +
86825 +       nplug = parent->nplug;
86826 +       assert("nikita-939", nplug != NULL);
86827 +
86828 +       tree = znode_get_tree(parent);
86829 +       /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
86830 +        * not aliased to ->in_parent of some znode. Otherwise,
86831 +        * parent_coord_to_coord() below would modify data protected by tree
86832 +        * lock. */
86833 +       RLOCK_TREE(tree);
86834 +       /* fast path. Try to use cached value. Lock tree to keep
86835 +          node->pos_in_parent and pos->*_blocknr consistent. */
86836 +       if (child->in_parent.item_pos + 1 != 0) {
86837 +               reiser4_stat_inc(tree.pos_in_parent_set);
86838 +               parent_coord_to_coord(&child->in_parent, result);
86839 +               if (check_tree_pointer(result, child) == NS_FOUND) {
86840 +                       RUNLOCK_TREE(tree);
86841 +                       return NS_FOUND;
86842 +               }
86843 +               /* NOTE(review): the invalidation below writes ->in_parent while only the tree read lock is held - confirm this is safe */
86844 +               reiser4_stat_inc(tree.pos_in_parent_miss);
86845 +               child->in_parent.item_pos = (unsigned short)~0;
86846 +       }
86847 +       RUNLOCK_TREE(tree);
86848 +
86849 +       /* if the above failed, find some key from @child. We are looking for
86850 +          the least key in a child. */
86851 +       UNDER_RW_VOID(dk, tree, read, ld = *znode_get_ld_key(child));
86852 +       /*
86853 +        * now, lookup parent with key just found. Note, that left delimiting
86854 +        * key doesn't identify node uniquely, because (in extremely rare
86855 +        * case) two nodes can have equal left delimiting keys, if one of them
86856 +        * is completely filled with directory entries that all happened to be
86857 +        * hash collision. But, we check block number in check_tree_pointer()
86858 +        * and, so, are safe.
86859 +        */
86860 +       lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
86861 +       /* update cached pos_in_node (under the tree write lock), then verify the block number */
86862 +       if (lookup_res == NS_FOUND) {
86863 +               WLOCK_TREE(tree);
86864 +               coord_to_parent_coord(result, &child->in_parent);
86865 +               WUNLOCK_TREE(tree);
86866 +               lookup_res = check_tree_pointer(result, child);
86867 +       }
86868 +       if (lookup_res == NS_NOT_FOUND) /* last resort: scan all units of @parent */
86869 +               lookup_res = find_child_by_addr(parent, child, result);
86870 +       return lookup_res;
86871 +}
86872 +
86873 +/* find coord of pointer to @child in @parent by scanning
86874 +
86875 +   Walks all units of @parent and compares the block number stored in each
86876 +   internal item with that of @child (see check_tree_pointer()).
86877 +
86878 +   On a match the position is also saved in @child->in_parent under the tree
86879 +   write lock and NS_FOUND is returned; otherwise NS_NOT_FOUND.
86880 +*/
86881 +reiser4_internal int
86882 +find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
86883 +                  znode * child /* child znode, passed locked */ ,
86884 +                  coord_t * result /* where result is stored in */ )
86885 +{
86886 +       assert("nikita-1320", parent != NULL);
86887 +       assert("nikita-1321", child != NULL);
86888 +       assert("nikita-1322", result != NULL);
86889 +
86890 +       /* @result itself serves as the iteration cursor */
86891 +       for_all_units(result, parent) {
86892 +               if (check_tree_pointer(result, child) != NS_FOUND)
86893 +                       continue;
86894 +
86895 +               /* cache the position found */
86896 +               UNDER_RW_VOID(tree, znode_get_tree(parent), write,
86897 +                             coord_to_parent_coord(result,
86898 +                                                   &child->in_parent));
86899 +               return NS_FOUND;
86900 +       }
86901 +
86902 +       return NS_NOT_FOUND;
86903 +}
86904 +
86905 +/* Predicate: is @addr an "unallocated block number", i.e. do its status bits
86906 +   (REISER4_BLOCKNR_STATUS_BIT_MASK) equal REISER4_UNALLOCATED_STATUS_VALUE? */
86907 +reiser4_internal int
86908 +is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to check */ )
86909 +{
86910 +       assert("nikita-1766", addr != NULL);
86911 +       cassert(sizeof (reiser4_block_nr) == 8);
86912 +       /* compare only the status bits of the address */
86913 +       return REISER4_UNALLOCATED_STATUS_VALUE == (REISER4_BLOCKNR_STATUS_BIT_MASK & *addr);
86914 +}
86915 +
86916 +/* convert unallocated disk address to the memory address: the block number
86917 +   is shifted left by one bit and cast to pointer. FIXME: This needs a big comment. */
86918 +reiser4_internal void *
86919 +unallocated_disk_addr_to_ptr(const reiser4_block_nr * addr /* address to convert */ )
86920 +{
86921 +       reiser4_block_nr shifted;
86922 +       assert("nikita-1688", addr != NULL);
86923 +       assert("nikita-1689", is_disk_addr_unallocated(addr));
86924 +       shifted = *addr << 1;
86925 +       return (void *) (long) shifted;
86926 +}
86927 +
86928 +/* returns true (1) if removing bytes of given range of key [from_key, to_key]
86929 +   causes removing of whole item @from, i.e. the range starts not after the
86930 +   first key of the item and ends not before the key of its last byte */
86931 +static int
86932 +item_removed_completely(coord_t * from, const reiser4_key * from_key, const reiser4_key * to_key)
86933 +{
86934 +       item_plugin *iplug;
86935 +       reiser4_key edge;
86936 +
86937 +       assert("umka-325", from != NULL);
86938 +       assert("", item_is_extent(from));
86939 +
86940 +       /* head of the item stays if removal starts after its first key */
86941 +       item_key_by_coord(from, &edge);
86942 +       if (keygt(from_key, &edge))
86943 +               return 0;
86944 +
86945 +       iplug = item_plugin_by_coord(from);
86946 +       assert("vs-611", iplug && iplug->s.file.append_key);
86947 +
86948 +       /* last key of the item: take what ->append_key() reports and decrease
86949 +          its offset by one */
86950 +       iplug->s.file.append_key(from, &edge);
86951 +       set_key_offset(&edge, get_key_offset(&edge) - 1);
86952 +
86953 +       /* if @to_key is less than that, the last byte is not removed */
86954 +       return !keylt(to_key, &edge);
86955 +}
86956 +
86957 +/* helper function for prepare_twig_kill(): @left and @right are formatted
86958 + * neighbors of extent item being completely removed. Load and lock neighbors
86959 + * and store lock handles into @kdata for later use by kill_hook_extent(). Returns 0 or the first zload()/lock error */
86960 +static int
86961 +prepare_children(znode *left, znode *right, carry_kill_data *kdata)
86962 +{
86963 +       int result;
86964 +       int left_loaded;
86965 +       int right_loaded;
86966 +
86967 +       result = 0;
86968 +       left_loaded = right_loaded = 0;
86969 +       /* either neighbor may be NULL, in which case it is skipped */
86970 +       if (left != NULL) {
86971 +               result = zload(left);
86972 +               if (result == 0) {
86973 +                       left_loaded = 1;
86974 +                       result = longterm_lock_znode(kdata->left, left,
86975 +                                                    ZNODE_READ_LOCK,
86976 +                                                    ZNODE_LOCK_LOPRI);
86977 +               }
86978 +       }
86979 +       if (result == 0 && right != NULL) {
86980 +               result = zload(right);
86981 +               if (result == 0) {
86982 +                       right_loaded = 1;
86983 +                       result = longterm_lock_znode(kdata->right, right,
86984 +                                                    ZNODE_READ_LOCK,
86985 +                                                    ZNODE_LOCK_HIPRI | ZNODE_LOCK_NONBLOCK);
86986 +               }
86987 +       }
86988 +       if (result != 0) { /* failure: drop both lock handles and whatever was loaded above */
86989 +               done_lh(kdata->left);
86990 +               done_lh(kdata->right);
86991 +               if (left_loaded != 0)
86992 +                       zrelse(left);
86993 +               if (right_loaded != 0)
86994 +                       zrelse(right);
86995 +       }
86996 +       return result; /* on success the loads and locks taken here stay in place */
86997 +}
86998 +
86999 +static void
87000 +done_children(carry_kill_data *kdata) /* zrelse() and done_lh() each of @kdata->left, @kdata->right that has a node */
87001 +{
87002 +       if (kdata->left != NULL && kdata->left->node != NULL) {
87003 +               zrelse(kdata->left->node);
87004 +               done_lh(kdata->left);
87005 +       }
87006 +       if (kdata->right != NULL && kdata->right->node != NULL) {
87007 +               zrelse(kdata->right->node);
87008 +               done_lh(kdata->right);
87009 +       }
87010 +}
87011 +
87012 +/* part of cut_node. It is called when cut_node is called to remove or cut part
87013 +   of extent item. When head of that item is removed - we have to update right
87014 +   delimiting key of left neighbor of extent. When item is removed completely -
87015 +   we have to set sibling link between left and right neighbor of removed
87016 +   extent. This may return -E_DEADLOCK because of trying to get left neighbor
87017 +   locked. So, caller should repeat an attempt
87018 +*/
87019 +/* Audited by: umka (2002.06.16) */
87020 +static int
87021 +prepare_twig_kill(carry_kill_data *kdata, znode * locked_left_neighbor)
87022 +{
87023 +       int result;
87024 +       reiser4_key key;
87025 +       lock_handle left_lh;
87026 +       lock_handle right_lh;
87027 +       coord_t left_coord;
87028 +       coord_t *from;
87029 +       znode *left_child;
87030 +       znode *right_child;
87031 +       reiser4_tree *tree;
87032 +       int left_zloaded_here, right_zloaded_here;
87033 +
87034 +       from = kdata->params.from;
87035 +       assert("umka-326", from != NULL);
87036 +       assert("umka-327", kdata->params.to != NULL);
87037 +
87038 +       /* for one extent item only yet */
87039 +       assert("vs-591", item_is_extent(from));
87040 +       assert ("vs-592", from->item_pos == kdata->params.to->item_pos);
87041 +
87042 +       if ((kdata->params.from_key && keygt(kdata->params.from_key, item_key_by_coord(from, &key))) ||
87043 +           from->unit_pos != 0) {
87044 +               /* head of item @from is not removed, there is nothing to
87045 +                  worry about */
87046 +               return 0;
87047 +       }
87048 +
87049 +       result = 0;
87050 +       left_zloaded_here = 0;
87051 +       right_zloaded_here = 0;
87052 +
87053 +       left_child = right_child = NULL;
87054 +
87055 +       coord_dup(&left_coord, from);
87056 +       init_lh(&left_lh);
87057 +       init_lh(&right_lh);
87058 +       if (coord_prev_unit(&left_coord)) {
87059 +               /* @from is leftmost item in its node */
87060 +               if (!locked_left_neighbor) {
87061 +                       result = reiser4_get_left_neighbor(&left_lh, from->node, ZNODE_READ_LOCK, GN_CAN_USE_UPPER_LEVELS);
87062 +                       switch (result) {
87063 +                       case 0:
87064 +                               break;
87065 +                       case -E_NO_NEIGHBOR:
87066 +                               /* there is no formatted node to the left of
87067 +                                  from->node */
87068 +                               warning("vs-605",
87069 +                                       "extent item has smallest key in " "the tree and it is about to be removed");
87070 +                               return 0;
87071 +                       case -E_DEADLOCK:
87072 +                               /* need to restart */
87073 +                       default:
87074 +                               return result;
87075 +                       }
87076 +
87077 +                       /* we have acquired left neighbor of from->node */
87078 +                       result = zload(left_lh.node);
87079 +                       if (result)
87080 +                               goto done;
87081 +
87082 +                       locked_left_neighbor = left_lh.node;
87083 +               } else {
87084 +                       /* squalloc_right_twig_cut should have supplied locked
87085 +                        * left neighbor */
87086 +                       assert("vs-834", znode_is_write_locked(locked_left_neighbor));
87087 +                       result = zload(locked_left_neighbor);
87088 +                       if (result)
87089 +                               return result;
87090 +               }
87091 +
87092 +               left_zloaded_here = 1;
87093 +               coord_init_last_unit(&left_coord, locked_left_neighbor);
87094 +       }
87095 +       /* left_coord is now the unit preceding @from: previous unit in this node or last unit of the left neighbor */
87096 +       if (!item_is_internal(&left_coord)) {
87097 +               /* what else but extent can be on twig level */
87098 +               assert("vs-606", item_is_extent(&left_coord));
87099 +
87100 +               /* there is no left formatted child */
87101 +               if (left_zloaded_here)
87102 +                       zrelse(locked_left_neighbor);
87103 +               done_lh(&left_lh);
87104 +               return 0;
87105 +       }
87106 +
87107 +       tree = znode_get_tree(left_coord.node);
87108 +       left_child = child_znode(&left_coord, left_coord.node, 1, 0);
87109 +
87110 +       if (IS_ERR(left_child)) {
87111 +               result = PTR_ERR(left_child);
87112 +               goto done;
87113 +       }
87114 +
87115 +       /* left child is acquired, calculate new right delimiting key for it
87116 +          and get right child if it is necessary */
87117 +       if (item_removed_completely(from, kdata->params.from_key, kdata->params.to_key)) {
87118 +               /* try to get right child of removed item */
87119 +               coord_t right_coord;
87120 +
87121 +               assert("vs-607", kdata->params.to->unit_pos == coord_last_unit_pos(kdata->params.to));
87122 +               coord_dup(&right_coord, kdata->params.to);
87123 +               if (coord_next_unit(&right_coord)) {
87124 +                       /* @to is rightmost unit in the node */
87125 +                       result = reiser4_get_right_neighbor(&right_lh, from->node, ZNODE_READ_LOCK, GN_CAN_USE_UPPER_LEVELS);
87126 +                       switch (result) {
87127 +                       case 0:
87128 +                               result = zload(right_lh.node);
87129 +                               if (result)
87130 +                                       goto done;
87131 +
87132 +                               right_zloaded_here = 1;
87133 +                               coord_init_first_unit(&right_coord, right_lh.node);
87134 +                               item_key_by_coord(&right_coord, &key);
87135 +                               break;
87136 +
87137 +                       case -E_NO_NEIGHBOR:
87138 +                               /* there is no formatted node to the right of
87139 +                                  from->node */
87140 +                               UNDER_RW_VOID(dk, tree, read,
87141 +                                             key = *znode_get_rd_key(from->node));
87142 +                               right_coord.node = 0;
87143 +                               result = 0;
87144 +                               break;
87145 +                       default:
87146 +                               /* real error */
87147 +                               goto done;
87148 +                       }
87149 +               } else {
87150 +                       /* there is an item to the right of @from - take its key */
87151 +                       item_key_by_coord(&right_coord, &key);
87152 +               }
87153 +
87154 +               /* try to get right child of @from */
87155 +               if (right_coord.node && /* there is right neighbor of @from */
87156 +                   item_is_internal(&right_coord)) {   /* it is internal item */
87157 +                       right_child = child_znode(&right_coord,
87158 +                                                 right_coord.node, 1, 0);
87159 +
87160 +                       if (IS_ERR(right_child)) {
87161 +                               result = PTR_ERR(right_child);
87162 +                               goto done;
87163 +                       }
87164 +
87165 +               }
87166 +               /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and
87167 +                  update of right delimiting key of left_child */
87168 +               result = prepare_children(left_child, right_child, kdata);
87169 +       } else {
87170 +               /* head of item @to is removed. left_child has to get right delimiting key update. Prepare it for that */
87171 +               result = prepare_children(left_child, NULL, kdata);
87172 +       }
87173 +       /* common exit: release loads, lock handles and child pointers held by this function's locals */
87174 + done:
87175 +       if (right_child)
87176 +               zput(right_child);
87177 +       if (right_zloaded_here)
87178 +               zrelse(right_lh.node);
87179 +       done_lh(&right_lh);
87180 +
87181 +       if (left_child)
87182 +               zput(left_child);
87183 +       if (left_zloaded_here)
87184 +               zrelse(locked_left_neighbor);
87185 +       done_lh(&left_lh);
87186 +       return result;
87187 +}
87188 +
87189 +/* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
87190 +   are to be cut completely. Returns result of carry(), or error code of post_carry() if COP_CUT could not be posted */
87191 +/* for try_to_merge_with_left, delete_copied, delete_node */
87192 +reiser4_internal int
87193 +cut_node_content(coord_t *from, coord_t *to,
87194 +                const reiser4_key * from_key /* first key to be removed */ ,
87195 +                const reiser4_key * to_key /* last key to be removed */ ,
87196 +                reiser4_key * smallest_removed /* smallest key actually removed */)
87197 +{
87198 +       carry_pool pool;
87199 +       carry_level lowest_level;
87200 +       carry_op *op;
87201 +       carry_cut_data cut_data;
87202 +       int result;
87203 +
87204 +       assert("", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
87205 +
87206 +       init_carry_pool(&pool);
87207 +       init_carry_level(&lowest_level, &pool);
87208 +
87209 +       op = post_carry(&lowest_level, COP_CUT, from->node, 0);
87210 +       assert("vs-1509", op != 0);
87211 +       if (IS_ERR(op)) {
87212 +               /* pool was initialized above: finish it on this path as well */
87213 +               done_carry_pool(&pool);
87214 +               return PTR_ERR(op);
87215 +       }
87216 +
87217 +       cut_data.params.from = from;
87218 +       cut_data.params.to = to;
87219 +       cut_data.params.from_key = from_key;
87220 +       cut_data.params.to_key = to_key;
87221 +       cut_data.params.smallest_removed = smallest_removed;
87222 +       op->u.cut_or_kill.is_cut = 1;
87223 +       op->u.cut_or_kill.u.cut = &cut_data;
87224 +       ON_STATS(lowest_level.level_no = znode_get_level(from->node));
87225 +       result = carry(&lowest_level, 0);
87226 +       done_carry_pool(&pool);
87227 +       return result;
87228 +}
87229 +
87230 +/* kill part of the node
87231 +
87232 +   Kill part or whole content of node.
87233 +
87234 +   cut data between @from and @to of @from->node and call carry() to make
87235 +   corresponding changes in the tree. @from->node may become empty. If so -
87236 +   pointer to it will be removed. Neighboring nodes are not changed. Smallest
87237 +   removed key is stored in @smallest_removed
87238 +   Returns result of carry(), or error of prepare_twig_kill()/post_carry().
87239 +*/
87240 +reiser4_internal int
87241 +kill_node_content(coord_t * from /* coord of the first unit/item that will be
87242 +                                 * eliminated */ ,
87243 +                 coord_t * to   /* coord of the last unit/item that will be
87244 +                                 * eliminated */ ,
87245 +                 const reiser4_key * from_key /* first key to be removed */ ,
87246 +                 const reiser4_key * to_key /* last key to be removed */ ,
87247 +                 reiser4_key * smallest_removed        /* smallest key actually
87248 +                                                        * removed */ ,
87249 +                 znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor
87250 +                                                * locked (in squalloc_right_twig_cut, namely) */
87251 +                 struct inode *inode /* inode of file whose item (or its part) is to be killed. This is necessary to
87252 +                                        invalidate pages together with item pointing to them */)
87253 +{
87254 +       int result;
87255 +       carry_pool pool;
87256 +       carry_level lowest_level;
87257 +       carry_op *op;
87258 +       carry_kill_data kdata;
87259 +       lock_handle left_child;
87260 +       lock_handle right_child;
87261 +
87262 +       assert("umka-328", from != NULL);
87263 +       assert("vs-316", !node_is_empty(from->node));
87264 +       assert("nikita-1812", coord_is_existing_unit(from) && coord_is_existing_unit(to));
87265 +
87266 +       init_lh(&left_child);
87267 +       init_lh(&right_child);
87268 +
87269 +       kdata.params.from = from;
87270 +       kdata.params.to = to;
87271 +       kdata.params.from_key = from_key;
87272 +       kdata.params.to_key = to_key;
87273 +       kdata.params.smallest_removed = smallest_removed;
87274 +       kdata.flags = 0;
87275 +       kdata.inode = inode;
87276 +       kdata.left = &left_child;
87277 +       kdata.right = &right_child;
87278 +
87279 +       if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
87280 +               /* left child of extent item may have to get updated right
87281 +                  delimiting key and to get linked with right child of extent
87282 +                  @from if it will be removed completely */
87283 +               result = prepare_twig_kill(&kdata, locked_left_neighbor);
87284 +               if (result) {
87285 +                       done_children(&kdata);
87286 +                       return result;
87287 +               }
87288 +       }
87289 +
87290 +       init_carry_pool(&pool);
87291 +       init_carry_level(&lowest_level, &pool);
87292 +
87293 +       op = post_carry(&lowest_level, COP_CUT, from->node, 0);
87294 +       if (IS_ERR(op) || (op == NULL)) {
87295 +               done_children(&kdata);
87296 +               done_carry_pool(&pool); /* pool was initialized above: finish it on this path as well */
87297 +               return RETERR(op ? PTR_ERR(op) : -EIO);
87298 +       }
87299 +
87300 +       op->u.cut_or_kill.is_cut = 0;
87301 +       op->u.cut_or_kill.u.kill = &kdata;
87302 +
87303 +       ON_STATS(lowest_level.level_no = znode_get_level(from->node));
87304 +       result = carry(&lowest_level, 0);
87305 +       done_carry_pool(&pool);
87306 +       done_children(&kdata);
87307 +       return result;
87308 +}
87309 +
87310 +/* Accounting for killing bytes of a tail from @start up to (not including)
87311 +   @end. For an inode flagged REISER4_HAS_MMAP at most one page is passed to
87312 +   reiser4_invalidate_pages(); the killed byte count is always subtracted from
87313 +   @inode via inode_sub_bytes(). */
87314 +void
87315 +fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end)
87316 +{
87317 +       if (inode_get_flag(inode, REISER4_HAS_MMAP)) {
87318 +               pgoff_t start_pg = start >> PAGE_CACHE_SHIFT;
87319 +               pgoff_t end_pg = (end - 1) >> PAGE_CACHE_SHIFT;
87320 +               int aligned = (start & (PAGE_CACHE_SIZE - 1)) == 0;
87321 +
87322 +               if (aligned) {
87323 +                       /* kill up to the page boundary. */
87324 +                       assert("vs-123456", start_pg == end_pg);
87325 +                       reiser4_invalidate_pages(inode->i_mapping, start_pg, 1);
87326 +               } else if (start_pg != end_pg) {
87327 +                       /* page boundary is within killed portion of node. */
87328 +                       assert("vs-654321", end_pg - start_pg == 1);
87329 +                       reiser4_invalidate_pages(inode->i_mapping, end_pg, end_pg - start_pg);
87330 +               }
87331 +       }
87332 +       /* byte accounting does not depend on the mmap flag */
87333 +       inode_sub_bytes(inode, end - start);
87334 +}
87335 +
87336 +/**
87337 + * Delete whole @node from the reiser4 tree without loading it.
87338 + *
87339 + * @node: write-locked node to be deleted,
87340 + * @smallest_removed: on return, set to the left delimiting key of @node
87341 + *                    (the leftmost key of the deleted node),
87342 + * @object: inode pointer, if we truncate a file body.
87343 + *
87344 + * @return: 0 if success, error code otherwise.
87345 + *
87346 + * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
87347 + * contains the right value of the smallest removed key from the previous
87348 + * cut_worker() iteration.  This is needed for proper accounting of
87349 + * "i_blocks" and "i_bytes" fields of the @object.
87350 + */
87351 +reiser4_internal int delete_node (znode * node, reiser4_key * smallest_removed,
87352 +                       struct inode * object)
87353 +{
87354 +       lock_handle parent_lock;
87355 +       coord_t cut_from;
87356 +       coord_t cut_to;
87357 +       reiser4_tree * tree;
87358 +       int ret;
87359 +
87360 +       assert ("zam-937", node != NULL);
87361 +       assert ("zam-933", znode_is_write_locked(node));
87362 +       assert ("zam-999", smallest_removed != NULL);
87363 +
87364 +       init_lh(&parent_lock);
87365 +
87366 +       ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK, 0);
87367 +       if (ret)
87368 +               return ret;
87369 +
87370 +       assert("zam-934", !znode_above_root(parent_lock.node));
87371 +
87372 +       ret = zload(parent_lock.node);
87373 +       if (ret)
87374 +               goto failed_nozrelse;
87375 +
87376 +       ret = find_child_ptr(parent_lock.node, node, &cut_from);
87377 +       if (ret)
87378 +               goto failed;
87379 +
87380 +       /* decrement child counter and set parent pointer to NULL before
87381 +          deleting the item from parent node because of checks in
87382 +          internal_kill_item_hook (we can delete the last item from the parent
87383 +          node, the parent node is going to be deleted and its c_count should
87384 +          be zero). */
87385 +
87386 +       tree = znode_get_tree(node);
87387 +       WLOCK_TREE(tree);
87388 +       init_parent_coord(&node->in_parent, NULL);
87389 +       -- parent_lock.node->c_count;
87390 +       WUNLOCK_TREE(tree);
87391 +
87392 +       assert("zam-989", item_is_internal(&cut_from));
87393 +
87394 +       /* @node should be deleted after unlocking. */
87395 +       ZF_SET(node, JNODE_HEARD_BANSHEE);
87396 +
87397 +       /* remove a pointer from the parent node to the node being deleted. */
87398 +       coord_dup(&cut_to, &cut_from);
87399 +       /* FIXME: shouldn't this be kill_node_content */
87400 +       ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
87401 +       if (ret)
87402 +               /* FIXME(Zam): Should we re-connect the node to its parent if
87403 +                * cut_node fails? */
87404 +               goto failed;
87405 +
87406 +       {
87407 +               reiser4_tree * tree = current_tree; /* NOTE: shadows the function-level @tree */
87408 +               __u64 start_offset = 0, end_offset = 0;
87409 +
87410 +               WLOCK_DK(tree);
87411 +               if (object) {
87412 +                       /* We use @smallest_removed and the left delimiting of
87413 +                        * the current node for @object->i_blocks, i_bytes
87414 +                        * calculation.  We assume that the items after the
87415 +                        * *@smallest_removed key have been deleted from the
87416 +                        * file body. */
87417 +                       start_offset = get_key_offset(znode_get_ld_key(node));
87418 +                       end_offset = get_key_offset(smallest_removed);
87419 +               }
87420 +
87421 +               RLOCK_TREE(tree);
87422 +               assert("zam-1021", znode_is_connected(node));
87423 +               if (node->left) /* left neighbor inherits the right delimiting key of @node */
87424 +                       znode_set_rd_key(node->left, znode_get_rd_key(node));
87425 +               RUNLOCK_TREE(tree);
87426 +
87427 +               *smallest_removed = *znode_get_ld_key(node);
87428 +
87429 +               WUNLOCK_DK(tree);
87430 +
87431 +               if (object) {
87432 +                       /* we used to perform actions which are to be performed on items on their removal from tree in
87433 +                          special item method - kill_hook. Here for optimization reasons we avoid reading node
87434 +                          containing item we remove and can not call item's kill hook. Instead we call function which
87435 +                          does exactly the same things as tail kill hook in assumption that node we avoid reading
87436 +                          contains only one item and that item is a tail one. */
87437 +                       fake_kill_hook_tail(object, start_offset, end_offset);
87438 +               }
87439 +       }
87440 + failed: /* also reached on success, with ret == 0 */
87441 +       zrelse(parent_lock.node);
87442 + failed_nozrelse:
87443 +       done_lh(&parent_lock);
87444 +
87445 +       return ret;
87446 +}
87447 +
87448 +/**
87449 + * The cut_tree subroutine which does progressive deletion of items and whole
87450 + * nodes from right to left (not optimal, but the implementation is easier).
87451 + *
87452 + * @tap: the point deletion process begins from,
87453 + * @from_key: the beginning of the deleted key range,
87454 + * @to_key: the end of the deleted key range,
87455 + * @smallest_removed: the smallest removed key,
87456 + * @object: inode handed on to delete_node() and kill_node_content(),
87457 + * @lazy: if set, later iterations may remove a whole leaf via delete_node().
87458 + * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
87459 + * operation was interrupted for allowing atom commit.
87460 + */
87461 +static int cut_tree_worker (tap_t * tap, const reiser4_key * from_key,
87462 +                           const reiser4_key * to_key, reiser4_key * smallest_removed,
87463 +                           struct inode * object,
87464 +                           int lazy)
87465 +{
87466 +       lock_handle next_node_lock;
87467 +       coord_t left_coord;
87468 +       int result;
87469 +       long iterations = 0;
87470 +
87471 +       assert("zam-931", tap->coord->node != NULL);
87472 +       assert("zam-932", znode_is_write_locked(tap->coord->node));
87473 +
87474 +       init_lh(&next_node_lock);
87475 +
87476 +       while (1) {
87477 +               znode       *node;  /* node from which items are cut */
87478 +               node_plugin *nplug; /* node plugin for @node */
87479 +
87480 +               node = tap->coord->node;
87481 +
87482 +               /* Move next_node_lock to the next node on the left. */
87483 +               result = reiser4_get_left_neighbor(
87484 +                       &next_node_lock, node, ZNODE_WRITE_LOCK, GN_CAN_USE_UPPER_LEVELS);
87485 +               if (result != 0 && result != -E_NO_NEIGHBOR)
87486 +                       break;
87487 +               /* Check can we delete the node as a whole. */
87488 +               if (lazy && iterations && znode_get_level(node) == LEAF_LEVEL &&
87489 +                   UNDER_RW(dk, current_tree, read, keyle(from_key, znode_get_ld_key(node))))
87490 +               {
87491 +                       result = delete_node(node, smallest_removed, object);
87492 +               } else {
87493 +                       result = tap_load(tap);
87494 +                       if (result) /* break, not return: next_node_lock must be released below */
87495 +                               break;
87496 +
87497 +                       /* Prepare the second (right) point for cut_node() */
87498 +                       if (iterations)
87499 +                               coord_init_last_unit(tap->coord, node);
87500 +
87501 +                       else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
87502 +                               /* set rightmost unit for the items without lookup method */
87503 +                               tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
87504 +
87505 +                       nplug = node->nplug;
87506 +
87507 +                       assert("vs-686", nplug);
87508 +                       assert("vs-687", nplug->lookup);
87509 +
87510 +                       /* left_coord is leftmost unit cut from @node */
87511 +                       result = nplug->lookup(node, from_key,
87512 +                                              FIND_MAX_NOT_MORE_THAN, &left_coord);
87513 +
87514 +                       if (IS_CBKERR(result))
87515 +                               break;
87516 +
87517 +                       /* adjust coordinates so that they are set to existing units */
87518 +                       if (coord_set_to_right(&left_coord) || coord_set_to_left(tap->coord)) {
87519 +                               result = 0;
87520 +                               break;
87521 +                       }
87522 +
87523 +                       if (coord_compare(&left_coord, tap->coord) == COORD_CMP_ON_RIGHT) {
87524 +                               /* keys from @from_key to @to_key are not in the tree */
87525 +                               result = 0;
87526 +                               break;
87527 +                       }
87528 +
87529 +                       if (left_coord.item_pos != tap->coord->item_pos) {
87530 +                               /* do not allow to cut more than one item. It is added to solve problem of truncating
87531 +                                  partially converted files. If file is partially converted there may exist a twig node
87532 +                                  containing both internal item or items pointing to leaf nodes with formatting items
87533 +                                  and extent item. We do not want to kill internal items being at twig node here
87534 +                                  because cut_tree_worker assumes killing them from level level */
87535 +                               coord_dup(&left_coord, tap->coord);
87536 +                               assert("vs-1652", coord_is_existing_unit(&left_coord));
87537 +                               left_coord.unit_pos = 0;
87538 +                       }
87539 +
87540 +                       /* cut data from one node */
87541 +                       *smallest_removed = *min_key();
87542 +                       result = kill_node_content(&left_coord,
87543 +                                                  tap->coord,
87544 +                                                  from_key,
87545 +                                                  to_key,
87546 +                                                  smallest_removed,
87547 +                                                  next_node_lock.node,
87548 +                                                  object);
87549 +                       tap_relse(tap);
87550 +               }
87551 +               if (result)
87552 +                       break;
87553 +
87554 +               /* Check whether all items with keys >= from_key were removed
87555 +                * from the tree. */
87556 +               if (keyle(smallest_removed, from_key))
87557 +                       /* result = 0;*/
87558 +                               break;
87559 +
87560 +               if (next_node_lock.node == NULL)
87561 +                       break;
87562 +
87563 +               result = tap_move(tap, &next_node_lock);
87564 +               done_lh(&next_node_lock);
87565 +               if (result)
87566 +                       break;
87567 +
87568 +               /* Break long cut_tree operation (deletion of a large file) if
87569 +                * atom requires commit. */
87570 +               if (iterations > CUT_TREE_MIN_ITERATIONS
87571 +                   && current_atom_should_commit())
87572 +               {
87573 +                       result = -E_REPEAT;
87574 +                       break;
87575 +               }
87576 +
87577 +
87578 +               ++ iterations;
87579 +       }
87580 +       done_lh(&next_node_lock);
87581 +       // assert("vs-301", !keyeq(&smallest_removed, min_key()));
87582 +       return result;
87583 +}
87584 +
87585 +
87586 +/* there is a fundamental problem with optimizing deletes: VFS does it
87587 +   one file at a time.  Another problem is that if an item can be
87588 +   anything, then deleting items must be done one at a time.  It just
87589 +   seems clean to write this to specify a from and a to key, and cut
87590 +   everything between them though.  */
87591 +
87592 +/* use this function with care if deleting more than what is part of a single file. */
87593 +/* do not use this when cutting a single item, it is suboptimal for that */
87594 +
87595 +/* You are encouraged to write plugin specific versions of this.  It
87596 +   cannot be optimal for all plugins because it works item at a time,
87597 +   and some plugins could sometimes work node at a time. Regular files
87598 +   however are not optimizable to work node at a time because of
87599 +   extents needing to free the blocks they point to.
87600 +
87601 +   Optimizations compared to v3 code:
87602 +
87603 +   It does not balance (that task is left to memory pressure code).
87604 +
87605 +   Nodes are deleted only if empty.
87606 +
87607 +   Uses extents.
87608 +
87609 +   Performs read-ahead of formatted nodes whose contents are part of
87610 +   the deletion.
87611 +*/
87612 +
87613 +
87614 +/**
87615 + * Delete everything from the reiser4 tree between keys @from_key and @to_key.
87616 + *
87617 + * @tree: tree to cut from (here only asserted non-NULL and passed to write_tree_log()),
87618 + * @from_key: the beginning of the deleted key range,
87619 + * @to_key: the end of the deleted key range,
87620 + * @smallest_removed_p: where the smallest removed key is stored; may be NULL,
87621 + * @object: owner of cutting items,
87622 + * @lazy: passed through to cut_tree_worker().
87623 + * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
87624 + * operation was interrupted for allowing atom commit; -E_DEADLOCK is also
87625 + * reported as -E_REPEAT, and -E_NO_NEIGHBOR as success (0).
87626 + * FIXME(Zam): the cut_tree interruption is not implemented.
87627 + */
87628 +
87629 +reiser4_internal int
87630 +cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
87631 +               const reiser4_key * to_key, reiser4_key * smallest_removed_p,
87632 +               struct inode * object, int lazy)
87633 +{
87634 +       lock_handle lock;
87635 +       int result;
87636 +       tap_t tap;
87637 +       coord_t right_coord;
87638 +       reiser4_key smallest_removed;
87639 +       STORE_COUNTERS;
87640 +
87641 +       assert("umka-329", tree != NULL);
87642 +       assert("umka-330", from_key != NULL);
87643 +       assert("umka-331", to_key != NULL);
87644 +       assert("zam-936", keyle(from_key, to_key));
87645 +
87646 +       if (smallest_removed_p == NULL) /* caller is not interested: use a local scratch key */
87647 +               smallest_removed_p = &smallest_removed;
87648 +
87649 +       write_tree_log(tree, tree_cut, from_key, to_key);
87650 +       init_lh(&lock);
87651 +
87652 +       do { /* executed once; "break" is used to skip the worker when lookup fails */
87653 +               /* Find rightmost item to cut away from the tree. */
87654 +               result = object_lookup(
87655 +                       object, to_key, &right_coord, &lock,
87656 +                       ZNODE_WRITE_LOCK, FIND_MAX_NOT_MORE_THAN, TWIG_LEVEL,
87657 +                       LEAF_LEVEL, CBK_UNIQUE, 0/*ra_info*/);
87658 +               if (result != CBK_COORD_FOUND)
87659 +                       break;
87660 +
87661 +               tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
87662 +               result = cut_tree_worker(
87663 +                       &tap, from_key, to_key, smallest_removed_p, object, lazy);
87664 +               tap_done(&tap);
87665 +
87666 +               preempt_point();
87667 +
87668 +       } while (0);
87669 +
87670 +       done_lh(&lock);
87671 +
87672 +       if (result) {
87673 +               switch (result) {
87674 +               case -E_NO_NEIGHBOR:
87675 +                       result = 0;
87676 +                       break;
87677 +               case -E_DEADLOCK:
87678 +                       result = -E_REPEAT; /* fall through */
87679 +               case -E_REPEAT:
87680 +               case -ENOMEM:
87681 +               case -ENOENT:
87682 +                       break;
87683 +               default: /* any other error is logged and returned unchanged */
87684 +                       warning("nikita-2861", "failure: %i", result);
87685 +               }
87686 +       }
87687 +
87688 +       CHECK_COUNTERS;
87689 +       return result;
87690 +}
87691 +
87692 +/* repeat cut_tree_object() for as long as it returns -E_REPEAT. unlike
87693 + * cut_file_items, it does not end current transaction if -E_REPEAT is returned
87694 + * by cut_tree_object. @mode is passed on as the @lazy argument of cut_tree_object. */
87695 +reiser4_internal int
87696 +cut_tree(reiser4_tree *tree, const reiser4_key *from, const reiser4_key *to,
87697 +        struct inode *inode, int mode)
87698 +{
87699 +       int result;
87700 +
87701 +       do {
87702 +               result = cut_tree_object(tree, from, to, NULL, inode, mode);
87703 +       } while (result == -E_REPEAT);
87704 +
87705 +       return result;
87706 +}
87707 +
87708 +
87709 +/* first step of reiser4 tree initialization: calls rw_tree_init() and spin_epoch_init() on @tree */
87710 +reiser4_internal void
87711 +init_tree_0(reiser4_tree * tree)
87712 +{
87713 +       assert("zam-683", tree != NULL);
87714 +       rw_tree_init(tree);
87715 +       spin_epoch_init(tree);
87716 +}
87717 +
87718 +/* finishing reiser4 tree initialization; returns 0 on success, error code otherwise */
87719 +reiser4_internal int
87720 +init_tree(reiser4_tree * tree  /* pointer to structure being
87721 +                                * initialized */ ,
87722 +         const reiser4_block_nr * root_block   /* address of a root block
87723 +                                                * on a disk */ ,
87724 +         tree_level height /* height of a tree */ ,
87725 +         node_plugin * nplug /* default node plugin */ )
87726 +{
87727 +       int result;
87728 +
87729 +       assert("nikita-306", tree != NULL);
87730 +       assert("nikita-307", root_block != NULL);
87731 +       assert("nikita-308", height > 0);
87732 +       assert("nikita-309", nplug != NULL);
87733 +       assert("zam-587", tree->super != NULL);
87734 +
87735 +       /* someone might not call init_tree_0 before calling init_tree. */
87736 +       init_tree_0(tree);
87737 +
87738 +       tree->root_block = *root_block;
87739 +       tree->height = height;
87740 +       tree->estimate_one_insert = calc_estimate_one_insert(height);
87741 +       tree->nplug = nplug;
87742 +
87743 +       tree->znode_epoch = 1ull;
87744 +
87745 +       cbk_cache_init(&tree->cbk_cache);
87746 +
87747 +       result = znodes_tree_init(tree);
87748 +       if (result == 0)
87749 +               result = jnodes_tree_init(tree);
87750 +       if (result == 0) {
87751 +               tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0, GFP_KERNEL);
87752 +               if (IS_ERR(tree->uber)) { /* on failure leave tree->uber NULL so done_tree() skips zput() */
87753 +                       result = PTR_ERR(tree->uber);
87754 +                       tree->uber = NULL;
87755 +               }
87756 +       }
87757 +       return result;
87758 +}
87759 +
87760 +/* release resources associated with @tree: put tree->uber (if set), then run the *_done() routines */
87761 +reiser4_internal void
87762 +done_tree(reiser4_tree * tree /* tree to release */ )
87763 +{
87764 +       assert("nikita-311", tree != NULL);
87765 +
87766 +       if (tree->uber != NULL) {
87767 +               zput(tree->uber);
87768 +               tree->uber = NULL;
87769 +       }
87770 +       znodes_tree_done(tree);
87771 +       jnodes_tree_done(tree);
87772 +       cbk_cache_done(&tree->cbk_cache);
87773 +}
87774 +
87775 +/* Make Linus happy.
87776 +   Local variables:
87777 +   c-indentation-style: "K&R"
87778 +   mode-name: "LC"
87779 +   c-basic-offset: 8
87780 +   tab-width: 8
87781 +   fill-column: 120
87782 +   scroll-step: 1
87783 +   End:
87784 +*/
87785 diff -rupN linux-2.6.8-rc3/fs/reiser4/tree.h linux-2.6.8-rc3-a/fs/reiser4/tree.h
87786 --- linux-2.6.8-rc3/fs/reiser4/tree.h   1970-01-01 03:00:00.000000000 +0300
87787 +++ linux-2.6.8-rc3-a/fs/reiser4/tree.h 2004-08-05 21:20:52.930692693 +0400
87788 @@ -0,0 +1,556 @@
87789 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
87790 + * reiser4/README */
87791 +
87792 +/* Tree operations. See fs/reiser4/tree.c for comments */
87793 +
87794 +#if !defined( __REISER4_TREE_H__ )
87795 +#define __REISER4_TREE_H__
87796 +
87797 +#include "forward.h"
87798 +#include "debug.h"
87799 +#include "spin_macros.h"
87800 +#include "dformat.h"
87801 +#include "type_safe_list.h"
87802 +#include "plugin/node/node.h"
87803 +#include "plugin/plugin.h"
87804 +#include "jnode.h"
87805 +#include "znode.h"
87806 +#include "tap.h"
87807 +
87808 +#include <linux/types.h>       /* for __u??  */
87809 +#include <linux/fs.h>          /* for struct super_block  */
87810 +#include <linux/spinlock.h>
87811 +#include <linux/sched.h>       /* for struct task_struct */
87812 +
87813 +/* fictive block number never actually used */
87814 +extern const reiser4_block_nr UBER_TREE_ADDR;
87815 +
87816 +/* define typed list for cbk_cache lru */
87817 +TYPE_SAFE_LIST_DECLARE(cbk_cache);
87818 +
87819 +/* &cbk_cache_slot - entry in a coord cache.
87820 +
87821 +   This is an entry in a coord_by_key (cbk) cache, represented by
87822 +   &cbk_cache.
87823 +
87824 +*/
87825 +typedef struct cbk_cache_slot {
87826 +       /* cached node */
87827 +       znode *node;
87828 +       /* linkage to the next cbk cache slot in a LRU order */
87829 +       cbk_cache_list_link lru;
87830 +} cbk_cache_slot;
87831 +
87832 +/* &cbk_cache - coord cache. This is part of reiser4_tree.
87833 +
87834 +   cbk_cache is supposed to speed up tree lookups by caching results of recent
87835 +   successful lookups (we don't cache negative results as dentry cache
87836 +   does). Cache consists of relatively small number of entries kept in a LRU
87837 +   order. Each entry (&cbk_cache_slot) contains a pointer to znode, from
87838 +   which we can obtain a range of keys that are covered by this znode. Before
87839 +   embarking into real tree traversal we scan cbk_cache slot by slot and for
87840 +   each slot check whether key we are looking for is between minimal and
87841 +   maximal keys for node pointed to by this slot. If no match is found, real
87842 +   tree traversal is performed and if result is successful, appropriate entry
87843 +   is inserted into cache, possibly pulling least recently used entry out of
87844 +   it.
87845 +
87846 +   Tree spin lock is used to protect coord cache. If contention for this
87847 +   lock proves to be too high, finer grained locking can be added.
87848 +
87849 +   Invariants involving parts of this data-type:
87850 +
87851 +      [cbk-cache-invariant]
87852 +*/
87853 +typedef struct cbk_cache {
87854 +       /* serializer: rw lock of this cache (see RW_LOCK_FUNCTIONS for cbk_cache) */
87855 +       reiser4_rw_data guard;
87856 +       int nr_slots; /* presumably the number of entries in @slot - confirm */
87857 +       /* head of LRU list of cache slots */
87858 +       cbk_cache_list_head lru;
87859 +       /* actual array of slots */
87860 +       cbk_cache_slot *slot;
87861 +} cbk_cache;
87862 +
87863 +#define rw_ordering_pred_cbk_cache(cache) (1)
87864 +
87865 +/* defined read-write locking functions for cbk_cache */
87866 +RW_LOCK_FUNCTIONS(cbk_cache, cbk_cache, guard);
87867 +
87868 +/* define list manipulation functions for cbk_cache LRU list */
87869 +TYPE_SAFE_LIST_DEFINE(cbk_cache, cbk_cache_slot, lru);
87870 +
87871 +/* level_lookup_result - possible outcomes of looking up a key at some level.
87872 +   This is used by coord_by_key when traversing the tree downward. */
87873 +typedef enum {
87874 +       /* continue to the next level */
87875 +       LOOKUP_CONT,
87876 +       /* done. Either required item was found, or we can prove it
87877 +          doesn't exist, or some error occurred. */
87878 +       LOOKUP_DONE,
87879 +       /* restart traversal from the root. Infamous "repetition". */
87880 +       LOOKUP_REST
87881 +} level_lookup_result;
87882 +
87883 +/*    This is the representation of the internal reiser4 tree where all file-system
87884 +   data and meta-data are stored. This structure is passed to all tree
87885 +   manipulation functions. It's different from the super block because:
87886 +   we don't want to limit ourselves to strictly one to one mapping
87887 +   between super blocks and trees, and, because they are logically
87888 +   different: there are things in a super block that have no relation to
87889 +   the tree (bitmaps, journalling area, mount options, etc.) and there
87890 +   are things in a tree that bear no relation to the super block, like
87891 +   tree of znodes.
87892 +
87893 +   At this time, there is only one tree
87894 +   per filesystem, and this struct is part of the super block.  We only
87895 +   call the super block the super block for historical reasons (most
87896 +   other filesystems call the per filesystem metadata the super block).
87897 +*/
87898 +
87899 +struct reiser4_tree {
87900 +       /* block_nr == 0 is fake znode. Write lock it, while changing
87901 +          tree height. */
87902 +       /* disk address of root node of a tree */
87903 +       reiser4_block_nr root_block;
87904 +
87905 +       /* level of the root node. If this is 1, tree consists of root
87906 +           node only */
87907 +       tree_level height;
87908 +
87909 +       /*
87910 +        * this is cached here to avoid calling plugins through function
87911 +        * dereference all the time.
87912 +        */
87913 +       __u64 estimate_one_insert;
87914 +
87915 +       /* cache of recent tree lookup results */
87916 +       cbk_cache cbk_cache;
87917 +
87918 +       /* hash table to look up znodes by block number. */
87919 +       z_hash_table zhash_table;
87920 +       z_hash_table zfake_table;
87921 +       /* hash table to look up jnodes by inode and offset. */
87922 +       j_hash_table jhash_table;
87923 +
87924 +       /* lock protecting:
87925 +           - parent pointers,
87926 +           - sibling pointers,
87927 +           - znode hash table
87928 +           - coord cache
87929 +       */
87930 +       /* NOTE: The "giant" tree lock can be replaced by more spin locks,
87931 +          hoping they will be less contended. We can use one spin lock per one
87932 +          znode hash bucket.  With adding of some code complexity, sibling
87933 +          pointers can be protected by both znode spin locks.  However it looks
87934 +          more SMP scalable we should test this locking change on n-ways (n >
87935 +          4) SMP machines.  Current 4-ways machine test does not show that tree
87936 +          lock is contended and it is a bottleneck (2003.07.25). */
87937 +
87938 +       reiser4_rw_data tree_lock;
87939 +
87940 +       /* lock protecting delimiting keys */
87941 +       reiser4_rw_data dk_lock;
87942 +
87943 +       /* spin lock protecting znode_epoch */
87944 +       reiser4_spin_data epoch_lock;
87945 +       /* version stamp used to mark znode updates. See seal.[ch] for more
87946 +        * information. */
87947 +       __u64 znode_epoch;
87948 +
87949 +       znode       *uber; /* znode obtained for UBER_TREE_ADDR by init_tree() */
87950 +       node_plugin *nplug; /* default node plugin, as passed to init_tree() */
87951 +       struct super_block *super;
87952 +       struct {
87953 +               /* carry flags used for insertion of new nodes */
87954 +               __u32 new_node_flags;
87955 +               /* carry flags used for insertion of new extents */
87956 +               __u32 new_extent_flags;
87957 +               /* carry flags used for paste operations */
87958 +               __u32 paste_flags;
87959 +               /* carry flags used for insert operations */
87960 +               __u32 insert_flags;
87961 +       } carry;
87962 +};
87963 +
87964 +#define spin_ordering_pred_epoch(tree) (1)
87965 +SPIN_LOCK_FUNCTIONS(epoch, reiser4_tree, epoch_lock);
87966 +
87967 +extern void init_tree_0(reiser4_tree *);
87968 +
87969 +extern int init_tree(reiser4_tree * tree,
87970 +                    const reiser4_block_nr * root_block, tree_level height, node_plugin * default_plugin);
87971 +extern void done_tree(reiser4_tree * tree);
87972 +
87973 +/* &reiser4_item_data - description of data to be inserted or pasted
87974 +
87975 +   Q: articulate the reasons for the difference between this and flow.
87976 +
87977 +   A: Besides flow we insert into tree other things: stat data, directory
87978 +   entry, etc.  To insert them into tree one has to provide this structure. If
87979 +   one is going to insert flow - he can use insert_flow, where this structure
87980 +   does not have to be created
87981 +*/
87982 +struct reiser4_item_data {
87983 +       /* actual data to be inserted. If NULL, ->create_item() will not
87984 +          do xmemcpy itself, leaving this up to the caller. This can
87985 +          save some amount of unnecessary memory copying, for example,
87986 +          during insertion of stat data.
87987 +
87988 +       */
87989 +       char *data;
87990 +       /* 1 if 'char * data' contains pointer to user space and 0 if it is
87991 +          kernel space */
87992 +       int user;
87993 +       /* amount of data we are going to insert or paste */
87994 +       int length;
87995 +       /* "Arg" is opaque data that is passed down to the
87996 +           ->create_item() method of node layout, which in turn
87997 +           hands it to the ->create_hook() of item being created. This
87998 +           arg is currently used by:
87999 +
88000 +           .  ->create_hook() of internal item
88001 +           (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
88002 +           . ->paste() method of directory item.
88003 +           . ->create_hook() of extent item
88004 +
88005 +          For internal item, this is left "brother" of new node being
88006 +          inserted and it is used to add new node into sibling list
88007 +          after parent to it was just inserted into parent.
88008 +
88009 +          While ->arg does look like a somewhat unnecessary complication,
88010 +          it actually saves a lot of headache in many places, because
88011 +          all data necessary to insert or paste new data into tree are
88012 +          collected in one place, and this eliminates a lot of extra
88013 +          argument passing and storing everywhere.
88014 +
88015 +       */
88016 +       void *arg;
88017 +       /* plugin of item we are inserting */
88018 +       item_plugin *iplug;
88019 +};
88020 +
88021 +/* cbk flags: options for coord_by_key() */
88022 +typedef enum {
88023 +       /* coord_by_key() is called for insertion. This is necessary because
88024 +          of extents being located at the twig level. For explanation, see
88025 +          comment just above is_next_item_internal().
88026 +       */
88027 +       CBK_FOR_INSERT = (1 << 0),
88028 +       /* coord_by_key() is called with key that is known to be unique */
88029 +       CBK_UNIQUE = (1 << 1),
88030 +       /* coord_by_key() can trust delimiting keys. This option is not user
88031 +          accessible. coord_by_key() will set it automatically. It will be
88032 +          only cleared by special-case in extents-on-the-twig-level handling
88033 +          where it is necessary to insert item with a key smaller than
88034 +          leftmost key in a node. This is necessary because of extents being
88035 +          located at the twig level. For explanation, see comment just above
88036 +          is_next_item_internal().
88037 +       */
88038 +       CBK_TRUST_DK = (1 << 2),
88039 +       CBK_READA    = (1 << 3),  /* original: readahead leaves which contain items of certain file */
88040 +       CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */
88041 +       CBK_DKSET    = (1 << 5),
88042 +       CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */
88043 +       CBK_IN_CACHE = (1 << 7), /* node is already in cache */
88044 +       CBK_USE_CRABLOCK = (1 << 8) /* use crab_lock instead of long term
88045 +                                    * lock */
88046 +} cbk_flags;
88047 +
88048 +/* insertion outcome (IBK = insert by key): 0 on success, a negated error code otherwise */
88049 +typedef enum {
88050 +       IBK_INSERT_OK = 0,
88051 +       IBK_ALREADY_EXISTS = -EEXIST,
88052 +       IBK_IO_ERROR = -EIO,
88053 +       IBK_NO_SPACE = -E_NODE_FULL,
88054 +       IBK_OOM = -ENOMEM
88055 +} insert_result;
88056 +
88057 +#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND) /* NB: @err is evaluated twice */
88058 +/* iterate_tree() takes a tree_iterate_actor_t callback; get_uber_znode() locks the "fake" znode into @lh */
88059 +typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord, lock_handle * lh, void *arg);
88060 +extern int iterate_tree(reiser4_tree * tree, coord_t * coord, lock_handle * lh,
88061 +                       tree_iterate_actor_t actor, void *arg, znode_lock_mode mode, int through_units_p);
88062 +extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
88063 +                         znode_lock_request pri, lock_handle *lh);
88064 +
88065 +/* return node plugin of @node (its ->nplug field); asserts that @node is non-NULL and loaded */
88066 +static inline node_plugin *
88067 +node_plugin_by_node(const znode * node /* node to query */ )
88068 +{
88069 +       assert("vs-213", node != NULL);
88070 +       assert("vs-214", znode_is_loaded(node));
88071 +
88072 +       return node->nplug;
88073 +}
88074 +
88075 +/* number of items in @node, taken from ->nr_items; asserts @node is loaded and the plugin's ->num_of_items() agrees */
88076 +static inline pos_in_node_t
88077 +node_num_items(const znode * node)
88078 +{
88079 +       assert("nikita-2754", znode_is_loaded(node));
88080 +       assert("nikita-2468",
88081 +              node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
88082 +
88083 +       return node->nr_items;
88084 +}
88085 +
88086 +/* Item count of the node that @coord is positioned in (delegates to
88087 +   node_num_items()).  Asserts coord->node != NULL. */
88088 +static inline unsigned
88089 +coord_num_items(const coord_t * coord)
88090 +{
88091 +       const znode *host = coord->node;
88092 +       assert("jmacd-9805", coord->node != NULL);
88093 +       return node_num_items(host);
88094 +}
88095 +
88096 +/* non-zero iff node_num_items() reports no items in @node */
88097 +static inline int
88098 +node_is_empty(const znode * node)
88099 +{
88100 +       return !node_num_items(node);
88101 +}
88102 +/* shift outcome: SHIFTED_SOMETHING is 0, the other values are negated error codes */
88103 +typedef enum {
88104 +       SHIFTED_SOMETHING = 0,
88105 +       SHIFT_NO_SPACE = -E_NODE_FULL,
88106 +       SHIFT_IO_ERROR = -EIO,
88107 +       SHIFT_OOM = -ENOMEM,
88108 +} shift_result;
88109 +/* coord helpers declared only; NOTE(review): exact contracts are documented at the definitions, not visible here */
88110 +extern node_plugin *node_plugin_by_coord(const coord_t * coord);
88111 +extern int is_coord_in_node(const coord_t * coord);
88112 +extern int key_in_node(const reiser4_key *, const coord_t *);
88113 +extern void coord_item_move_to(coord_t * coord, int items);
88114 +extern void coord_unit_move_to(coord_t * coord, int units);
88115 +
88116 +/* there are two types of repetitive accesses (ra): intra-syscall
88117 +   (local) and inter-syscall (global). Local ra is used when
88118 +   during single syscall we add/delete several items and units in the
88119 +   same place in a tree. Note that plan-A fragments local ra by
88120 +   separating stat-data and file body in key-space. Global ra is
88121 +   used when user does repetitive modifications in the same place in a
88122 +   tree.
88123 +
88124 +   Our ra implementation serves following purposes:
88125 +    1 it affects balancing decisions so that next operation in a row
88126 +      can be performed faster;
88127 +    2 it affects lower-level read-ahead in page-cache;
88128 +    3 it allows to avoid unnecessary lookups by maintaining some state
88129 +      across several operations (this is only for local ra);
88130 +    4 it leaves room for lazy-micro-balancing: when we start a sequence of
88131 +      operations they are performed without actually doing any intra-node
88132 +      shifts, until we finish sequence or scope of sequence leaves
88133 +      current node, only then we really pack node (local ra only).
88134 +*/
88135 +
88136 +/* another thing that can be useful is to keep per-tree and/or
88137 +   per-process cache of recent lookups. This cache can be organised as a
88138 +   list of block numbers of formatted nodes sorted by starting key in
88139 +   this node. Balancings should invalidate appropriate parts of this
88140 +   cache.
88141 +*/
88142 +/* tree lookup entry points; the __u32 flags argument of coord_by_key() carries cbk_flags bits */
88143 +lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
88144 +                          coord_t * coord, lock_handle * handle,
88145 +                          znode_lock_mode lock, lookup_bias bias,
88146 +                          tree_level lock_level, tree_level stop_level, __u32 flags,
88147 +                          ra_info_t *);
88148 +
88149 +lookup_result object_lookup(struct inode *object,
88150 +                           const reiser4_key * key,
88151 +                           coord_t * coord,
88152 +                           lock_handle * lh,
88153 +                           znode_lock_mode lock_mode,
88154 +                           lookup_bias bias,
88155 +                           tree_level lock_level,
88156 +                           tree_level stop_level,
88157 +                           __u32 flags,
88158 +                           ra_info_t *info);
88159 +/* tree modification API (insert, cut/kill, resize, shift, child pointers); declarations only */
88160 +insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
88161 +                           reiser4_item_data * data, coord_t * coord,
88162 +                           lock_handle * lh,
88163 +                           tree_level stop_level, __u32 flags);
88164 +insert_result insert_by_coord(coord_t * coord,
88165 +                             reiser4_item_data * data, const reiser4_key * key,
88166 +                             lock_handle * lh,
88167 +                             __u32);
88168 +insert_result insert_extent_by_coord(coord_t * coord,
88169 +                                    reiser4_item_data * data, const reiser4_key * key, lock_handle * lh);
88170 +int cut_node_content(coord_t *from, coord_t *to,
88171 +                    const reiser4_key *from_key, const reiser4_key *to_key,
88172 +                    reiser4_key *smallest_removed);
88173 +int kill_node_content(coord_t *from, coord_t *to,
88174 +                     const reiser4_key *from_key, const reiser4_key *to_key,
88175 +                     reiser4_key *smallest_removed,
88176 +                     znode *locked_left_neighbor,
88177 +                     struct inode *inode);
88178 +
88179 +int resize_item(coord_t * coord, reiser4_item_data * data,
88180 +               reiser4_key * key, lock_handle * lh, cop_insert_flag);
88181 +int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key, reiser4_item_data * data, unsigned);
88182 +int insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
88183 +int find_new_child_ptr(znode * parent, znode * child, znode * left, coord_t * result);
88184 +
88185 +int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
88186 +int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
88187 +
88188 +void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end);
88189 +
88190 +extern int cut_tree_object(reiser4_tree*, const reiser4_key*, const reiser4_key*, reiser4_key*, struct inode*, int lazy);
88191 +extern int cut_tree(reiser4_tree *tree, const reiser4_key *from, const reiser4_key *to, struct inode*, int lazy);
88192 +/* NOTE(review): find_new_child_ptr() is declared a second time below; the first declaration is a few lines up */
88193 +extern int delete_node(znode * node, reiser4_key *, struct inode *);
88194 +extern int check_tree_pointer(const coord_t * pointer, const znode * child);
88195 +extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG, znode * left, coord_t * result);
88196 +extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
88197 +extern int find_child_by_addr(znode * parent, znode * child, coord_t * result);
88198 +extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent, znode *child);
88199 +extern znode *child_znode(const coord_t * in_parent, znode * parent, int incore_p, int setup_dkeys_p);
88200 +/* cbk cache interface (init, done, invalidate, add); declarations only */
88201 +extern int cbk_cache_init(cbk_cache * cache);
88202 +extern void cbk_cache_done(cbk_cache * cache);
88203 +extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
88204 +extern void cbk_cache_add(const znode * node);
88205 +
88206 +extern const char *bias_name(lookup_bias bias);
88207 +extern char *sprint_address(const reiser4_block_nr * block);
88208 +/* debugging output: real functions when REISER4_DEBUG_OUTPUT is set, otherwise each call expands to noop */
88209 +#if REISER4_DEBUG_OUTPUT
88210 +extern void print_coord_content(const char *prefix, coord_t * p);
88211 +extern void print_address(const char *prefix, const reiser4_block_nr * block);
88212 +extern void print_tree_rec(const char *prefix, reiser4_tree * tree, __u32 flags);
88213 +extern void print_cbk_slot(const char *prefix, const cbk_cache_slot * slot);
88214 +extern void print_cbk_cache(const char *prefix, const cbk_cache * cache);
88215 +#else
88216 +#define print_coord_content(p, c) noop
88217 +#define print_address(p, b) noop
88218 +#define print_tree_rec(p, f, t) noop
88219 +#define print_cbk_slot(p, s) noop
88220 +#define print_cbk_cache(p, c) noop
88221 +#endif
88222 +
88223 +extern void forget_znode(lock_handle * handle);
88224 +extern int deallocate_znode(znode * node);
88225 +
88226 +extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
88227 +extern void *unallocated_disk_addr_to_ptr(const reiser4_block_nr * addr);
88228 +
88229 +/* struct used internally to pack all the numerous arguments of tree lookup.
88230 +    Used to avoid passing a lot of arguments to helper functions. */
88231 +typedef struct cbk_handle {
88232 +       /* tree we are in */
88233 +       reiser4_tree *tree;
88234 +       /* key we are going after */
88235 +       const reiser4_key *key;
88236 +       /* coord we will store result in */
88237 +       coord_t *coord;
88238 +       /* type of lock to take on target node */
88239 +       znode_lock_mode lock_mode;
88240 +       /* lookup bias. See comments at the declaration of lookup_bias */
88241 +       lookup_bias bias;
88242 +       /* lock level: level starting from which tree traversal starts taking
88243 +        * write locks. */
88244 +       tree_level lock_level;
88245 +       /* level where search will stop. Either item will be found between
88246 +          lock_level and stop_level, or CBK_COORD_NOTFOUND will be
88247 +          returned.
88248 +       */
88249 +       tree_level stop_level;
88250 +       /* level we are currently at */
88251 +       tree_level level;
88252 +       /* block number of @active node. Tree traversal operates on two
88253 +          nodes: active and parent.  */
88254 +       reiser4_block_nr block;
88255 +       /* put here error message to be printed by caller */
88256 +       const char *error;
88257 +       /* result passed back to caller */
88258 +       lookup_result result;
88259 +       /* lock handles for the parent and the active node */
88260 +       lock_handle *parent_lh;
88261 +       lock_handle *active_lh;
88262 +       reiser4_key ld_key; /* NOTE(review): presumably left delimiting key of the active node - confirm */
88263 +       reiser4_key rd_key; /* NOTE(review): presumably right delimiting key of the active node - confirm */
88264 +       /* flags, passed to the cbk routine. Bits of this bitmask are defined
88265 +          in tree.h:cbk_flags enum. */
88266 +       __u32 flags;
88267 +       ra_info_t *ra_info; /* NOTE(review): presumably the ra_info_t * given to coord_by_key()/object_lookup() - confirm */
88268 +       struct inode *object; /* NOTE(review): presumably the inode given to object_lookup() - confirm */
88269 +} cbk_handle;
88270 +
88271 +extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
88272 +
88273 +/* implemented in eottl.c; result is also reported through @outcome */
88274 +extern int handle_eottl(cbk_handle * h, int *outcome);
88275 +
88276 +int lookup_multikey(cbk_handle * handle, int nr_keys);
88277 +int lookup_couple(reiser4_tree * tree,
88278 +                 const reiser4_key * key1, const reiser4_key * key2,
88279 +                 coord_t * coord1, coord_t * coord2,
88280 +                 lock_handle * lh1, lock_handle * lh2,
88281 +                 znode_lock_mode lock_mode, lookup_bias bias,
88282 +                 tree_level lock_level, tree_level stop_level, __u32 flags, int *result1, int *result2);
88283 +
88284 +/* ordering constraint for tree lock: tree lock is "strongest". Expansion is parenthesized as a whole. */
88285 +#define rw_ordering_pred_tree(tree)                    \
88286 +       ((lock_counters()->spin_locked_txnh == 0) &&    \
88287 +        (lock_counters()->rw_locked_tree == 0))
88288 +
88289 +/* Define spin_lock_tree, spin_unlock_tree, and spin_tree_is_locked:
88290 +   spin lock protecting znode hash, and parent and sibling pointers. */
88291 +RW_LOCK_FUNCTIONS(tree, reiser4_tree, tree_lock);
88292 +
88293 +/* ordering constraint for delimiting key lock: dk lock is weaker than
88294 +   tree lock. Expansion is parenthesized as a whole so it composes safely with !, || etc. */
88295 +#define rw_ordering_pred_dk( tree )                    \
88296 +       ((lock_counters()->rw_locked_tree == 0) &&      \
88297 +        (lock_counters()->spin_locked_jnode == 0) &&   \
88298 +        (lock_counters()->rw_locked_zlock == 0) &&     \
88299 +        (lock_counters()->spin_locked_txnh == 0) &&    \
88300 +        (lock_counters()->spin_locked_atom == 0) &&    \
88301 +        (lock_counters()->spin_locked_inode_object == 0) &&    \
88302 +        (lock_counters()->spin_locked_txnmgr == 0))
88303 +
88304 +/* Define spin_lock_dk(), spin_unlock_dk(), etc: locking for delimiting
88305 +   keys. */
88306 +RW_LOCK_FUNCTIONS(dk, reiser4_tree, dk_lock);
88307 +/* check_tree(): with REISER4_DEBUG, print_tree_rec() on current_tree with REISER4_TREE_CHECK; otherwise noop */
88308 +#if REISER4_DEBUG
88309 +#define check_tree() print_tree_rec( "", current_tree, REISER4_TREE_CHECK )
88310 +#else
88311 +#define check_tree() noop
88312 +#endif
88313 +
88314 +/* estimate api: every function returns a reiser4_block_nr count. Implementation is in estimate.c */
88315 +reiser4_block_nr estimate_internal_amount(reiser4_block_nr childen, tree_level);
88316 +reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
88317 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
88318 +reiser4_block_nr estimate_insert_flow(tree_level);
88319 +reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
88320 +reiser4_block_nr calc_estimate_one_insert(tree_level);
88321 +reiser4_block_nr estimate_disk_cluster(struct inode *);
88322 +reiser4_block_nr estimate_insert_cluster(struct inode *, int);
88323 +
88324 +/* take read (@takeread != 0) or write tree lock; @takeread may be any expression, it is parenthesized */
88325 +#define XLOCK_TREE(tree, takeread)                             \
88326 +       ((takeread) ? RLOCK_TREE(tree) : WLOCK_TREE(tree))
88327 +
88328 +/* release read (@takeread != 0) or write tree lock; @takeread may be any expression, it is parenthesized */
88329 +#define XUNLOCK_TREE(tree, takeread)                           \
88330 +       ((takeread) ? RUNLOCK_TREE(tree) : WUNLOCK_TREE(tree))
88331 +
88332 +/* __REISER4_TREE_H__ */
88333 +#endif
88334 +
88335 +/* Make Linus happy.
88336 +   Local variables:
88337 +   c-indentation-style: "K&R"
88338 +   mode-name: "LC"
88339 +   c-basic-offset: 8
88340 +   tab-width: 8
88341 +   fill-column: 120
88342 +   scroll-step: 1
88343 +   End:
88344 +*/
88345 diff -rupN linux-2.6.8-rc3/fs/reiser4/tree_mod.c linux-2.6.8-rc3-a/fs/reiser4/tree_mod.c
88346 --- linux-2.6.8-rc3/fs/reiser4/tree_mod.c       1970-01-01 03:00:00.000000000 +0300
88347 +++ linux-2.6.8-rc3-a/fs/reiser4/tree_mod.c     2004-08-05 21:20:53.306613402 +0400
88348 @@ -0,0 +1,363 @@
88349 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
88350 + * reiser4/README */
88351 +
88352 +/*
88353 + * Functions to add/delete new nodes to/from the tree.
88354 + *
88355 + * Functions from this file are used by carry (see carry*) to handle:
88356 + *
88357 + *     . insertion of new formatted node into tree
88358 + *
88359 + *     . addition of new tree root, increasing tree height
88360 + *
88361 + *     . removing tree root, decreasing tree height
88362 + *
88363 + */
88364 +
88365 +#include "forward.h"
88366 +#include "debug.h"
88367 +#include "dformat.h"
88368 +#include "key.h"
88369 +#include "coord.h"
88370 +#include "plugin/plugin.h"
88371 +#include "jnode.h"
88372 +#include "znode.h"
88373 +#include "tree_mod.h"
88374 +#include "block_alloc.h"
88375 +#include "tree_walk.h"
88376 +#include "tree.h"
88377 +#include "super.h"
88378 +
88379 +#include <linux/err.h>
88380 +
88381 +static int add_child_ptr(znode * parent, znode * child);
88382 +/* warning only issued if error is not -E_REPEAT */
88383 +#define ewarning( error, ... )                 \
88384 +       if( ( error ) != -E_REPEAT )            \
88385 +               warning( __VA_ARGS__ )
88386 +
88387 +/* allocate new node on the @level in the tree of @brother; returns the znode or an ERR_PTR() value, never NULL */
88388 +reiser4_internal znode *
88389 +new_node(znode * brother /* existing left neighbor of new node */ ,
88390 +        tree_level level       /* tree level at which new node is to
88391 +                                * be allocated */ )
88392 +{
88393 +       znode *result;
88394 +       int retcode;
88395 +       reiser4_block_nr blocknr;
88396 +
88397 +       assert("nikita-930", brother != NULL);
88398 +       assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
88399 +       /* first a (fake) block number, then the znode for it */
88400 +       retcode = assign_fake_blocknr_formatted(&blocknr);
88401 +       if (retcode == 0) {
88402 +               result = zget(znode_get_tree(brother), &blocknr, NULL, level, GFP_KERNEL);
88403 +               if (IS_ERR(result)) {
88404 +                       ewarning(PTR_ERR(result), "nikita-929",
88405 +                                "Cannot allocate znode for carry: %li", PTR_ERR(result));
88406 +                       return result;
88407 +               }
88408 +               /* cheap test, can be executed even when debugging is off */
88409 +               if (!znode_just_created(result)) {
88410 +                       warning("nikita-2213", "Allocated already existing block: %llu", (unsigned long long) blocknr);
88411 +                       zput(result);
88412 +                       return ERR_PTR(RETERR(-EIO));
88413 +               }
88414 +               /* new node uses the tree-wide node plugin */
88415 +               assert("nikita-931", result != NULL);
88416 +               result->nplug = znode_get_tree(brother)->nplug;
88417 +               assert("nikita-933", result->nplug != NULL);
88418 +               /* on success the data reference is released but the zget() reference is kept for the caller */
88419 +               retcode = zinit_new(result, GFP_KERNEL);
88420 +               if (retcode == 0) {
88421 +                       ZF_SET(result, JNODE_CREATED);
88422 +                       zrelse(result);
88423 +               } else {
88424 +                       zput(result);
88425 +                       result = ERR_PTR(retcode);
88426 +               }
88427 +       } else {
88428 +               /* failure to allocate new node during balancing.
88429 +                  This should never happen. Ever. Returning -E_REPEAT
88430 +                  is not viable solution, because "out of disk space"
88431 +                  is not transient error that will go away by itself.
88432 +               */
88433 +               ewarning(retcode, "nikita-928",
88434 +                        "Cannot allocate block for carry: %i", retcode);
88435 +               result = ERR_PTR(retcode);
88436 +       }
88437 +       assert("nikita-1071", result != NULL);
88438 +       return result;
88439 +}
88440 +
88441 +/* allocate new root above @old_root and add it to the tree
88442 +   (@fake must be the "fake" znode above the root, see the asserts below).
88443 +   This helper function is called by add_new_root().
88444 +   Returns the new root znode, or an ERR_PTR() value on failure.
88445 +*/
88446 +reiser4_internal znode *
88447 +add_tree_root(znode * old_root /* existing tree root */ ,
88448 +             znode * fake /* "fake" znode */ )
88449 +{
88450 +       reiser4_tree *tree = znode_get_tree(old_root);
88451 +       znode *new_root = NULL; /* to shut gcc up */
88452 +       int result;
88453 +
88454 +       assert("nikita-1069", old_root != NULL);
88455 +       assert("umka-262", fake != NULL);
88456 +       assert("umka-263", tree != NULL);
88457 +
88458 +       /* "fake" znode---one always hanging just above current root. This
88459 +          node is locked when new root is created or existing root is
88460 +          deleted. Downward tree traversal takes lock on it before taking
88461 +          lock on a root node. This avoids race conditions with root
88462 +          manipulations.
88463 +
88464 +       */
88465 +       assert("nikita-1348", znode_above_root(fake));
88466 +       assert("nikita-1211", znode_is_root(old_root));
88467 +
88468 +       result = 0;
88469 +       if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
88470 +               warning("nikita-1344", "Tree is too tall: %i", tree->height);
88471 +               /* ext2 returns -ENOSPC when it runs out of free inodes with a
88472 +                  following comment (fs/ext2/ialloc.c:441): Is it really
88473 +                  ENOSPC?
88474 +
88475 +                  -EXFULL? -EINVAL?
88476 +               */
88477 +               result = RETERR(-ENOSPC);
88478 +       } else {
88479 +               /* Allocate block for new root. It's not that
88480 +                  important where it will be allocated, as root is
88481 +                  almost always in memory. Moreover, allocate on
88482 +                  flush can be going here.
88483 +               */
88484 +               assert("nikita-1448", znode_is_root(old_root));
88485 +               new_root = new_node(fake, tree->height + 1);
88486 +               if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
88487 +                       lock_handle rlh;
88488 +
88489 +                       init_lh(&rlh);
88490 +                       result = longterm_lock_znode(&rlh, new_root, ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
88491 +                       if (result == 0) {
88492 +                               parent_coord_t *in_parent;
88493 +
88494 +                               znode_make_dirty(fake);
88495 +
88496 +                               /* new root is a child of "fake" node */
88497 +                               WLOCK_TREE(tree);
88498 +
88499 +                               ++tree->height;
88500 +
88501 +                               /* recalculate max balance overhead */
88502 +                               tree->estimate_one_insert = estimate_one_insert_item(tree);
88503 +
88504 +                               tree->root_block = *znode_get_block(new_root);
88505 +                               in_parent = &new_root->in_parent;
88506 +                               init_parent_coord(in_parent, fake);
88507 +                               /* manually insert new root into sibling
88508 +                                * list. With this all nodes involved into
88509 +                                * balancing are connected after balancing is
88510 +                                * done---useful invariant to check. */
88511 +                               sibling_list_insert_nolock(new_root, NULL);
88512 +                               WUNLOCK_TREE(tree);
88513 +
88514 +                               /* insert into new root pointer to the
88515 +                                  @old_root. */
88516 +                               assert("nikita-1110", WITH_DATA(new_root, node_is_empty(new_root)));
88517 +                               WLOCK_DK(tree);
88518 +                               znode_set_ld_key(new_root, min_key());
88519 +                               znode_set_rd_key(new_root, max_key());
88520 +                               WUNLOCK_DK(tree);
88521 +                               if (REISER4_DEBUG) {
88522 +                                       ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
88523 +                                       ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
88524 +                                       ZF_SET(old_root, JNODE_ORPHAN);
88525 +                               }
88526 +                               result = add_child_ptr(new_root, old_root);
88527 +                               done_lh(&rlh);
88528 +                       }
88529 +                       zrelse(new_root); /* pairs with the zload() in the condition above */
88530 +               }
88531 +       }
88532 +       if (result != 0) /* NOTE(review): the reference obtained by new_node() is not dropped on this path - confirm */
88533 +               new_root = ERR_PTR(result);
88534 +       return new_root;
88535 +}
88536 +
88537 +/* fill in &reiser4_item_data describing a pointer to @child
88538 +
88539 +   The filled @data can later be handed to item insertion in order to put a
88540 +   pointer to @child into its parent. Nothing is inserted here.
88541 +
88542 +*/
88543 +reiser4_internal void
88544 +build_child_ptr_data(znode * child     /* node pointer to which will be
88545 +                                        * inserted */ ,
88546 +                    reiser4_item_data * data /* where to store result */ )
88547 +{
88548 +       assert("nikita-1116", child != NULL);
88549 +       assert("nikita-1117", data != NULL);
88550 +
88551 +       /* payload is kernel space memory, exactly one block number long */
88552 +       data->user = 0;
88553 +       data->length = sizeof(reiser4_block_nr);
88554 +       /* subtle: payload is whatever znode_get_block(@child) points to, nothing is copied here */
88555 +       data->data = (char *) znode_get_block(child);
88556 +
88557 +       /* FIXME-VS: hardcoded internal item? */
88558 +       /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
88559 +       data->iplug = item_plugin_by_id(NODE_POINTER_ID);
88560 +}
88561 +
88562 +/* create in empty @parent the item pointing to @child.
88563 +
88564 +   Used when the pointer to the old root is inserted into the new, still
88565 +   empty, root. Returns 0 or the error from zload()/->create_item().
88566 +*/
88567 +static int
88568 +add_child_ptr(znode * parent, znode * child)
88569 +{
88570 +       coord_t slot;
88571 +       reiser4_item_data ptr_item;
88572 +       int ret;
88573 +       reiser4_key *ld_key;
88574 +
88575 +       assert("nikita-1111", parent != NULL);
88576 +       assert("nikita-1112", child != NULL);
88577 +       assert("nikita-1115", znode_get_level(parent) == znode_get_level(child) + 1);
88578 +       /* @parent stays loaded until the zrelse() at the end */
88579 +       ret = zload(parent);
88580 +       if (ret)
88581 +               return ret;
88582 +       assert("nikita-1113", node_is_empty(parent));
88583 +       coord_init_first_unit(&slot, parent);
88584 +       /* describe the pointer item; no extra argument for the item plugin */
88585 +       build_child_ptr_data(child, &ptr_item);
88586 +       ptr_item.arg = NULL;
88587 +       /* item key is the left delimiting key of @child, fetched under dk read lock */
88588 +       ld_key = UNDER_RW(dk, znode_get_tree(parent), read, znode_get_ld_key(child));
88589 +       ret = node_plugin_by_node(parent)->create_item(&slot, ld_key, &ptr_item, NULL);
88590 +       znode_make_dirty(parent);
88591 +       zrelse(parent);
88592 +       return ret;
88593 +}
88594 +
88595 +/* actually remove tree root: @new_root becomes the root, @old_root is reinitialised; returns 0 or an error code */
88596 +static int
88597 +kill_root(reiser4_tree * tree  /* tree from which root is being
88598 +                                * removed */ ,
88599 +         znode * old_root /* root node that is being removed */ ,
88600 +         znode * new_root      /* new root---sole child of *
88601 +                                * @old_root */ ,
88602 +         const reiser4_block_nr * new_root_blk /* disk address of
88603 +                                                * @new_root */ )
88604 +{
88605 +       znode *uber;
88606 +       int result;
88607 +       lock_handle handle_for_uber;
88608 +
88609 +       assert("umka-265", tree != NULL);
88610 +       assert("nikita-1198", new_root != NULL);
88611 +       assert("nikita-1199", znode_get_level(new_root) + 1 == znode_get_level(old_root));
88612 +
88613 +       assert("nikita-1201", znode_is_write_locked(old_root));
88614 +
88615 +       assert("nikita-1203", disk_addr_eq(new_root_blk, znode_get_block(new_root)));
88616 +
88617 +       init_lh(&handle_for_uber);
88618 +       /* obtain and lock "fake" znode protecting changes in tree height. */
88619 +       result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
88620 +                               &handle_for_uber);
88621 +       if (result == 0) {
88622 +               uber = handle_for_uber.node;
88623 +
88624 +               znode_make_dirty(uber);
88625 +
88626 +               /* don't take long term lock on @new_root. Take tree write lock instead. */
88627 +
88628 +               WLOCK_TREE(tree);
88629 +
88630 +               tree->root_block = *new_root_blk;
88631 +               --tree->height;
88632 +
88633 +               /* recalculate max balance overhead */
88634 +               tree->estimate_one_insert = estimate_one_insert_item(tree);
88635 +
88636 +               assert("nikita-1202", tree->height == znode_get_level(new_root));
88637 +
88638 +               /* new root is child of "fake" node */
88639 +               init_parent_coord(&new_root->in_parent, uber);
88640 +               ++ uber->c_count;
88641 +
88642 +               /* sibling_list_insert_nolock(new_root, NULL); */
88643 +               WUNLOCK_TREE(tree);
88644 +
88645 +               /* reinitialise old root. */
88646 +               result = node_plugin_by_node(old_root)->init(old_root);
88647 +               znode_make_dirty(old_root); /* NOTE(review): executed even when ->init() failed - confirm intended */
88648 +               if (result == 0) {
88649 +                       assert("nikita-1279", node_is_empty(old_root));
88650 +                       ZF_SET(old_root, JNODE_HEARD_BANSHEE);
88651 +                       old_root->c_count = 0;
88652 +               }
88653 +       }
88654 +       done_lh(&handle_for_uber); /* reached on both the success and the failure path */
88655 +
88656 +       return result;
88657 +}
88658 +
88659 +/* remove tree root
88660 +
88661 +   Removes the tree root, decreasing tree height by one. On entry @old_root
88662 +   must be the root, hold exactly one item and be write locked (all three are
88663 +   asserted); its only child becomes the new root.
88664 +
88665 +   Removing the root requires the lock on the special "fake" znode that
88666 +   protects changes of tree height; kill_root() takes it. See comments in
88667 +   add_tree_root() for more on this.
88668 +
88669 +   The work is split in two: this outer function looks up the child through
88670 +   the first unit of @old_root and collects the arguments, kill_root() does
88671 +   the actual job, including the update of the parent pointer of the new
88672 +   root. Returns 0 or an error code.
88673 +
88674 +*/
88675 +reiser4_internal int
88676 +kill_tree_root(znode * old_root /* tree root that we are removing */ )
88677 +{
88678 +       int ret;
88679 +       coord_t only_item;
88680 +       znode *child;
88681 +       reiser4_tree *tree;
88682 +
88683 +       assert("umka-266", current_tree != NULL);
88684 +       assert("nikita-1194", old_root != NULL);
88685 +       assert("nikita-1196", znode_is_root(old_root));
88686 +       assert("nikita-1200", node_num_items(old_root) == 1);
88687 +       assert("nikita-1401", znode_is_write_locked(old_root));
88688 +
88689 +       coord_init_first_unit(&only_item, old_root);
88690 +       tree = znode_get_tree(old_root);
88691 +
88692 +       child = child_znode(&only_item, old_root, 0, 1);
88693 +       if (IS_ERR(child))
88694 +               return PTR_ERR(child);
88695 +
88696 +       /* the reference from child_znode() is dropped whatever kill_root() returns */
88697 +       ret = kill_root(tree, old_root, child, znode_get_block(child));
88698 +       zput(child);
88699 +       return ret;
88700 +}
88701 +
88702 +/* Make Linus happy.
88703 +   Local variables:
88704 +   c-indentation-style: "K&R"
88705 +   mode-name: "LC"
88706 +   c-basic-offset: 8
88707 +   tab-width: 8
88708 +   fill-column: 120
88709 +   scroll-step: 1
88710 +   End:
88711 +*/
88712 diff -rupN linux-2.6.8-rc3/fs/reiser4/tree_mod.h linux-2.6.8-rc3-a/fs/reiser4/tree_mod.h
88713 --- linux-2.6.8-rc3/fs/reiser4/tree_mod.h       1970-01-01 03:00:00.000000000 +0300
88714 +++ linux-2.6.8-rc3-a/fs/reiser4/tree_mod.h     2004-08-05 21:20:53.094658109 +0400
88715 @@ -0,0 +1,29 @@
88716 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
88717 + * reiser4/README */
88718 +
88719 +/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
88720 + * comments. */
88721 +
88722 +#if !defined( __REISER4_TREE_MOD_H__ )
88723 +#define __REISER4_TREE_MOD_H__
88724 +
88725 +#include "forward.h"
88726 +/* all four functions are defined in tree_mod.c; the znode * returning ones report failure as ERR_PTR() */
88727 +znode *new_node(znode * brother, tree_level level);
88728 +znode *add_tree_root(znode * old_root, znode * fake);
88729 +int kill_tree_root(znode * old_root);
88730 +void build_child_ptr_data(znode * child, reiser4_item_data * data);
88731 +
88732 +/* __REISER4_TREE_MOD_H__ */
88733 +#endif
88734 +
88735 +/* Make Linus happy.
88736 +   Local variables:
88737 +   c-indentation-style: "K&R"
88738 +   mode-name: "LC"
88739 +   c-basic-offset: 8
88740 +   tab-width: 8
88741 +   fill-column: 120
88742 +   scroll-step: 1
88743 +   End:
88744 +*/
88745 diff -rupN linux-2.6.8-rc3/fs/reiser4/tree_walk.c linux-2.6.8-rc3-a/fs/reiser4/tree_walk.c
88746 --- linux-2.6.8-rc3/fs/reiser4/tree_walk.c      1970-01-01 03:00:00.000000000 +0300
88747 +++ linux-2.6.8-rc3-a/fs/reiser4/tree_walk.c    2004-08-05 21:20:53.027672237 +0400
88748 @@ -0,0 +1,1175 @@
88749 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
88750 + * reiser4/README */
88751 +
88752 +/* Routines and macros to:
88753 +
88754 +   get_left_neighbor()
88755 +
88756 +   get_right_neighbor()
88757 +
88758 +   get_parent()
88759 +
88760 +   get_first_child()
88761 +
88762 +   get_last_child()
88763 +
88764 +   various routines to walk the whole tree and do things to it like
88765 +   repack it, or move it to tertiary storage.  Please make them as
88766 +   generic as is reasonable.
88767 +
88768 +*/
88769 +
88770 +#include "forward.h"
88771 +#include "debug.h"
88772 +#include "dformat.h"
88773 +#include "coord.h"
88774 +#include "plugin/item/item.h"
88775 +#include "jnode.h"
88776 +#include "znode.h"
88777 +#include "tree_walk.h"
88778 +#include "tree.h"
88779 +#include "super.h"
88780 +
88781 +/* These macros are used internally in tree_walk.c so that the single
88782 +   lock_neighbor() routine can be used to lock the parent, the right
88783 +   neighbor and the left neighbor alike */
88784 +#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
88785 +#define FIELD_OFFSET(name)  offsetof(znode, name)
88786 +#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
88787 +#define LEFT_PTR_OFFSET   FIELD_OFFSET(left)
88788 +#define RIGHT_PTR_OFFSET  FIELD_OFFSET(right)
88789 +
88790 +/* Generic procedure to find and long-term lock a `generic' neighbor of @node
88791 +    (left neighbor, right neighbor or parent). The algorithm is the same in all
88792 +    three cases; only the znode field holding the pointer differs, so the field
88793 +    is selected by @ptr_offset, the byte offset of that pointer within the znode
88794 +    structure. Must be called with the tree lock held; the lock is dropped around
88795 +    longterm_lock_znode() and is held again on every return path. */
88796 +static int
88797 +lock_neighbor(
88798 +                    /* resulting lock handle*/
88799 +                    lock_handle * result,
88800 +                    /* znode whose neighbor (or parent) is to be locked */
88801 +                    znode * node,
88802 +                    /* pointer to neighbor (or parent) znode field offset, in bytes from
88803 +                       the base address of znode structure  */
88804 +                    int ptr_offset,
88805 +                    /* lock mode for longterm_lock_znode call */
88806 +                    znode_lock_mode mode,
88807 +                    /* lock request for longterm_lock_znode call */
88808 +                    znode_lock_request req,
88809 +                    /* GN_* flags */
88810 +                    int flags,
88811 +                    int rlocked /* forwarded to XLOCK_TREE()/XUNLOCK_TREE() */)
88812 +{
88813 +       reiser4_tree *tree = znode_get_tree(node);
88814 +       znode *neighbor;
88815 +       int ret;
88816 +
88817 +       assert("umka-236", node != NULL);
88818 +       assert("umka-237", tree != NULL);
88819 +       assert("umka-301", rw_tree_is_locked(tree));
88820 +
88821 +       reiser4_stat_inc_at_level(znode_get_level(node), znode.lock_neighbor);
88822 +
88823 +       if (flags & GN_TRY_LOCK)
88824 +               req |= ZNODE_LOCK_NONBLOCK;
88825 +       if (flags & GN_SAME_ATOM)
88826 +               req |= ZNODE_LOCK_DONT_FUSE;
88827 +
88828 +       /* fetch the neighbor's address through the sibling (or parent) link;
88829 +          leave the loop (and return) if the link is not available. */
88830 +       while (1) {
88831 +               reiser4_stat_inc_at_level(znode_get_level(node),
88832 +                                         znode.lock_neighbor_iteration);
88833 +               neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
88834 +
88835 +               /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if
88836 +                * node pointed by it is not connected.
88837 +                *
88838 +                * However, GN_ALLOW_NOT_CONNECTED option masks "connected"
88839 +                * check and allows passing reference to not connected znode to
88840 +                * subsequent longterm_lock_znode() call.  This kills possible
88841 +                * busy loop if we are trying to get longterm lock on locked but
88842 +                * not yet connected parent node. */
88843 +               if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
88844 +                                         || znode_is_connected(neighbor))) {
88845 +                       return RETERR(-E_NO_NEIGHBOR);
88846 +               }
88847 +
88848 +               /* take a reference to protect it from deletion while the tree lock is dropped. */
88849 +               zref(neighbor);
88850 +
88851 +               XUNLOCK_TREE(tree, rlocked);
88852 +
88853 +               ret = longterm_lock_znode(result, neighbor, mode, req);
88854 +
88855 +               /* The lock handle obtains its own reference, release the one from above. */
88856 +               zput(neighbor);
88857 +
88858 +               XLOCK_TREE(tree, rlocked);
88859 +
88860 +               /* -EINVAL: the node we got a reference to is being
88861 +                  invalidated; restart and re-read the link -- we should
88862 +                  not get a reference to this node again. */
88863 +               if (ret == -EINVAL)
88864 +                       continue;
88865 +               if (ret)
88866 +                       return ret;
88867 +
88868 +               /* check if neighbor link still points to just locked znode;
88869 +                  the link could have been changed while the process slept. */
88870 +               if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
88871 +                       return 0;
88872 +
88873 +               /* znode was locked by mistake; unlock it and restart locking
88874 +                  process from beginning. */
88875 +               XUNLOCK_TREE(tree, rlocked);
88876 +               longterm_unlock_znode(result);
88877 +               XLOCK_TREE(tree, rlocked);
88878 +       }
88879 +}
88880 +/* Long-term lock the parent of @node via lock_neighbor(), run through UNDER_RW(tree, ..., read, ...); accepts GN_* flags. */
88881 +reiser4_internal int
88882 +reiser4_get_parent_flags(lock_handle * result  /* resulting lock handle */,
88883 +                        znode * node /* child node */,
88884 +                        znode_lock_mode mode /* type of lock: read or write */,
88885 +                        int flags /* GN_* flags */)
88886 +{
88887 +       return UNDER_RW(tree, znode_get_tree(node), read,
88888 +                       lock_neighbor(result, node, PARENT_PTR_OFFSET, mode,
88889 +                                     ZNODE_LOCK_HIPRI, flags, 1));
88890 +}
88891 +
88892 +/* Wrapper for reiser4_get_parent_flags(): @only_connected_p == 0 is translated into GN_ALLOW_NOT_CONNECTED. */
88893 +reiser4_internal int
88894 +reiser4_get_parent(lock_handle * result        /* resulting lock
88895 +                                          * handle */ ,
88896 +                  znode * node /* child node */ ,
88897 +                  znode_lock_mode mode /* type of lock: read or write */ ,
88898 +                  int only_connected_p /* if this is true, parent is
88899 +                                        * only returned when it is
88900 +                                        * connected. If parent is
88901 +                                        * unconnected, -E_NO_NEIGHBOR is
88902 +                                        * returned. Normal users should
88903 +                                        * pass 1 here. Only during carry
88904 +                                        * we want to access still
88905 +                                        * unconnected parents. */ )
88906 +{
88907 +       assert("umka-238", znode_get_tree(node) != NULL);
88908 +
88909 +       return reiser4_get_parent_flags(result, node, mode,
88910 +                                       only_connected_p ? 0 : GN_ALLOW_NOT_CONNECTED);
88911 +}
88912 +
88913 +/* Wrapper around lock_neighbor(): locks the left neighbor (ZNODE_LOCK_LOPRI request) if GN_GO_LEFT
88914 +   is set in @flags, otherwise the right one (ZNODE_LOCK_HIPRI); -E_NO_NEIGHBOR is reported as -ENOENT */
88915 +/* Audited by: umka (2002.06.14) */
88916 +static inline int
88917 +lock_side_neighbor(lock_handle * result,
88918 +                  znode * node,
88919 +                  znode_lock_mode mode,
88920 +                  int flags, int rlocked)
88921 +{
88922 +       int ret;
88923 +       int ptr_offset;
88924 +       znode_lock_request req;
88925 +
88926 +       if (flags & GN_GO_LEFT) {
88927 +               ptr_offset = LEFT_PTR_OFFSET;
88928 +               req = ZNODE_LOCK_LOPRI;
88929 +       } else {
88930 +               ptr_offset = RIGHT_PTR_OFFSET;
88931 +               req = ZNODE_LOCK_HIPRI;
88932 +       }
88933 +
88934 +       ret = lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
88935 +
88936 +       if (ret == -E_NO_NEIGHBOR)      /* if we walk left or right -E_NO_NEIGHBOR does not
88937 +                                  * guarantee that neighbor is absent in the
88938 +                                  * tree; in this case we return -ENOENT --
88939 +                                  * means neighbor at least not found in
88940 +                                  * cache */
88941 +               return RETERR(-ENOENT);
88942 +
88943 +       return ret;
88944 +}
88945 +
88946 +#if REISER4_DEBUG_SIBLING_LIST /* debug-only consistency check of sibling links; always returns 1 */
88947 +int check_sibling_list(znode * node)
88948 +{
88949 +       znode *scan;    /* node whose link is being verified */
88950 +       znode *next;    /* its neighbor in the direction of the walk */
88951 +
88952 +       assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
88953 +
88954 +       if (node == NULL)       /* nothing to check */
88955 +               return 1;
88956 +
88957 +       if (ZF_ISSET(node, JNODE_RIP))  /* nodes flagged JNODE_RIP are not checked */
88958 +               return 1;
88959 +
88960 +       assert("nikita-3270", node != NULL);
88961 +       assert("nikita-3269", rw_tree_is_write_locked(znode_get_tree(node)));
88962 +
88963 +       for (scan = node; znode_is_left_connected(scan); scan = next) { /* walk to the left */
88964 +               next = scan->left;
88965 +               if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
88966 +                       assert("nikita-3271", znode_is_right_connected(next));
88967 +                       assert("nikita-3272", next->right == scan);     /* link must point back */
88968 +               } else
88969 +                       break;
88970 +       }
88971 +       for (scan = node; znode_is_right_connected(scan); scan = next) { /* walk to the right */
88972 +               next = scan->right;
88973 +               if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
88974 +                       assert("nikita-3273", znode_is_left_connected(next));
88975 +                       assert("nikita-3274", next->left == scan);      /* link must point back */
88976 +               } else
88977 +                       break;
88978 +       }
88979 +       return 1;
88980 +}
88981 +#endif /* REISER4_DEBUG_SIBLING_LIST */
88982 +
88983 +/* Znode sibling pointers maintenance. */
88984 +
88985 +/* Znode sibling pointers are established between any neighboring nodes which
88986 +   are in cache.  There are two znode state bits (JNODE_LEFT_CONNECTED,
88987 +   JNODE_RIGHT_CONNECTED): if the left or right sibling pointer contains an
88988 +   actual value (even NULL), the corresponding JNODE_*_CONNECTED bit is set.
88989 +
88990 +   Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
88991 +   take care of searching for znode neighbors (a hash table lookup may be
88992 +   required), establishing sibling pointers between them and setting the
88993 +   JNODE_*_CONNECTED state bits. */
88994 +
88995 +/* Adjust the sibling pointers and `connected' state bits of two
88996 +   neighbors; works if one neighbor is NULL (was not found). */
88997 +
88998 +/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */
88999 +reiser4_internal void
89000 +link_left_and_right(znode * left, znode * right)
89001 +{
89002 +       assert("nikita-3275", check_sibling_list(left));
89003 +       assert("nikita-3275", check_sibling_list(right));
89004 +
89005 +       if (left != NULL) {
89006 +               if (left->right == NULL) {
89007 +                       left->right = right;
89008 +                       ZF_SET(left, JNODE_RIGHT_CONNECTED);
89009 +               } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)) { /* detach the old right neighbor first */
89010 +                       left->right->left = NULL;
89011 +                       left->right = right;
89012 +                       ZF_SET(left, JNODE_RIGHT_CONNECTED);
89013 +               } else
89014 +                       /*
89015 +                        * there is a race condition in renew_sibling_link()
89016 +                        * and the assertions below check that it is the only
89017 +                        * one there. Thread T1 calls renew_sibling_link() without
89018 +                        * GN_NO_ALLOC flag. zlook() doesn't find neighbor
89019 +                        * node, but before T1 gets to the
89020 +                        * link_left_and_right(), another thread T2 creates
89021 +                        * neighbor node and connects it. check for
89022 +                        * left->right == NULL above protects T1 from
89023 +                        * overwriting correct left->right pointer installed
89024 +                        * by T2.
89025 +                        */
89026 +                       assert("nikita-3302",
89027 +                              right == NULL || left->right == right);
89028 +       }
89029 +       if (right != NULL) {
89030 +               if (right->left == NULL) {
89031 +                       right->left = left;
89032 +                       ZF_SET(right, JNODE_LEFT_CONNECTED);
89033 +               } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)) { /* detach the old left neighbor first */
89034 +                       right->left->right = NULL;
89035 +                       right->left = left;
89036 +                       ZF_SET(right, JNODE_LEFT_CONNECTED);
89037 +               } else
89038 +                       assert("nikita-3303",
89039 +                              left == NULL || right->left == left);
89040 +       }
89041 +       assert("nikita-3275", check_sibling_list(left));
89042 +       assert("nikita-3275", check_sibling_list(right));
89043 +}
89044 +
89045 +/* Link @first and @second as siblings; @second becomes the left one if @to_left. Audited by: umka (2002.06.14) */
89046 +static void
89047 +link_znodes(znode * first, znode * second, int to_left)
89048 +{
89049 +       if (to_left)
89050 +               link_left_and_right(second, first);
89051 +       else
89052 +               link_left_and_right(first, second);
89053 +}
89054 +
89055 +/* Move @coord to the next unit in the horizontal direction (to the left if
89056 +   GN_GO_LEFT is set in @flags, otherwise to the right), crossing the node
89057 +   boundary if necessary. Should be called under the tree lock: it protects
89058 +   the nonexistence of the sibling link on the parent level if
89059 +   lock_side_neighbor() fails with -ENOENT. */
89060 +static int
89061 +far_next_coord(coord_t * coord, lock_handle * handle, int flags)
89062 +{
89063 +       int ret;
89064 +       znode *node;
89065 +       reiser4_tree *tree;
89066 +
89067 +       assert("umka-243", coord != NULL);
89068 +       assert("umka-244", handle != NULL);
89069 +
89070 +       handle->owner = NULL;   /* mark lock handle as unused */
89071 +
89072 +       ret = (flags & GN_GO_LEFT) ? coord_prev_unit(coord) : coord_next_unit(coord);
89073 +       if (!ret)       /* the step stayed within the current node */
89074 +               return 0;
89075 +
89076 +       ret = lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
89077 +       if (ret)
89078 +               return ret;
89079 +
89080 +       node = handle->node;
89081 +       tree = znode_get_tree(node);
89082 +       WUNLOCK_TREE(tree);
89083 +
89084 +       coord_init_zero(coord);
89085 +
89086 +       /* We avoid synchronous read here if it is specified by flag. */
89087 +       if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
89088 +               ret = jstartio(ZJNODE(handle->node));
89089 +               if (!ret)
89090 +                       ret = -E_REPEAT;
89091 +               goto error_locked;
89092 +       }
89093 +
89094 +       /* the matching zrelse() should be called by the clients of
89095 +          far_next_coord(), at the place where this node gets unlocked. */
89096 +       ret = zload(handle->node);
89097 +       if (ret)
89098 +               goto error_locked;
89099 +
89100 +       if (flags & GN_GO_LEFT)
89101 +               coord_init_last_unit(coord, node);
89102 +       else
89103 +               coord_init_first_unit(coord, node);
89104 +
89105 +       if (0) {        /* body is entered only through goto error_locked */
89106 + error_locked:
89107 +               longterm_unlock_znode(handle);
89108 +       }
89109 +       WLOCK_TREE(tree);       /* re-take the tree lock dropped above before returning */
89110 +       return ret;
89111 +}
89112 +
89113 +/* Very significant function which performs a step in the horizontal direction
89114 +   when the sibling pointer is not available.  Actually, it is the only
89115 +   function which does it.
89116 +   Note: this function does not restore locking status at exit; the caller
89117 +   must take care of proper unlocking and zrelse-ing of @handle. */
89118 +static int
89119 +renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child, tree_level level, int flags, int *nr_locked)
89120 +{
89121 +       int ret;
89122 +       int to_left = flags & GN_GO_LEFT;
89123 +       reiser4_block_nr da;
89124 +       /* parent of the neighbor node; it is set to @coord's node until we
89125 +          detect that child and neighbor node do not share one parent */
89126 +       znode *side_parent = coord->node;
89127 +       reiser4_tree *tree = znode_get_tree(child);
89128 +       znode *neighbor = NULL;
89129 +
89130 +       assert("umka-245", coord != NULL);      /* NOTE: @coord was already dereferenced above */
89131 +       assert("umka-246", handle != NULL);
89132 +       assert("umka-247", child != NULL);
89133 +       assert("umka-303", tree != NULL);
89134 +
89135 +       WLOCK_TREE(tree);
89136 +       ret = far_next_coord(coord, handle, flags);
89137 +
89138 +       if (ret) {
89139 +               if (ret != -ENOENT) {
89140 +                       WUNLOCK_TREE(tree);
89141 +                       return ret;
89142 +               }
89143 +       } else {
89144 +               item_plugin *iplug;
89145 +
89146 +               if (handle->owner != NULL) {    /* far_next_coord() locked another node */
89147 +                       (*nr_locked)++;
89148 +                       side_parent = handle->node;
89149 +               }
89150 +
89151 +               /* does the coord point to an internal item? We do not
89152 +                  support sibling pointers between znodes of formatted and
89153 +                  unformatted nodes and return -E_NO_NEIGHBOR in that case. */
89154 +               iplug = item_plugin_by_coord(coord);
89155 +               if (!item_is_internal(coord)) {
89156 +                       link_znodes(child, NULL, to_left);
89157 +                       WUNLOCK_TREE(tree);
89158 +                       /* we know there can't be formatted neighbor */
89159 +                       return RETERR(-E_NO_NEIGHBOR);
89160 +               }
89161 +               WUNLOCK_TREE(tree);
89162 +
89163 +               iplug->s.internal.down_link(coord, NULL, &da);
89164 +
89165 +               if (flags & GN_NO_ALLOC) {
89166 +                       neighbor = zlook(tree, &da);
89167 +               } else {
89168 +                       neighbor = zget(tree, &da, side_parent, level, GFP_KERNEL);
89169 +               }
89170 +
89171 +               if (IS_ERR(neighbor)) {
89172 +                       ret = PTR_ERR(neighbor);
89173 +                       return ret;
89174 +               }
89175 +
89176 +               if (neighbor)
89177 +                       /* update delimiting keys */
89178 +                       set_child_delimiting_keys(coord->node, coord, neighbor);
89179 +
89180 +               WLOCK_TREE(tree);
89181 +       }
89182 +
89183 +       if (likely(neighbor == NULL ||
89184 +                  (znode_get_level(child) == znode_get_level(neighbor) && child != neighbor)))
89185 +               link_znodes(child, neighbor, to_left);
89186 +       else {
89187 +               warning("nikita-3532",
89188 +                       "Sibling nodes on the different levels: %i != %i\n",
89189 +                       znode_get_level(child), znode_get_level(neighbor));
89190 +               ret = RETERR(-EIO);
89191 +       }
89192 +
89193 +       WUNLOCK_TREE(tree);
89194 +
89195 +       /* the neighbor reference is dropped only if GN_NO_ALLOC is set; otherwise it is kept */
89196 +       if (neighbor != NULL && (flags & GN_NO_ALLOC))
89197 +               /* atomic_dec(&ZJNODE(neighbor)->x_count); */
89198 +               zput(neighbor);
89199 +
89200 +       return ret;
89201 +}
89202 +
89203 +/* Establish the sibling relation of @node on one side (left if GN_GO_LEFT is set in @flags, right otherwise).
89204 +   Returns 0 on success or when no neighbor was found, else an error code. Audited by: umka (2002.06.14) */
89205 +static int
89206 +connect_one_side(coord_t * coord, znode * node, int flags)
89207 +{
89208 +       coord_t local;
89209 +       lock_handle handle;
89210 +       int nr_locked = 0;      /* incremented by renew_sibling_link(); must not start indeterminate */
89211 +       int ret;
89212 +
89213 +       assert("umka-248", coord != NULL);
89214 +       assert("umka-249", node != NULL);
89215 +
89216 +       coord_dup_nocheck(&local, coord);       /* work on a copy, @coord itself is left untouched */
89217 +
89218 +       init_lh(&handle);
89219 +
89220 +       ret = renew_sibling_link(&local, &handle, node, znode_get_level(node), flags | GN_NO_ALLOC, &nr_locked);
89221 +
89222 +       if (handle.owner != NULL) {
89223 +               /* complementary operations for zload() and lock() in far_next_coord() */
89224 +               zrelse(handle.node);
89225 +               longterm_unlock_znode(&handle);
89226 +       }
89227 +
89228 +       /* -ENOENT and -E_NO_NEIGHBOR are not interesting for us because we
89229 +          run renew_sibling_link() only for znode connection. */
89230 +       if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
89231 +               return 0;
89232 +
89233 +       return ret;
89234 +}
89235 +
89236 +/* If @child is not in `connected' state on some side, performs a hash search for that neighbor node
89237 +   (without allocating it: GN_NO_ALLOC) and establishes the horizontal sibling link; returns 0 or an error */
89238 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
89239 +reiser4_internal int
89240 +connect_znode(coord_t * parent_coord, znode * child)
89241 +{
89242 +       reiser4_tree *tree = znode_get_tree(child);
89243 +       int ret = 0;
89244 +
89245 +       assert("zam-330", parent_coord != NULL);
89246 +       assert("zam-331", child != NULL);
89247 +       assert("zam-332", parent_coord->node != NULL);
89248 +       assert("umka-305", tree != NULL);
89249 +
89250 +       /* it is trivial to `connect' root znode because it can't have
89251 +          neighbors */
89252 +       if (znode_above_root(parent_coord->node)) {
89253 +               child->left = NULL;
89254 +               child->right = NULL;
89255 +               ZF_SET(child, JNODE_LEFT_CONNECTED);
89256 +               ZF_SET(child, JNODE_RIGHT_CONNECTED);
89257 +               return 0;
89258 +       }
89259 +
89260 +       /* load parent node; the matching zrelse() is at zrelse_and_ret */
89261 +       coord_clear_iplug(parent_coord);
89262 +       ret = zload(parent_coord->node);
89263 +
89264 +       if (ret != 0)
89265 +               return ret;
89266 +
89267 +       /* protect `connected' state check by tree_lock */
89268 +       RLOCK_TREE(tree);
89269 +
89270 +       if (!znode_is_right_connected(child)) {
89271 +               RUNLOCK_TREE(tree);
89272 +               /* connect right (default is right) */
89273 +               ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
89274 +               if (ret)
89275 +                       goto zrelse_and_ret;
89276 +
89277 +               RLOCK_TREE(tree);
89278 +       }
89279 +
89280 +       ret = znode_is_left_connected(child);   /* sampled under the tree read lock */
89281 +
89282 +       RUNLOCK_TREE(tree);
89283 +
89284 +       if (!ret) {
89285 +               ret = connect_one_side(parent_coord, child, GN_NO_ALLOC | GN_GO_LEFT);
89286 +       } else
89287 +               ret = 0;        /* already left-connected: nothing to do */
89288 +
89289 +zrelse_and_ret:
89290 +       zrelse(parent_coord->node);
89291 +
89292 +       return ret;
89293 +}
89294 +
89295 +/* This function is like renew_sibling_link() but allocates the neighbor node
89296 +   if it doesn't exist and `connects' it. It may require making two steps in
89297 +   the horizontal direction: the first one finds/allocates the neighbor node,
89298 +   the second one finds the neighbor of the neighbor in order to connect the
89299 +   freshly allocated znode. */
89300 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
89301 +static int
89302 +renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
89303 +{
89304 +       coord_t local;
89305 +       lock_handle empty[2];   /* one handle per renew_sibling_link() call below */
89306 +       reiser4_tree *tree = znode_get_tree(node);
89307 +       znode *neighbor = NULL;
89308 +       int nr_locked = 0;      /* number of entries of empty[] to release at out: */
89309 +       int ret;
89310 +
89311 +       assert("umka-250", coord != NULL);
89312 +       assert("umka-251", node != NULL);
89313 +       assert("umka-307", tree != NULL);
89314 +       assert("umka-308", level <= tree->height);
89315 +
89316 +       /* umka (2002.06.14)
89317 +          Here probably should be a check for given "level" validness.
89318 +          Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
89319 +       */
89320 +
89321 +       coord_dup(&local, coord);
89322 +
89323 +       ret = renew_sibling_link(&local, &empty[0], node, level, flags & ~GN_NO_ALLOC, &nr_locked);
89324 +       if (ret)
89325 +               goto out;
89326 +
89327 +       /* tree lock is not needed here because we keep parent node(s) locked
89328 +          and reference to neighbor znode incremented */
89329 +       neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
89330 +
89331 +       ret = UNDER_RW(tree, tree, read, znode_is_connected(neighbor));
89332 +
89333 +       if (ret) {      /* neighbor is already connected: the second step is not needed */
89334 +               ret = 0;
89335 +               goto out;
89336 +       }
89337 +
89338 +       ret = renew_sibling_link(&local, &empty[nr_locked], neighbor, level, flags | GN_NO_ALLOC, &nr_locked);
89339 +       /* second renew_sibling_link() call is used for znode connection only,
89340 +          so we can live with these errors */
89341 +       if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
89342 +               ret = 0;
89343 +
89344 +out:
89345 +
89346 +       for (--nr_locked; nr_locked >= 0; --nr_locked) { /* release in reverse order of locking */
89347 +               zrelse(empty[nr_locked].node);
89348 +               longterm_unlock_znode(&empty[nr_locked]);
89349 +       }
89350 +
89351 +       if (neighbor != NULL)
89352 +               /* decrement znode reference counter without actually
89353 +                  releasing it. */
89354 +               atomic_dec(&ZJNODE(neighbor)->x_count);
89355 +
89356 +       return ret;
89357 +}
89358 +
89359 +/*
89360 +   reiser4_get_neighbor() -- lock node's neighbor.
89361 +
89362 +   reiser4_get_neighbor() locks node's neighbor (left or right one, depends on
89363 +   given parameter) using sibling link to it. If sibling link is not available
89364 +   (i.e. neighbor znode is not in cache) and flags allow read blocks, we go one
89365 +   level up for information about neighbor's disk address. We lock node's
89366 +   parent, if it is common parent for both 'node' and its neighbor, neighbor's
89367 +   disk address is in next (to left or to right) down link from link that points
89368 +   to original node. If not, we need to lock parent's neighbor, read its content
89369 +   and take first(last) downlink with neighbor's disk address.  That locking
89370 +   could be done by using sibling link and lock_neighbor() function, if sibling
89371 +   link exists. In another case we have to go level up again until we find
89372 +   common parent or valid sibling link. Then go down
89373 +   allocating/connecting/locking/reading nodes until neighbor of first one is
89374 +   locked.
89375 +
89376 +   @neighbor:  result lock handle,
89377 +   @node: a node which we lock neighbor of,
89378 +   @lock_mode: lock mode {LM_READ, LM_WRITE},
89379 +   @flags: logical OR of {GN_*} (see description above) subset.
89380 +
89381 +   @return: 0 if success, negative value if lock was impossible due to an error
89382 +   or lack of neighbor node.
89383 +*/
89384 +
89385 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
89386 +reiser4_internal int
89387 +reiser4_get_neighbor (
89388 +       lock_handle * neighbor, znode * node, znode_lock_mode lock_mode, int flags)
89389 +{
89390 +       reiser4_tree *tree = znode_get_tree(node);
89391 +       lock_handle path[REAL_MAX_ZTREE_HEIGHT];        /* path[h]: lock on the ancestor h + 1 levels above @node */
89392 +
89393 +       coord_t coord;
89394 +
89395 +       tree_level base_level;
89396 +       tree_level h = 0;       /* index of the topmost locked entry of path[] */
89397 +       int ret;
89398 +
89399 +       assert("umka-252", tree != NULL);
89400 +       assert("umka-253", neighbor != NULL);
89401 +       assert("umka-254", node != NULL);
89402 +
89403 +       base_level = znode_get_level(node);
89404 +
89405 +       assert("umka-310", base_level <= tree->height);
89406 +
89407 +       coord_init_zero(&coord);
89408 +
89409 +again:
89410 +       /* first, we try to use simple lock_neighbor() which requires sibling
89411 +          link existence */
89412 +       ret = UNDER_RW(tree, tree, read,
89413 +                      lock_side_neighbor(neighbor, node, lock_mode, flags, 1));
89414 +
89415 +       if (!ret) {
89416 +               /* GN_LOAD_NEIGHBOR: NOTE(review): this zload()s @node, not neighbor->node -- confirm intent */
89417 +               if (flags & GN_LOAD_NEIGHBOR) {
89418 +                       ret = zload(node);
89419 +                       if (ret)
89420 +                               longterm_unlock_znode(neighbor);
89421 +               }
89422 +               return ret;
89423 +       }
89424 +
89425 +       /* only -ENOENT means we may look upward and try to connect
89426 +          @node with its neighbor (if @flags allow us to do it) */
89427 +       if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
89428 +               return ret;
89429 +
89430 +       /* before establishing of sibling link we lock parent node; it is
89431 +          required by renew_neighbor() to work.  */
89432 +       init_lh(&path[0]);
89433 +       ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK, 1);
89434 +       if (ret)
89435 +               return ret;
89436 +       if (znode_above_root(path[0].node)) {
89437 +               longterm_unlock_znode(&path[0]);
89438 +               return RETERR(-E_NO_NEIGHBOR);
89439 +       }
89440 +
89441 +       while (1) {
89442 +               znode *child = (h == 0) ? node : path[h - 1].node;
89443 +               znode *parent = path[h].node;
89444 +
89445 +               reiser4_stat_inc_at_level(h + LEAF_LEVEL, sibling_search);
89446 +               ret = zload(parent);
89447 +               if (ret)
89448 +                       break;
89449 +
89450 +               ret = find_child_ptr(parent, child, &coord);
89451 +
89452 +               if (ret) {
89453 +                       zrelse(parent);
89454 +                       break;
89455 +               }
89456 +
89457 +               /* try to establish missing sibling link */
89458 +               ret = renew_neighbor(&coord, child, h + base_level, flags);
89459 +
89460 +               zrelse(parent);
89461 +
89462 +               switch (ret) {
89463 +               case 0:
89464 +                       /* unlocking of parent znode prevents simple
89465 +                          deadlock situation */
89466 +                       done_lh(&path[h]);
89467 +
89468 +                       /* depending on the tree level we are on, we repeat
89469 +                          the first locking attempt ...  */
89470 +                       if (h == 0)
89471 +                               goto again;
89472 +
89473 +                       /* ... or repeat establishing of sibling link at
89474 +                          one level below. */
89475 +                       --h;
89476 +                       break;
89477 +
89478 +               case -ENOENT:
89479 +                       /* sibling link is not available -- we go
89480 +                          upward. */
89481 +                       init_lh(&path[h + 1]);
89482 +                       ret = reiser4_get_parent(&path[h + 1], parent, ZNODE_READ_LOCK, 1);
89483 +                       if (ret)
89484 +                               goto fail;
89485 +                       ++h;
89486 +                       if (znode_above_root(path[h].node)) {
89487 +                               ret = RETERR(-E_NO_NEIGHBOR);
89488 +                               goto fail;
89489 +                       }
89490 +                       break;
89491 +
89492 +               case -E_DEADLOCK:
89493 +                       /* there was a lock request from a hi-pri locker; if
89494 +                          possible, unlock parent nodes and re-lock them.
89495 +                          NOTE(review): path[--h] releases path[h - 1], not path[h] -- confirm */
89496 +                       while (check_deadlock()) {
89497 +                               if (h == 0)
89498 +                                       goto fail;
89499 +
89500 +                               done_lh(&path[--h]);
89501 +                       }
89502 +
89503 +                       break;
89504 +
89505 +               default:        /* other errors. */
89506 +                       goto fail;
89507 +               }
89508 +       }
89509 +fail:
89510 +       ON_DEBUG(check_lock_node_data(node));
89511 +       ON_DEBUG(check_lock_data());
89512 +
89513 +       /* unlock path[h] down to path[0] */
89514 +       do {
89515 +               longterm_unlock_znode(&path[h]);
89516 +               --h;
89517 +       } while (h + 1 != 0);
89518 +
89519 +       return ret;
89520 +}
89521 +
89522 +/* Unlink @node from its sibling list, joining its left and right neighbors to each other. */
89523 +/* Audited by: umka (2002.06.14) */
89524 +reiser4_internal void
89525 +sibling_list_remove(znode * node)
89526 +{
89527 +       assert("umka-255", node != NULL);
89528 +       assert("zam-878", rw_tree_is_write_locked(znode_get_tree(node))); /* caller must hold the tree write lock */
89529 +       assert("nikita-3275", check_sibling_list(node));
89530 +
89531 +       if (znode_is_right_connected(node) && node->right != NULL) {
89532 +               assert("zam-322", znode_is_left_connected(node->right));
89533 +               node->right->left = node->left; /* right neighbor now points past @node */
89534 +       }
89535 +       if (znode_is_left_connected(node) && node->left != NULL) {
89536 +               assert("zam-323", znode_is_right_connected(node->left));
89537 +               node->left->right = node->right; /* left neighbor now points past @node */
89538 +       }
89539 +       ZF_CLR(node, JNODE_LEFT_CONNECTED);
89540 +       ZF_CLR(node, JNODE_RIGHT_CONNECTED);
89541 +       ON_DEBUG(node->left = node->right = NULL); /* @node's own pointers are cleared only under ON_DEBUG */
89542 +       assert("nikita-3276", check_sibling_list(node));
89543 +}
89544 +
89545 +/* Disconnect @node from its sibling list: neighbors' links to @node become NULL (the neighbors are not joined). */
89546 +reiser4_internal void
89547 +sibling_list_drop(znode * node)
89548 +{
89549 +       znode *right;
89550 +       znode *left;
89551 +
89552 +       assert("nikita-2464", node != NULL);
89553 +       assert("nikita-3277", check_sibling_list(node));
89554 +
89555 +       right = node->right;
89556 +       if (right != NULL) {
89557 +               assert("nikita-2465", znode_is_left_connected(right));
89558 +               right->left = NULL; /* right neighbor loses its left link */
89559 +       }
89560 +       left = node->left;
89561 +       if (left != NULL) {
89562 +               assert("zam-323", znode_is_right_connected(left)); /* NOTE(review): label "zam-323" is also used in sibling_list_remove() */
89563 +               left->right = NULL; /* left neighbor loses its right link */
89564 +       }
89565 +       ZF_CLR(node, JNODE_LEFT_CONNECTED);
89566 +       ZF_CLR(node, JNODE_RIGHT_CONNECTED);
89567 +       ON_DEBUG(node->left = node->right = NULL); /* @node's own pointers are cleared only under ON_DEBUG */
89568 +}
89569 +
89570 +/* Insert @new into the sibling list. Regular balancing inserts the new node
89571 +   after (at the right side of) an existing, locked node (@before); the one
89572 +   exception is adding a new tree root node, when @before should be NULL. */
89573 +reiser4_internal void
89574 +sibling_list_insert_nolock(znode * new, znode * before)
89575 +{
89576 +       assert("zam-334", new != NULL);
89577 +       assert("nikita-3298", !znode_is_left_connected(new)); /* @new must not be linked anywhere yet */
89578 +       assert("nikita-3299", !znode_is_right_connected(new));
89579 +       assert("nikita-3300", new->left == NULL);
89580 +       assert("nikita-3301", new->right == NULL);
89581 +       assert("nikita-3278", check_sibling_list(new));
89582 +       assert("nikita-3279", check_sibling_list(before));
89583 +
89584 +       if (before != NULL) {
89585 +               assert("zam-333", znode_is_connected(before));
89586 +               new->right = before->right; /* splice @new between @before and its old right neighbor */
89587 +               new->left = before;
89588 +               if (before->right != NULL)
89589 +                       before->right->left = new;
89590 +               before->right = new;
89591 +       } else {
89592 +               new->right = NULL; /* no @before: @new has no siblings */
89593 +               new->left = NULL;
89594 +       }
89595 +       ZF_SET(new, JNODE_LEFT_CONNECTED); /* both flags are set even when @before is NULL */
89596 +       ZF_SET(new, JNODE_RIGHT_CONNECTED);
89597 +       assert("nikita-3280", check_sibling_list(new));
89598 +       assert("nikita-3281", check_sibling_list(before));
89599 +}
89600 +
89601 +struct tw_handle { /* state of one tree_walk() traversal */
89602 +       /* A key for tree walking (re)start, updated after each successful tree
89603 +        * node processing */
89604 +       reiser4_key            start_key;
89605 +       /* A tree traversal current position. */
89606 +       tap_t                  tap;
89607 +       /* An externally supplied pair of functions for formatted and
89608 +        * unformatted nodes processing. */
89609 +       struct tree_walk_actor * actor;
89610 +       /* It is passed to actor functions as is. */
89611 +       void                 * opaque;
89612 +       /* A direction of a tree traversal: 1 if going from right to left. */
89613 +       unsigned int           go_left:1; /* unsigned: a plain-int 1-bit field may be signed and then holds only 0 and -1 */
89614 +       /* "Done" flag */
89615 +       unsigned int           done:1;
89616 +       /* Current node was processed completely */
89617 +       unsigned int           node_completed:1;
89618 +};
89619 +
89620 +/* Lock the tree root znode into @lock; the attempt is repeated internally while locking the root returns -E_DEADLOCK. */
89621 +static int lock_tree_root (lock_handle * lock, znode_lock_mode mode)
89622 +{
89623 +       int ret;
89624 +
89625 +       reiser4_tree * tree = current_tree;
89626 +       lock_handle uber_znode_lock;
89627 +       znode * root;
89628 +
89629 +       init_lh(&uber_znode_lock);
89630 + again:
89631 +
89632 +       ret = get_uber_znode(tree, mode, ZNODE_LOCK_HIPRI, &uber_znode_lock); /* @mode is applied to the uber znode */
89633 +       if (ret)
89634 +               return ret;
89635 +
89636 +       root = zget(tree, &tree->root_block, uber_znode_lock.node, tree->height, GFP_KERNEL);
89637 +       if (IS_ERR(root)) {
89638 +               done_lh(&uber_znode_lock);
89639 +               return PTR_ERR(root);
89640 +       }
89641 +
89642 +       ret = longterm_lock_znode(lock, root, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI); /* NOTE(review): root is always write-locked, regardless of @mode -- confirm intent */
89643 +
89644 +       zput(root); /* drop the zget() reference and the uber znode lock on every outcome */
89645 +       done_lh(&uber_znode_lock);
89646 +
89647 +       if (ret == -E_DEADLOCK)
89648 +               goto again;
89649 +
89650 +       return ret;
89651 +}
89652 +
89653 +/* Set h->start_key to the key of the unit at the tap's current coord. The tap is
89654 + * loaded only for the duration of the call; returns tap_load()'s result. */
89655 +static int update_start_key(struct tw_handle * h)
89656 +{
89657 +       int ret;
89658 +
89659 +       ret = tap_load(&h->tap);
89660 +       if (ret == 0) {
89661 +               unit_key_by_coord(h->tap.coord, &h->start_key);
89662 +               tap_relse(&h->tap); /* balance the tap_load() above */
89663 +       }
89664 +       return ret;
89665 +}
89666 +
89667 +/* Move the tap to @lock's node, load it, position the coord (copy of @coord, or first/last unit if @coord is NULL), run process_znode and refresh start_key. */
89668 +static int go_next_node (struct tw_handle * h, lock_handle * lock, const coord_t * coord)
89669 +{
89670 +       int ret;
89671 +
89672 +       assert ("zam-948", ergo (coord != NULL, lock->node == coord->node));
89673 +
89674 +       tap_relse(&h->tap); /* release the load held on the node the tap is leaving */
89675 +
89676 +       ret = tap_move(&h->tap, lock);
89677 +       if (ret)
89678 +               return ret; /* note: @lock is not released on this path */
89679 +
89680 +       ret = tap_load(&h->tap);
89681 +       if (ret)
89682 +               goto error;
89683 +
89684 +       if (coord)
89685 +               coord_dup(h->tap.coord, coord);
89686 +       else {
89687 +               if (h->go_left) /* walking right-to-left starts from the last unit */
89688 +                       coord_init_last_unit(h->tap.coord, lock->node);
89689 +               else
89690 +                       coord_init_first_unit(h->tap.coord, lock->node);
89691 +       }
89692 +
89693 +       if (h->actor->process_znode != NULL) { /* the callback is optional */
89694 +               ret = (h->actor->process_znode)(&h->tap, h->opaque);
89695 +               if (ret)
89696 +                       goto error;
89697 +       }
89698 +
89699 +       ret = update_start_key(h);
89700 +
89701 + error:
89702 +       done_lh(lock); /* reached on success as well as on error */
89703 +       return ret;
89704 +}
89705 +
89706 +static void next_unit (struct tw_handle * h) /* step the tap's coord one unit in walk direction */
89707 +{
89708 +       if (h->go_left)
89709 +               h->node_completed = coord_prev_unit(h->tap.coord); /* result of the step is kept as the "node completed" flag */
89710 +       else
89711 +               h->node_completed = coord_next_unit(h->tap.coord);
89712 +}
89713 +
89714 +
89715 +/* Move tree traversal position (which is embedded into struct tw_handle) to the
89716 + * parent of the current node (h->tap.lh->node) and step to the next unit there. */
89717 +static int tw_up (struct tw_handle * h)
89718 +{
89719 +       coord_t coord;
89720 +       lock_handle lock;
89721 +       load_count load;
89722 +       int ret;
89723 +
89724 +       init_lh(&lock);
89725 +       init_load_count(&load);
89726 +
89727 +       do { /* single pass; "break" is used as a jump to the common cleanup below */
89728 +               ret = reiser4_get_parent(&lock, h->tap.lh->node, ZNODE_WRITE_LOCK, 0);
89729 +               if (ret)
89730 +                       break;
89731 +               if (znode_above_root(lock.node)) {
89732 +                       h->done = 1; /* parent is above the root: the walk is finished (ret stays 0) */
89733 +                       break;
89734 +               }
89735 +               ret = incr_load_count_znode(&load, lock.node);
89736 +               if (ret)
89737 +                       break;
89738 +               ret = find_child_ptr(lock.node, h->tap.lh->node, &coord); /* coord of the pointer to the current node within its parent */
89739 +               if (ret)
89740 +                       break;
89741 +               ret = go_next_node(h, &lock, &coord);
89742 +               if (ret)
89743 +                       break;
89744 +               next_unit(h); /* the unit we came from is already processed: advance past it */
89745 +       } while (0);
89746 +
89747 +       done_load_count(&load);
89748 +       done_lh(&lock);
89749 +
89750 +       return ret;
89751 +}
89752 +
89753 +/* Move tree traversal position to the child of the current node pointed to by
89754 + * h->tap.coord: get the child znode, connect and write-lock it, then move the tap. */
89755 +static int tw_down(struct tw_handle * h)
89756 +{
89757 +       reiser4_block_nr block;
89758 +       lock_handle lock;
89759 +       znode * child;
89760 +       item_plugin * iplug;
89761 +       tree_level level = znode_get_level(h->tap.lh->node);
89762 +       int ret;
89763 +
89764 +       assert ("zam-943", item_is_internal(h->tap.coord));
89765 +
89766 +       iplug = item_plugin_by_coord(h->tap.coord);
89767 +       iplug->s.internal.down_link(h->tap.coord, NULL, &block); /* fills @block with the child's block number */
89768 +       init_lh(&lock);
89769 +
89770 +       do { /* single pass; "break" is used as a jump to the common cleanup below */
89771 +               child = zget(current_tree, &block, h->tap.lh->node, level - 1, GFP_KERNEL);
89772 +               if (IS_ERR(child))
89773 +                       return PTR_ERR(child); /* nothing to release: @lock is only initialized so far */
89774 +               ret = connect_znode(h->tap.coord, child);
89775 +               if (ret)
89776 +                       break;
89777 +               ret = longterm_lock_znode(&lock, child, ZNODE_WRITE_LOCK, 0);
89778 +               if (ret)
89779 +                       break;
89780 +               set_child_delimiting_keys(h->tap.coord->node, h->tap.coord, child);
89781 +               ret = go_next_node (h, &lock, NULL); /* NULL coord: start at the first/last unit of the child */
89782 +       } while(0);
89783 +
89784 +       zput(child); /* pairs with the zget() above */
89785 +       done_lh(&lock);
89786 +       return ret;
89787 +}
89788 +/* Traverse the reiser4 tree until either the whole traversal is done (h->done) or
89789 + * an error is encountered (including recoverable ones such as -E_DEADLOCK or
89790 + * -E_REPEAT).  The @actor functions are able to stop tree traversal by returning
89791 + * a non-zero code, which is returned to the caller as is. */
89792 +static int tw_by_handle (struct tw_handle * h)
89793 +{
89794 +       int ret;
89795 +       lock_handle next_lock; /* NOTE(review): initialized and released below but not otherwise used here */
89796 +
89797 +       ret = tap_load(&h->tap);
89798 +       if (ret)
89799 +               return ret;
89800 +
89801 +       init_lh (&next_lock);
89802 +
89803 +       while (!h->done) {
89804 +               tree_level level;
89805 +
89806 +               if (h->node_completed) { /* current node exhausted: go back to its parent */
89807 +                       h->node_completed = 0;
89808 +                       ret = tw_up(h);
89809 +                       if (ret)
89810 +                               break;
89811 +                       continue;
89812 +               }
89813 +
89814 +               assert ("zam-944", coord_is_existing_unit(h->tap.coord));
89815 +               level = znode_get_level(h->tap.lh->node);
89816 +
89817 +               if (level == LEAF_LEVEL) { /* leaf units are not iterated here; the node is marked completed at once */
89818 +                       h->node_completed = 1;
89819 +                       continue;
89820 +               }
89821 +
89822 +               if (item_is_extent(h->tap.coord)) {
89823 +                       if (h->actor->process_extent != NULL) { /* the callback is optional */
89824 +                               ret = (h->actor->process_extent)(&h->tap, h->opaque);
89825 +                               if (ret)
89826 +                                       break;
89827 +                       }
89828 +                       next_unit(h);
89829 +                       continue;
89830 +               }
89831 +
89832 +               ret = tw_down(h); /* neither leaf nor extent: descend into the child */
89833 +               if (ret)
89834 +                       break;
89835 +       }
89836 +
89837 +       done_lh(&next_lock);
89838 +       return ret;
89839 +}
89840 +
89841 +/* Walk the reiser4 tree in parent-first order, from @start_key (or from the root if it is NULL); the pass is restarted after -E_DEADLOCK or -E_REPEAT. */
89842 +reiser4_internal int
89843 +tree_walk (const reiser4_key *start_key, int go_left, struct tree_walk_actor * actor, void * opaque)
89844 +{
89845 +       coord_t coord;
89846 +       lock_handle lock;
89847 +       struct tw_handle handle;
89848 +
89849 +       int ret;
89850 +
89851 +       assert ("zam-950", actor != NULL);
89852 +
89853 +       handle.actor = actor;
89854 +       handle.opaque = opaque;
89855 +       handle.go_left = !!go_left; /* normalize to 0/1 before storing into the 1-bit field */
89856 +       handle.done = 0;
89857 +       handle.node_completed = 0;
89858 +
89859 +       init_lh(&lock);
89860 +
89861 +       if (start_key == NULL) { /* no key given: start from the tree root */
89862 +               if (actor->before) {
89863 +                       ret = actor->before(opaque);
89864 +                       if (ret)
89865 +                               return ret;
89866 +               }
89867 +
89868 +               ret = lock_tree_root(&lock, ZNODE_WRITE_LOCK);
89869 +               if (ret)
89870 +                       return ret;
89871 +               ret = zload(lock.node);
89872 +               if (ret)
89873 +                       goto done;
89874 +
89875 +               if (go_left)
89876 +                       coord_init_last_unit(&coord, lock.node);
89877 +               else
89878 +                       coord_init_first_unit_nocheck(&coord, lock.node);
89879 +
89880 +               zrelse(lock.node);
89881 +               goto no_start_key; /* jumps into the loop body below, skipping the key lookup */
89882 +       } else
89883 +               handle.start_key = *start_key;
89884 +
89885 +       do {
89886 +               if (actor->before) { /* optional callback, invoked before every (re)start */
89887 +                       ret = actor->before(opaque);
89888 +                       if (ret)
89889 +                               return ret;
89890 +               }
89891 +
89892 +               ret = coord_by_key(current_tree, &handle.start_key, &coord, &lock, ZNODE_WRITE_LOCK,
89893 +                                  FIND_MAX_NOT_MORE_THAN, TWIG_LEVEL, LEAF_LEVEL, 0, NULL);
89894 +               if (ret != CBK_COORD_FOUND)
89895 +                       break; /* the lookup result is returned to the caller unchanged */
89896 +       no_start_key:
89897 +               tap_init(&handle.tap, &coord, &lock, ZNODE_WRITE_LOCK);
89898 +
89899 +               ret = update_start_key(&handle);
89900 +               if (ret) {
89901 +                       tap_done(&handle.tap);
89902 +                       break;
89903 +               }
89904 +               ret = tw_by_handle(&handle);
89905 +               tap_done (&handle.tap);
89906 +
89907 +       } while (!handle.done && (ret == -E_DEADLOCK || ret == -E_REPEAT)); /* a restart resumes from handle.start_key */
89908 +
89909 +       done:
89910 +       done_lh(&lock);
89911 +       return ret;
89912 +}
89913 +
89914 +
89915 +/*
89916 +   Local variables:
89917 +   c-indentation-style: "K&R"
89918 +   mode-name: "LC"
89919 +   c-basic-offset: 8
89920 +   tab-width: 8
89921 +   fill-column: 80
89922 +   End:
89923 +*/
89924 diff -rupN linux-2.6.8-rc3/fs/reiser4/tree_walk.h linux-2.6.8-rc3-a/fs/reiser4/tree_walk.h
89925 --- linux-2.6.8-rc3/fs/reiser4/tree_walk.h      1970-01-01 03:00:00.000000000 +0300
89926 +++ linux-2.6.8-rc3-a/fs/reiser4/tree_walk.h    2004-08-05 21:20:53.133649884 +0400
89927 @@ -0,0 +1,117 @@
89928 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
89929 +
89930 +/* definitions of reiser4 tree walk functions */
89931 +
89932 +#ifndef __FS_REISER4_TREE_WALK_H__
89933 +#define __FS_REISER4_TREE_WALK_H__
89934 +
89935 +#include "debug.h"
89936 +#include "forward.h"
89937 +
89938 +/* establishes horizontal links between cached znodes */
89939 +int connect_znode(coord_t * coord, znode * node);
89940 +
89941 +/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
89942 +  have the following common arguments:
89943 +
89944 +  return codes:
89945 +
89946 +  @return : 0        - OK,
89947 +
89948 +ZAM-FIXME-HANS: wrong return code name.  Change them all.
89949 +           -ENOENT  - neighbor is not in cache, what is detected by sibling
89950 +                      link absence.
89951 +
89952 +            -E_NO_NEIGHBOR - we are sure that neighbor (or parent) node cannot be
89953 +                       found (because we are left-/right- most node of the
89954 +                      tree, for example). Also, this return code is for
89955 +                      reiser4_get_parent() when we see no parent link -- it
89956 +                      means that our node is root node.
89957 +
89958 +            -E_DEADLOCK - deadlock detected (request from high-priority process
89959 +                      received); other error codes conform to
89960 +                      /usr/include/asm/errno.h .
89961 +*/
89962 +
89963 +int
89964 +reiser4_get_parent_flags(lock_handle * result, znode * node,
89965 +                        znode_lock_mode mode, int flags);
89966 +
89967 +int reiser4_get_parent(lock_handle * result, znode * node, znode_lock_mode mode, int only_connected_p);
89968 +
89969 +/* Bit definitions for the `flags' argument of reiser4_get_neighbor(). */
89970 +typedef enum {
89971 +       /* If sibling pointer is NULL, this flag allows get_neighbor() to try to
89972 +        * find a not allocated, not connected neighbor by going through upper
89973 +        * levels */
89974 +       GN_CAN_USE_UPPER_LEVELS = 0x1,
89975 +       /* lock the left neighbor instead of the right one */
89976 +       GN_GO_LEFT = 0x2,
89977 +       /* automatically load neighbor node content */
89978 +       GN_LOAD_NEIGHBOR = 0x4,
89979 +       /* return -E_REPEAT if can't lock  */
89980 +       GN_TRY_LOCK = 0x8,
89981 +       /* used internally in tree_walk.c, causes renew_sibling to not
89982 +          allocate neighbor znode, but only search for it in znode cache */
89983 +       GN_NO_ALLOC = 0x10,
89984 +       /* do not go across atom boundaries */
89985 +       GN_SAME_ATOM = 0x20,
89986 +       /* allow locking of not connected nodes */
89987 +       GN_ALLOW_NOT_CONNECTED = 0x40,
89988 +       /* Avoid synchronous jload; instead, call jstartio() and return -E_REPEAT. */
89989 +       GN_ASYNC = 0x80
89990 +} znode_get_neigbor_flags; /* sic: identifier spelling kept as is */
89991 +
89992 +int reiser4_get_neighbor(lock_handle * neighbor, znode * node, znode_lock_mode lock_mode, int flags);
89993 +
89994 +/* Wrappers for the most common usages of reiser4_get_neighbor(): this one forces GN_GO_LEFT into @flags. */
89995 +static inline int
89996 +reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode, int flags)
89997 +{
89998 +       return reiser4_get_neighbor(result, node, lock_mode, flags | GN_GO_LEFT);
89999 +}
90000 +
90001 +static inline int /* right-neighbor wrapper: clears GN_GO_LEFT from @flags */
90002 +reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode, int flags)
90003 +{
90004 +       ON_DEBUG(check_lock_node_data(node)); /* extra consistency checks, only under ON_DEBUG */
90005 +       ON_DEBUG(check_lock_data());
90006 +       return reiser4_get_neighbor(result, node, lock_mode, flags & (~GN_GO_LEFT));
90007 +}
90008 +
90009 +extern void invalidate_lock(lock_handle * _link);
90010 +
90011 +extern void sibling_list_remove(znode * node);
90012 +extern void sibling_list_drop(znode * node);
90013 +extern void sibling_list_insert_nolock(znode * new, znode * before);
90014 +extern void link_left_and_right(znode * left, znode * right);
90015 +
90016 +/* Functions called by tree_walk() when tree_walk() ...  */
90017 +struct tree_walk_actor {
90018 +       /* ... meets a formatted node (may be NULL), */
90019 +       int (*process_znode)(tap_t* , void*);
90020 +       /* ... meets an extent (may be NULL), */
90021 +       int (*process_extent)(tap_t*, void*);
90022 +       /* ... begins tree traversal or repeats it after a pass ended with
90023 +        * -E_DEADLOCK or -E_REPEAT (may be NULL). */
90024 +       int (*before)(void *);
90025 +};
90026 +extern int tree_walk(const reiser4_key *, int, struct tree_walk_actor *, void *); /* (start_key, go_left, actor, opaque) */
90027 +
90028 +#if REISER4_DEBUG_SIBLING_LIST
90029 +int check_sibling_list(znode * node); /* real check, used inside assert() by the sibling list functions */
90030 +#else
90031 +#define check_sibling_list(n) (1) /* check compiled out: always evaluates to true */
90032 +#endif
90033 +
90034 +#endif                         /* __FS_REISER4_TREE_WALK_H__ */
90035 +
90036 +/*
90037 +   Local variables:
90038 +   c-indentation-style: "K&R"
90039 +   mode-name: "LC"
90040 +   c-basic-offset: 8
90041 +   tab-width: 8
90042 +   fill-column: 120
90043 +   End:
90044 +*/
90045 diff -rupN linux-2.6.8-rc3/fs/reiser4/txnmgr.c linux-2.6.8-rc3-a/fs/reiser4/txnmgr.c
90046 --- linux-2.6.8-rc3/fs/reiser4/txnmgr.c 1970-01-01 03:00:00.000000000 +0300
90047 +++ linux-2.6.8-rc3-a/fs/reiser4/txnmgr.c       2004-08-05 21:20:52.896699863 +0400
90048 @@ -0,0 +1,4367 @@
90049 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
90050 + * reiser4/README */
90051 +
90052 +/* Joshua MacDonald wrote the first draft of this code. */
90053 +
90054 +/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
90055 +filesystem scales only as well as its worst locking design.  You need to
90056 +substantially restructure this code. Josh was not as experienced a programmer
90057 +as you.  Particularly review how the locking style differs from what you did
90058 +for znodes using hi-lo priority locking, and present to me an opinion on
90059 +whether the differences are well founded.  */
90060 +
90061 +/* I cannot help but to disagree with the sentiment above. Locking of
90062 + * transaction manager is _not_ badly designed, and, at the very least, is not
90063 + * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority
90064 + * locking on znodes, especially on the root node of the tree. --nikita,
90065 + * 2003.10.13 */
90066 +
90067 +/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles.  The
90068 +   txnmgr processes capture_block requests and manages the relationship between jnodes and
90069 +   atoms through the various stages of a transcrash, and it also oversees the fusion and
90070 +   capture-on-copy processes.  The main difficulty with this task is maintaining a
90071 +   deadlock-free lock ordering between atoms and jnodes/handles.  The reason for the
90072 +   difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle
90073 +   must be broken.  The main requirement is that atom-fusion be deadlock free, so once you
90074 +   hold the atom_lock you may then wait to acquire any jnode or handle lock.  This implies
90075 +   that any time you check the atom-pointer of a jnode or handle and then try to lock that
90076 +   atom, you must use trylock() and possibly reverse the order.
90077 +
90078 +   This code implements the design documented at:
90079 +
90080 +     http://namesys.com/txn-doc.html
90081 +
90082 +ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
90083 +above document and reference the new.  Be sure to provide some credit to Josh.  I already have some writings on this
90084 +topic in v4.html, but they are lacking in details present in the above.  Cure that.  Remember to write for the bright 12
90085 +year old --- define all technical terms used.
90086 +
90087 +*/
90088 +
90089 +/* Thoughts on the external transaction interface:
90090 +
90091 +   In the current code, a TRANSCRASH handle is created implicitly by init_context() (which
90092 +   creates state that lasts for the duration of a system call and is called at the start
90093 +   of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
90094 +   occupying the scope of a single system call.  We wish to give certain applications an
90095 +   interface to begin and close (commit) transactions.  Since our implementation of
90096 +   transactions does not yet support isolation, allowing an application to open a
90097 +   transaction implies trusting it to later close the transaction.  Part of the
90098 +   transaction interface will be aimed at enabling that trust, but the interface for
90099 +   actually using transactions is fairly narrow.
90100 +
90101 +   BEGIN_TRANSCRASH: Returns a transcrash identifier.  It should be possible to translate
90102 +   this identifier into a string that a shell-script could use, allowing you to start a
90103 +   transaction by issuing a command.  Once open, the transcrash should be set in the task
90104 +   structure, and there should be options (I suppose) to allow it to be carried across
90105 +   fork/exec.  A transcrash has several options:
90106 +
90107 +     - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
90108 +     on writes (WRITE_FUSING) and allow "dirty reads".  If the application wishes to
90109 +     capture on reads as well, it should set READ_FUSING.
90110 +
90111 +     - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
90112 +     eventually close (or else the machine must crash).  If the application dies an
90113 +     unexpected death with an open transcrash, for example, or if it hangs for a long
90114 +     duration, one solution (to avoid crashing the machine) is to simply close it anyway.
90115 +     This is a dangerous option, but it is one way to solve the problem until isolated
90116 +     transcrashes are available for untrusted applications.
90117 +
90118 +     It seems to be what databases do, though it is unclear how one avoids a DoS attack
90119 +     creating a vulnerability based on resource starvation.  Guaranteeing that some
90120 +     minimum amount of computational resources are made available would seem more correct
90121 +     than guaranteeing some amount of time.  When we again have someone to code the work,
90122 +     this issue should be considered carefully.  -Hans
90123 +
90124 +   RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
90125 +   many dirty blocks it expects.  The reserve_blocks interface should be called at a point
90126 +   where it is safe for the application to fail, because the system may not be able to
90127 +   grant the allocation and the application must be able to back-out.  For this reason,
90128 +   the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
90129 +   the application may also wish to extend the allocation after beginning its transcrash.
90130 +
90131 +   CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
90132 +   modifications that require transaction protection.  When isolated transactions are
90133 +   supported the CLOSE operation is replaced by either COMMIT or ABORT.  For example, if a
90134 +   RESERVE_BLOCKS call fails for the application, it should "abort" by calling
90135 +   CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
90136 +   why, for safety, the application should call RESERVE_BLOCKS before making any changes).
90137 +
90138 +   For actually implementing these out-of-system-call-scoped transcrashes, the
90139 +   reiser4_context has a "txn_handle *trans" pointer that may be set to an open
90140 +   transcrash.  Currently there are no dynamically-allocated transcrashes, but there is a
90141 +   "kmem_cache_t *_txnh_slab" created for that purpose in this file.
90142 +*/
90143 +
90144 +/* Extending the other system call interfaces for future transaction features:
90145 +
90146 +   Specialized applications may benefit from passing flags to the ordinary system call
90147 +   interface such as read(), write(), or stat().  For example, the application specifies
90148 +   WRITE_FUSING by default but wishes to add that a certain read() command should be
90149 +   treated as READ_FUSING.  But which read?  Is it the directory-entry read, the stat-data
90150 +   read, or the file-data read?  These issues are straight-forward, but there are a lot of
90151 +   them and adding the necessary flags-passing code will be tedious.
90152 +
90153 +   When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
90154 +   flag, which specifies that although it is a read operation being requested, a
90155 +   write-lock should be taken.  The reason is that read-locks are shared while write-locks
90156 +   are exclusive, so taking a read-lock when a later-write is known in advance will often
90157 +   lead to deadlock.  If a reader knows it will write later, it should issue read
90158 +   requests with the RMW flag set.
90159 +*/
90160 +
90161 +/*
90162 +   The znode/atom deadlock avoidance.
90163 +
90164 +   FIXME(Zam): writing of this comment is in progress.
90165 +
90166 +   The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of long-term
90167 +   locking of atoms, which makes the reiser4 locking scheme more complex.  It
90168 +   had deadlocks until we implemented deadlock avoidance algorithms.  Those
90169 +   deadlocks looked like this: one stopped thread waits for a long-term lock
90170 +   on a znode, while the thread that owns that lock waits until fusion with
90171 +   another atom is allowed.
90172 +
90173 +   The source of the deadlocks is an optimization of not capturing index nodes
90174 +   for read.  Let's prove it.  Suppose we have dumb node capturing scheme which
90175 +   unconditionally captures each block before locking it.
90176 +
90177 +   That scheme has no deadlocks.  Let's begin with a thread whose stage is
90178 +   ASTAGE_CAPTURE_WAIT and which waits for a znode lock.  The thread can't wait
90179 +   for a capture because its stage allows fusion with any atom except those
90180 +   currently being committed.  The process of atom commit can't deadlock because
90181 +   the atom commit procedure does not acquire locks and does not fuse with other
90182 +   atoms.  Reiser4 does capturing right before going to sleep inside the
90183 +   longterm_lock_znode() function, which means the znode we want to lock is
90184 +   already captured and its atom is in the ASTAGE_CAPTURE_WAIT stage.  If we
90185 +   continue the analysis we see that no process in the sequence can be waiting
90186 +   for atom fusion.  Hence there are no deadlocks of the described kind.
90187 +
90188 +   The capturing optimization makes the deadlocks possible.  A thread can wait
90189 +   for a lock whose owner did not capture that node.  The lock owner's current
90190 +   atom is not fused with the first atom and does not get the
90191 +   ASTAGE_CAPTURE_WAIT state.  A deadlock is possible when that atom meets
90192 +   another one which is already in ASTAGE_CAPTURE_WAIT.
90193 +
90194 +   The deadlock avoidance scheme includes two algorithms:
90195 +
90196 +   First algorithm is used when a thread captures a node which is locked but not
90197 +   captured by another thread.  Those nodes are marked MISSED_IN_CAPTURE at the
90198 +   moment we skip their capturing.  If such a node (marked MISSED_IN_CAPTURE) is
90199 +   being captured by a thread whose current atom is in ASTAGE_CAPTURE_WAIT, the
90200 +   routine which forces all lock owners to join with current atom is executed.
90201 +
90202 +   Second algorithm does not allow skipping the capture of already captured nodes.
90203 +
90204 +   Both algorithms together prevent waiting for a longterm lock without atom fusion
90205 +   with atoms of all lock owners; such waiting is the key ingredient of atom/znode
90206 +   locking deadlocks.
90207 +*/
90208 +
90209 +/*
90210 + * Transactions and mmap(2).
90211 + *
90212 + *     1. Transactions are not supported for accesses through mmap(2), because
90213 + *     this would effectively amount to user-level transactions whose duration
90214 + *     is beyond control of the kernel.
90215 + *
90216 + *     2. That said, we still want to preserve some decency with regard to
90217 + *     mmap(2). During normal write(2) call, following sequence of events
90218 + *     happens:
90219 + *
90220 + *         1. page is created;
90221 + *
90222 + *         2. jnode is created, dirtied and captured into current atom.
90223 + *
90224 + *         3. extent is inserted and modified.
90225 + *
90226 + *     Steps (2) and (3) take place under long term lock on the twig node.
90227 + *
90228 + *     When file is accessed through mmap(2) page is always created during
90229 + *     page fault. After this (in reiser4_readpage()->readpage_extent()):
90230 + *
90231 + *         1. if access is made to non-hole page new jnode is created, (if
90232 + *         necessary)
90233 + *
90234 + *         2. if access is made to the hole page, jnode is not created (XXX
90235 + *         not clear why).
90236 + *
90237 + *     Also, even if page is created by write page fault it is not marked
90238 + *     dirty immediately by handle_mm_fault(). Probably this is to avoid races
90239 + *     with page write-out.
90240 + *
90241 + *     Dirty bit installed by hardware is only transferred to the struct page
90242 + *     later, when page is unmapped (in zap_pte_range(), or
90243 + *     try_to_unmap_one()).
90244 + *
90245 + *     So, with mmap(2) we have to handle following irksome situations:
90246 + *
90247 + *         1. there exists modified page (clean or dirty) without jnode
90248 + *
90249 + *         2. there exists modified page (clean or dirty) with clean jnode
90250 + *
90251 + *         3. clean page which is a part of atom can be transparently modified
90252 + *         at any moment through mapping without becoming dirty.
90253 + *
90254 + *     (1) and (2) can lead to the out-of-memory situation: ->writepage()
90255 + *     doesn't know what to do with such pages and ->sync_sb()/->writepages()
90256 + *     don't see them, because these methods operate on atoms.
90257 + *
90258 + *     (3) can lead to the loss of data: suppose we have dirty page with dirty
90259 + *     jnode captured by some atom. As part of early flush (for
90260 + *     example) page was written out. Dirty bit was cleared on both page and
90261 + *     jnode. After this page is modified through mapping, but kernel doesn't
90262 + *     notice and just discards page and jnode as part of commit. (XXX
90263 + *     actually it doesn't, because to reclaim page ->releasepage() has to be
90264 + *     called and before this dirty bit will be transferred to the struct
90265 + *     page).
90266 + *
90267 + */
90268 +
90269 +#include "debug.h"
90270 +#include "type_safe_list.h"
90271 +#include "txnmgr.h"
90272 +#include "jnode.h"
90273 +#include "znode.h"
90274 +#include "block_alloc.h"
90275 +#include "tree.h"
90276 +#include "wander.h"
90277 +#include "ktxnmgrd.h"
90278 +#include "super.h"
90279 +#include "page_cache.h"
90280 +#include "reiser4.h"
90281 +#include "vfs_ops.h"
90282 +#include "inode.h"
90283 +#include "prof.h"
90284 +#include "flush.h"
90285 +
90286 +#include <asm/atomic.h>
90287 +#include <linux/types.h>
90288 +#include <linux/fs.h>
90289 +#include <linux/mm.h>
90290 +#include <linux/slab.h>
90291 +#include <linux/pagemap.h>
90292 +#include <linux/writeback.h>
90293 +#include <linux/swap.h>        /* for nr_free_pagecache_pages() */
90294 +
90295 +static void atom_free(txn_atom * atom);
90296 +
90297 +static long commit_txnh(txn_handle * txnh);
90298 +
90299 +static void wakeup_atom_waitfor_list(txn_atom * atom);
90300 +static void wakeup_atom_waiting_list(txn_atom * atom);
90301 +
90302 +static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
90303 +
90304 +static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
90305 +
90306 +static int capture_assign_block(txn_handle * txnh, jnode * node);
90307 +
90308 +static int capture_assign_txnh(jnode * node, txn_handle * txnh, txn_capture mode, int can_coc);
90309 +
90310 +static int fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
90311 +
90312 +static int capture_init_fusion(jnode * node, txn_handle * txnh, txn_capture mode, int can_coc);
90313 +
90314 +static int capture_fuse_wait(jnode * node, txn_handle * txnh, txn_atom * atomf, txn_atom * atomh, txn_capture mode);
90315 +
90316 +static void capture_fuse_into(txn_atom * small, txn_atom * large);
90317 +
90318 +static int capture_copy(jnode * node, txn_handle * txnh, txn_atom * atomf, txn_atom * atomh, txn_capture mode, int can_coc);
90319 +
90320 +void invalidate_list(capture_list_head *);
90321 +
90322 +/* GENERIC STRUCTURES */
90323 +
90324 +typedef struct _txn_wait_links txn_wait_links;
90325 +/* One waiter record; it can be linked into the "fwaitfor" and "fwaiting" type-safe lists that each atom keeps. */
90326 +struct _txn_wait_links {
90327 +       lock_stack *_lock_stack;        /* NOTE(review): presumably the lock stack of the waiting thread - confirm */
90328 +       fwaitfor_list_link _fwaitfor_link;      /* linkage used by the "fwaitfor" list */
90329 +       fwaiting_list_link _fwaiting_link;      /* linkage used by the "fwaiting" list */
90330 +       int (*waitfor_cb)(txn_atom *atom, struct _txn_wait_links *wlinks);      /* optional callback, receives the atom and this record */
90331 +       int (*waiting_cb)(txn_atom *atom, struct _txn_wait_links *wlinks);      /* optional callback, receives the atom and this record */
90332 +};
90333 +
90334 +TYPE_SAFE_LIST_DEFINE(txnh, txn_handle, txnh_link);
90335 +
90336 +TYPE_SAFE_LIST_DEFINE(fwaitfor, txn_wait_links, _fwaitfor_link);
90337 +TYPE_SAFE_LIST_DEFINE(fwaiting, txn_wait_links, _fwaiting_link);
90338 +
90339 +/* FIXME: In theory, we should be using the slab cache init & destructor
90340 +   methods instead of, e.g., jnode_init, etc. */
90341 +static kmem_cache_t *_atom_slab = NULL;
90342 +/* this is for user-visible, cross system-call transactions. */
90343 +static kmem_cache_t *_txnh_slab = NULL;
90344 +
90345 +ON_DEBUG(extern atomic_t flush_cnt;)
90346 +
90347 +/* TXN_INIT */
90348 +/* Create the slab caches for atoms and transaction handles.  Returns 0 on success, or -ENOMEM (through RETERR) with both cache pointers left NULL. */
90349 +reiser4_internal int
90350 +txnmgr_init_static(void)
90351 +{
90352 +       assert("jmacd-600", _atom_slab == NULL);
90353 +       assert("jmacd-601", _txnh_slab == NULL);
90354 +
90355 +       ON_DEBUG(atomic_set(&flush_cnt, 0));
90356 +
90357 +       _atom_slab = kmem_cache_create("txn_atom", sizeof (txn_atom), 0,
90358 +                                      SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
90359 +                                      NULL, NULL);
90360 +
90361 +       if (_atom_slab == NULL) {
90362 +               goto error;
90363 +       }
90364 +
90365 +       _txnh_slab = kmem_cache_create("txn_handle", sizeof (txn_handle), 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
90366 +
90367 +       if (_txnh_slab == NULL) {
90368 +               goto error;
90369 +       }
90370 +
90371 +       return 0;
90372 +
90373 +error:
90374 +       /* Undo the partial setup and clear the pointer, so that a later txnmgr_done_static() cannot destroy the cache twice. */
90375 +       if (_atom_slab != NULL) {
90376 +               kmem_cache_destroy(_atom_slab);
90377 +               _atom_slab = NULL;
90378 +       }
90379 +       /* _txnh_slab is NULL on every path that reaches here: creating it is the last step that can fail. */
90380 +
90381 +       return RETERR(-ENOMEM);
90382 +}
90383 +
90384 +/* Destroy the slab caches created by txnmgr_init_static().  Returns the first non-zero kmem_cache_destroy() result, or 0. */
90385 +reiser4_internal int
90386 +txnmgr_done_static(void)
90387 +{
90388 +       int ret1 = 0;   /* result for the atom cache */
90389 +       int ret2 = 0;   /* result for the transaction handle cache */
90390 +
90391 +       /* Each pointer is cleared after destruction so that the function is safe to call again. */
90392 +       if (_atom_slab != NULL) {
90393 +               ret1 = kmem_cache_destroy(_atom_slab);
90394 +               _atom_slab = NULL;
90395 +       }
90396 +
90397 +       if (_txnh_slab != NULL) {
90398 +               ret2 = kmem_cache_destroy(_txnh_slab);
90399 +               _txnh_slab = NULL;
90400 +       }
90401 +       /* GNU "?:" shorthand: ret1 if it is non-zero, otherwise ret2. */
90402 +       return ret1 ? : ret2;
90403 +}
90404 +
90405 +/* Initialize a new transaction manager @mgr (must not be NULL).  Called when the super_block is initialized. */
90406 +reiser4_internal void
90407 +txnmgr_init(txn_mgr * mgr)
90408 +{
90409 +       assert("umka-169", mgr != NULL);
90410 +
90411 +       mgr->atom_count = 0;    /* no atoms exist yet */
90412 +       mgr->id_count = 1;      /* atom_begin_andlock() hands out atom ids from this counter */
90413 +
90414 +       atom_list_init(&mgr->atoms_list);
90415 +       spin_txnmgr_init(mgr);
90416 +
90417 +       sema_init(&mgr->commit_semaphore, 1);   /* semaphore starts with count 1 */
90418 +}
90419 +
90420 +/* Tear down a transaction manager.  Currently this only checks @mgr and returns 0; nothing is released here. */
90421 +reiser4_internal int
90422 +txnmgr_done(txn_mgr * mgr UNUSED_ARG)
90423 +{
90424 +       assert("umka-170", mgr != NULL);
90425 +
90426 +       return 0;
90427 +}
90428 +
90429 +/* Initialize transaction handle @txnh with the given @mode: no atom attached, no flags, fresh spinlock, list link cleaned. */
90430 +/* Audited by: umka (2002.06.13) */
90431 +static void
90432 +txnh_init(txn_handle * txnh, txn_mode mode)
90433 +{
90434 +       assert("umka-171", txnh != NULL);
90435 +
90436 +       txnh->mode = mode;
90437 +       txnh->atom = NULL;      /* the handle starts out not belonging to any atom */
90438 +       txnh->flags = 0;
90439 +
90440 +       spin_txnh_init(txnh);
90441 +
90442 +       txnh_list_clean(txnh);
90443 +}
90444 +
90445 +#if REISER4_DEBUG
90446 +/* Debugging check: true when @txnh has no atom attached and its spinlock is not held. */
90447 +static int
90448 +txnh_isclean(txn_handle * txnh)
90449 +{
90450 +       assert("umka-172", txnh != NULL);
90451 +       return txnh->atom == NULL && spin_txnh_is_not_locked(txnh);
90452 +}
90453 +#endif
90454 +
90455 +/* Initialize an atom: zero the whole structure, then set up its stage, start time, lists, spinlock and block sets. */
90456 +static void
90457 +atom_init(txn_atom * atom)
90458 +{
90459 +       int level;
90460 +
90461 +       assert("umka-173", atom != NULL);
90462 +       /* The wipe must come first: everything below is written into the freshly zeroed structure. */
90463 +       xmemset(atom, 0, sizeof (txn_atom));
90464 +
90465 +       atom->stage = ASTAGE_FREE;
90466 +       atom->start_time = jiffies;     /* atom_is_dotard() measures the atom's age from this value */
90467 +       /* One dirty list per level index 0 .. REAL_MAX_ZTREE_HEIGHT inclusive. */
90468 +       for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1)
90469 +               capture_list_init(ATOM_DIRTY_LIST(atom, level));
90470 +
90471 +       capture_list_init(ATOM_CLEAN_LIST(atom));
90472 +       capture_list_init(ATOM_OVRWR_LIST(atom));
90473 +       capture_list_init(ATOM_WB_LIST(atom));
90474 +       capture_list_init(&atom->inodes);
90475 +       spin_atom_init(atom);
90476 +       txnh_list_init(&atom->txnh_list);
90477 +       atom_list_clean(atom);
90478 +       fwaitfor_list_init(&atom->fwaitfor_list);
90479 +       fwaiting_list_init(&atom->fwaiting_list);
90480 +       prot_list_init(&atom->protected);
90481 +       blocknr_set_init(&atom->delete_set);
90482 +       blocknr_set_init(&atom->wandered_map);
90483 +
90484 +       init_atom_fq_parts(atom);
90485 +}
90486 +
90487 +#if REISER4_DEBUG
90488 +/* Debugging check: returns 1 when @atom is in pristine ASTAGE_FREE state with nothing attached to it, 0 otherwise. */
90489 +static int
90490 +atom_isclean(txn_atom * atom)
90491 +{
90492 +       int lvl;
90493 +
90494 +       assert("umka-174", atom != NULL);
90495 +
90496 +       /* a node left on any per-level dirty list makes the atom unclean */
90497 +       for (lvl = 0; lvl <= REAL_MAX_ZTREE_HEIGHT; ++lvl)
90498 +               if (!capture_list_empty(ATOM_DIRTY_LIST(atom, lvl)))
90499 +                       return 0;
90500 +
90501 +       /* scalar state: stage, handle and capture counters, reference count */
90502 +       if (atom->stage != ASTAGE_FREE || atom->txnh_count != 0 ||
90503 +           atom->capture_count != 0 || atomic_read(&atom->refcount) != 0)
90504 +               return 0;
90505 +       /* linkage into the manager's atom list, and the list of handles */
90506 +       if (!atom_list_is_clean(atom) || !txnh_list_empty(&atom->txnh_list))
90507 +               return 0;
90508 +       /* the clean, overwrite and writeback capture lists */
90509 +       if (!capture_list_empty(ATOM_CLEAN_LIST(atom)) ||
90510 +           !capture_list_empty(ATOM_OVRWR_LIST(atom)) ||
90511 +           !capture_list_empty(ATOM_WB_LIST(atom)))
90512 +               return 0;
90513 +       /* waiter lists, protected list and flush queue parts */
90514 +       return fwaitfor_list_empty(&atom->fwaitfor_list) && fwaiting_list_empty(&atom->fwaiting_list) &&
90515 +               prot_list_empty(&atom->protected) && atom_fq_parts_are_clean(atom);
90516 +}
90517 +#endif
90518 +
90519 +/* Begin a transaction in this context.  Currently this uses the reiser4_context's
90520 +   trans_in_ctx, so the handle lives inside the context itself.  Eventually
90521 +   this will be extended to allow transaction handles to span several contexts. */
90522 +/* Audited by: umka (2002.06.13) */
90523 +reiser4_internal void
90524 +txn_begin(reiser4_context * context)
90525 +{
90526 +       assert("jmacd-544", context->trans == NULL);
90527 +
90528 +       context->trans = &context->trans_in_ctx;
90529 +
90530 +       /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
90531 +          transcrash.  Default should be TXN_WRITE_FUSING.  Also, the handle is
90532 +          embedded in the context right now, but we would like to allow for dynamically
90533 +          allocated transcrashes that span multiple system calls.
90534 +       */
90535 +       txnh_init(context->trans, TXN_WRITE_FUSING);
90536 +}
90537 +
90538 +/* Finish the transaction handle of @context.  Returns commit_txnh()'s result when an atom had to be committed, otherwise 0. */
90539 +reiser4_internal long
90540 +txn_end(reiser4_context * context)
90541 +{
90542 +       long ret = 0;
90543 +       txn_handle *txnh;
90544 +
90545 +       assert("umka-283", context != NULL);
90546 +       assert("nikita-3012", schedulable());
90547 +
90548 +       /* closing non top-level context---nothing to do; context->trans is left untouched in that case */
90549 +       if (context != context->parent)
90550 +               return 0;
90551 +
90552 +       assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
90553 +
90554 +       txnh = context->trans;
90555 +
90556 +       if (txnh != NULL) {
90557 +               /* The txnh's field "atom" can be checked for NULL w/o holding a
90558 +                  lock because txnh->atom could be set by this thread's call to
90559 +                  try_capture or the deadlock prevention code in
90560 +                  fuse_not_fused_lock_owners().  But that code may assign an
90561 +                  atom to this transaction handle only if there are locked and
90562 +                  not yet fused nodes.  It cannot happen because lock stack
90563 +                  should be clean at this moment. */
90564 +               if (txnh->atom != NULL)
90565 +                       ret = commit_txnh(txnh);
90566 +
90567 +               assert("jmacd-633", txnh_isclean(txnh));
90568 +               /* detach the handle so that txn_begin() may be called on this context again */
90569 +               context->trans = NULL;
90570 +       }
90571 +
90572 +       return ret;
90573 +}
90574 +
90575 +reiser4_internal void
90576 +txn_restart(reiser4_context * context)
90577 +{
90578 +       txn_end(context);       /* return value is ignored; NOTE(review): for a nested context txn_end() returns before clearing context->trans, which txn_begin() asserts to be NULL */
90579 +       preempt_point();        /* NOTE(review): presumably a voluntary reschedule point - confirm */
90580 +       txn_begin(context);     /* start a fresh handle in the same context */
90581 +}
90582 +
90583 +reiser4_internal void
90584 +txn_restart_current(void)
90585 +{
90586 +       txn_restart(get_current_context());     /* same as txn_restart(), applied to the context returned by get_current_context() */
90587 +}
90588 +
90589 +/* TXN_ATOM */
90590 +
90591 +/* Get the atom belonging to a txnh, which is not locked on entry.  Always returns with txnh locked;
90592 +   the atom is returned locked too when it is not NULL.  Uses spin_trylock_atom() and, on failure,
90593 +   re-takes the locks in atom-then-txnh order to break the lock-ordering cycle.  May return NULL. */
90594 +reiser4_internal txn_atom *
90595 +txnh_get_atom(txn_handle * txnh)
90596 +{
90597 +       txn_atom *atom;
90598 +
90599 +       assert("umka-180", txnh != NULL);
90600 +       assert("jmacd-5108", spin_txnh_is_not_locked(txnh));
90601 +
90602 +       while (1) {
90603 +               LOCK_TXNH(txnh);
90604 +               atom = txnh->atom;
90605 +               /* no atom: return NULL with only txnh locked */
90606 +               if (atom == NULL)
90607 +                       break;
90608 +               /* fast path: atom lock obtained without blocking while txnh is held */
90609 +               if (spin_trylock_atom(atom))
90610 +                       break;
90611 +               /* slow path: take a reference so the atom stays valid while txnh is unlocked */
90612 +               atomic_inc(&atom->refcount);
90613 +               /* re-acquire in the order atom first, txnh second */
90614 +               UNLOCK_TXNH(txnh);
90615 +               LOCK_ATOM(atom);
90616 +               LOCK_TXNH(txnh);
90617 +               /* the handle still points to the same atom: drop the temporary reference and finish */
90618 +               if (txnh->atom == atom) {
90619 +                       atomic_dec(&atom->refcount);
90620 +                       break;
90621 +               }
90622 +               /* txnh->atom changed meanwhile: release everything (this may free the old atom) and retry */
90623 +               UNLOCK_TXNH(txnh);
90624 +               atom_dec_and_unlock(atom);
90625 +       }
90626 +
90627 +       return atom;
90628 +}
90629 +
90630 +/* Get the atom of the current context's transaction handle and return it spinlocked; the handle itself is unlocked again. May return NULL  */
90631 +reiser4_internal txn_atom *
90632 +get_current_atom_locked_nocheck(void)
90633 +{
90634 +       reiser4_context *cx;
90635 +       txn_atom *atom;
90636 +       txn_handle *txnh;
90637 +
90638 +       cx = get_current_context();
90639 +       assert("zam-437", cx != NULL);
90640 +
90641 +       txnh = cx->trans;
90642 +       assert("zam-435", txnh != NULL);
90643 +       /* txnh_get_atom() returns with txnh locked, and with the atom locked if there is one */
90644 +       atom = txnh_get_atom(txnh);
90645 +       /* only the atom lock (if any) is handed to the caller */
90646 +       UNLOCK_TXNH(txnh);
90647 +       return atom;
90648 +}
90649 +
90650 +/* Get the atom belonging to a jnode, which is locked on entry.  Returns with
90651 +   the jnode locked and, if the atom is not NULL, the atom locked as well.  Uses
90652 +   spin_trylock_atom() and, on failure, re-takes the locks in atom-then-jnode
90653 +   order to break the lock-ordering cycle.  Returns NULL if atom is not set. */
90654 +reiser4_internal txn_atom *
90655 +jnode_get_atom(jnode * node)
90656 +{
90657 +       txn_atom *atom;
90658 +
90659 +       assert("umka-181", node != NULL);
90660 +
90661 +       while (1) {
90662 +               assert("jmacd-5108", spin_jnode_is_locked(node));
90663 +
90664 +               atom = node->atom;
90665 +               /* node is not in any atom */
90666 +               if (atom == NULL)
90667 +                       break;
90668 +
90669 +               /* If atom is not locked, grab the lock and return */
90670 +               if (spin_trylock_atom(atom))
90671 +                       break;
90672 +
90673 +               /* At least one jnode belongs to this atom, which guarantees that
90674 +                * atom->refcount > 0, so we can safely increment refcount. */
90675 +               atomic_inc(&atom->refcount);
90676 +               UNLOCK_JNODE(node);
90677 +
90678 +               /* re-acquire spin locks in the right order */
90679 +               LOCK_ATOM(atom);
90680 +               LOCK_JNODE(node);
90681 +
90682 +               /* check if node still points to the same atom. */
90683 +               if (node->atom == atom) {
90684 +                       atomic_dec(&atom->refcount);
90685 +                       break;
90686 +               }
90687 +
90688 +               /* releasing of atom lock and reference requires not holding
90689 +                * locks on jnodes.  */
90690 +               UNLOCK_JNODE(node);
90691 +
90692 +               /* We are not sure that this atom has references other than
90693 +                * ours, so we must call the function that frees the atom when
90694 +                * the last reference is released. */
90695 +               atom_dec_and_unlock(atom);
90696 +
90697 +               /* lock jnode again for getting valid node->atom pointer
90698 +                * value. */
90699 +               LOCK_JNODE(node);
90700 +       }
90701 +
90702 +       return atom;
90703 +}
90704 +
90705 +/* Returns true if @check is dirty, belongs to the same atom as @node and, when it is a znode, is connected;
90706 +   with @alloc_check set, jnode_is_flushprepped(@check) must also equal @alloc_value.  Used by flush code
90707 +   to indicate whether the next node (in some direction) is suitable for flushing. */
90708 +reiser4_internal int
90709 +same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value)
90710 +{
90711 +       int compat;
90712 +       txn_atom *atom;
90713 +
90714 +       assert("umka-182", node != NULL);
90715 +       assert("umka-183", check != NULL);
90716 +
90717 +       /* Not sure what this function is supposed to do if supplied with @check that is
90718 +          neither formatted nor unformatted (bitmap or so). */
90719 +       assert("nikita-2373", jnode_is_znode(check) || jnode_is_unformatted(check));
90720 +
90721 +       /* Need a lock on CHECK to get its atom and to check various state bits.
90722 +          Don't need a lock on NODE once we get the atom lock. */
90723 +       /* It is not enough to lock two nodes and check (node->atom ==
90724 +          check->atom) because atom could be locked and being fused at that
90725 +          moment, jnodes of the atom of that state (being fused) can point to
90726 +          different objects, but the atom is the same.*/
90727 +       LOCK_JNODE(check);
90728 +
90729 +       atom = jnode_get_atom(check);
90730 +       /* @check is not captured by any atom: it cannot be in the same slum */
90731 +       if (atom == NULL) {
90732 +               compat = 0;
90733 +       } else {
90734 +               compat = (node->atom == atom && jnode_is_dirty(check));
90735 +               /* NOTE(review): the "&=" steps below rely on their right-hand sides evaluating to 0 or 1 */
90736 +               if (compat && jnode_is_znode(check)) {
90737 +                       compat &= znode_is_connected(JZNODE(check));
90738 +               }
90739 +
90740 +               if (compat && alloc_check) {
90741 +                       compat &= (alloc_value == jnode_is_flushprepped(check));
90742 +               }
90743 +
90744 +               UNLOCK_ATOM(atom);
90745 +       }
90746 +
90747 +       UNLOCK_JNODE(check);
90748 +
90749 +       return compat;
90750 +}
90751 +
90752 +/* Called with @atom locked: drop one reference and release the atom lock; if the count falls to zero, free the atom under the txnmgr lock. */
90753 +reiser4_internal void
90754 +atom_dec_and_unlock(txn_atom * atom)
90755 +{
90756 +       txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
90757 +
90758 +       assert("umka-186", atom != NULL);
90759 +       assert("jmacd-1071", spin_atom_is_locked(atom));
90760 +       assert("zam-1039", atomic_read(&atom->refcount) > 0);
90761 +
90762 +       if (atomic_dec_and_test(&atom->refcount)) {
90763 +               /* take txnmgr lock and atom lock in proper order. */
90764 +               if (!spin_trylock_txnmgr(mgr)) {
90765 +                       /* This atom should exist after we re-acquire its
90766 +                        * spinlock, so we increment its reference counter. */
90767 +                       atomic_inc(&atom->refcount);
90768 +                       UNLOCK_ATOM(atom);
90769 +                       spin_lock_txnmgr(mgr);
90770 +                       LOCK_ATOM(atom);
90771 +                       /* the atom gained a reference while its lock was dropped: keep it alive, release both locks */
90772 +                       if (!atomic_dec_and_test(&atom->refcount)) {
90773 +                               UNLOCK_ATOM(atom);
90774 +                               spin_unlock_txnmgr(mgr);
90775 +                               return;
90776 +                       }
90777 +               }
90778 +               assert("nikita-2656", spin_txnmgr_is_locked(mgr));
90779 +               atom_free(atom);        /* atom_free() releases the atom lock itself */
90780 +               spin_unlock_txnmgr(mgr);
90781 +       } else
90782 +               UNLOCK_ATOM(atom);
90783 +}
90784 +
90785 +/* Return a new atom, locked.  This adds the atom to the transaction manager's list and
90786 +   sets its reference count to 1, an artificial reference which is kept until it
90787 +   commits.  We play strange games to avoid allocation under jnode & txnh spinlocks.*/
90788 +
90789 +/* ZAM-FIXME-HANS: should we set node->atom and txnh->atom here also? */
90790 +/* ANSWER(ZAM): there are special functions, capture_assign_txnh_nolock() and
90791 +   capture_assign_block_nolock(), they are called right after calling
90792 +   atom_begin_andlock().  It could be done here, but, for understandability, it
90793 +   is better to keep those calls inside try_capture_block main routine where all
90794 +   assignments are made. */
90795 +static txn_atom *
90796 +atom_begin_andlock(txn_atom ** atom_alloc, jnode * node, txn_handle * txnh)
90797 +{
90798 +       txn_atom *atom;
90799 +       txn_mgr *mgr;
90800 +
90801 +       assert("jmacd-43228", spin_jnode_is_locked(node));
90802 +       assert("jmacd-43227", spin_txnh_is_locked(txnh));
90803 +       assert("jmacd-43226", node->atom == NULL);
90804 +       assert("jmacd-43225", txnh->atom == NULL);
90805 +
90806 +       if (REISER4_DEBUG && rofs_jnode(node)) {
90807 +               warning("nikita-3366", "Creating atom on rofs");
90808 +               dump_stack();
90809 +       }
90810 +
90811 +       /* A memory allocation may schedule, so we have to release those
90812 +        * spinlocks before the kmem_cache_alloc() call. */
90813 +       UNLOCK_JNODE(node);
90814 +       UNLOCK_TXNH(txnh);
90815 +       /* reuse an atom left in *atom_alloc by an earlier attempt, otherwise allocate one */
90816 +       if (*atom_alloc == NULL) {
90817 +               (*atom_alloc) = kmem_cache_alloc(_atom_slab, GFP_KERNEL);
90818 +               /* on failure, return with @node and @txnh left unlocked */
90819 +               if (*atom_alloc == NULL)
90820 +                       return ERR_PTR(RETERR(-ENOMEM));
90821 +       }
90822 +
90823 +       /* and, also, txnmgr spin lock should be taken before jnode and txnh
90824 +          locks. */
90825 +       mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
90826 +       spin_lock_txnmgr(mgr);
90827 +
90828 +       LOCK_JNODE(node);
90829 +       LOCK_TXNH(txnh);
90830 +
90831 +       /* Check if both atom pointers are still NULL... */
90832 +       if (node->atom != NULL || txnh->atom != NULL) {
90833 +               ON_TRACE(TRACE_TXN, "alloc atom race\n");
90834 +               /* NOTE-NIKITA probably it is rather better to free
90835 +                * atom_alloc here than thread it up to try_capture(). */
90836 +               /* all three locks are dropped; the allocation stays in *atom_alloc */
90837 +               UNLOCK_TXNH(txnh);
90838 +               UNLOCK_JNODE(node);
90839 +               spin_unlock_txnmgr(mgr);
90840 +
90841 +               reiser4_stat_inc(txnmgr.restart.atom_begin);
90842 +               return ERR_PTR(-E_REPEAT);
90843 +       }
90844 +       /* consume the preallocated atom: *atom_alloc is reset to NULL */
90845 +       atom = *atom_alloc;
90846 +       *atom_alloc = NULL;
90847 +
90848 +       atom_init(atom);
90849 +
90850 +       assert("jmacd-17", atom_isclean(atom));
90851 +
90852 +       /* Take the atom lock (the txnmgr lock is already held). No checks for
90853 +          lock ordering, because @atom is new and inaccessible for others. */
90854 +       spin_lock_atom_no_ord(atom, 0, 0);
90855 +
90856 +       atom_list_push_back(&mgr->atoms_list, atom);
90857 +       atom->atom_id = mgr->id_count++;
90858 +       mgr->atom_count += 1;
90859 +
90860 +       /* Release txnmgr lock */
90861 +       spin_unlock_txnmgr(mgr);
90862 +
90863 +       /* One reference until it commits. */
90864 +       atomic_inc(&atom->refcount);
90865 +
90866 +       atom->stage = ASTAGE_CAPTURE_FUSE;
90867 +
90868 +       ON_TRACE(TRACE_TXN, "begin atom %u\n", atom->atom_id);
90869 +       /* success: @atom, @node and @txnh are all still locked here */
90870 +       return atom;
90871 +}
90872 +
90873 +#if REISER4_DEBUG
90874 +/* Debugging helper: an atom counts as "open" while its stage lies strictly between 0 and ASTAGE_PRE_COMMIT. */
90875 +static int atom_isopen(const txn_atom * atom)
90876 +{
90877 +       assert("umka-185", atom != NULL);
90878 +       /* negated form of the range test: not (too low or already at/after pre-commit) */
90879 +       return !(atom->stage <= 0 || atom->stage >= ASTAGE_PRE_COMMIT);
90880 +}
90881 +#endif
90882 +
90883 +/* Estimate the work needed to fuse @atom into another one: the number of pointers to
90884 +   it that fusion would have to update, i.e. its transaction handles plus its captured
90885 +   nodes.  Fusion folds the atom with fewer pointers into the one with more. */
90886 +static int
90887 +atom_pointer_count(const txn_atom * atom)
90888 +{
90889 +       int nr_pointers;
90890 +
90891 +       assert("umka-187", atom != NULL);
90892 +       nr_pointers = atom->txnh_count + atom->capture_count;
90893 +       return nr_pointers;
90894 +}
90895 +
90896 +/* Called holding both the atom lock and the txnmgr lock: removes the atom from the transaction
90897 +   manager list, releases the atom lock and returns the atom to its slab.  The txnmgr lock stays held. */
90898 +static void
90899 +atom_free(txn_atom * atom)
90900 +{
90901 +       txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
90902 +
90903 +       assert("umka-188", atom != NULL);
90904 +
90905 +       ON_TRACE(TRACE_TXN, "free atom %u\n", atom->atom_id);
90906 +
90907 +       assert("jmacd-18", spin_atom_is_locked(atom));
90908 +
90909 +       /* Remove from the txn_mgr's atom list */
90910 +       assert("nikita-2657", spin_txnmgr_is_locked(mgr));
90911 +       mgr->atom_count -= 1;
90912 +       atom_list_remove_clean(atom);
90913 +
90914 +       /* Clean the atom: only atoms in stage ASTAGE_INVALID or ASTAGE_DONE may be freed */
90915 +       assert("jmacd-16", (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
90916 +       atom->stage = ASTAGE_FREE;
90917 +
90918 +       blocknr_set_destroy(&atom->delete_set);
90919 +       blocknr_set_destroy(&atom->wandered_map);
90920 +       /* NOTE(review): this assertion reuses the label "jmacd-16" of the stage check above */
90921 +       assert("jmacd-16", atom_isclean(atom));
90922 +
90923 +       UNLOCK_ATOM(atom);
90924 +
90925 +       kmem_cache_free(_atom_slab, atom);
90926 +}
90927 +/* True once jiffies has passed atom->start_time + tmgr.atom_max_age, i.e. the atom is older than the configured maximum age. */
90928 +static int
90929 +atom_is_dotard(const txn_atom * atom)
90930 +{
90931 +       return time_after(jiffies, atom->start_time +
90932 +                         get_current_super_private()->tmgr.atom_max_age);
90933 +}
90934 +/* Called with @atom locked.  True when exactly one of the atom's transaction handles is not counted in nr_waiters. */
90935 +static int atom_can_be_committed (txn_atom * atom)
90936 +{
90937 +       assert ("zam-884", spin_atom_is_locked(atom));
90938 +       assert ("zam-885", atom->txnh_count > atom->nr_waiters);
90939 +       return atom->txnh_count == atom->nr_waiters + 1;
90940 +}
90941 +
90942 +/* Return 1 if an atom should commit now, 0 otherwise.  Commit is due when the atom carries
90943 +   ATOM_FORCE_COMMIT, when its pointer count exceeds tmgr.atom_max_size, or when it is too old. */
90944 +static int
90945 +atom_should_commit(const txn_atom * atom)
90946 +{
90947 +       assert("umka-189", atom != NULL);
90948 +       if ((atom->flags & ATOM_FORCE_COMMIT) ||
90949 +           (unsigned) atom_pointer_count(atom) > get_current_super_private()->tmgr.atom_max_size)
90950 +               return 1;
90951 +       return atom_is_dotard(atom) != 0;
90952 +}
90953 +
90954 +/* return 1 if current atom exists and requires commit, 0 otherwise. */
90955 +reiser4_internal int current_atom_should_commit(void)
90956 +{
90957 +       int need_commit;
90958 +       txn_atom *cur = get_current_atom_locked_nocheck();
90959 +
90960 +       /* no current atom: nothing to commit, and no lock to release */
90961 +       if (cur == NULL)
90962 +               return 0;
90963 +       need_commit = atom_should_commit(cur);
90964 +       UNLOCK_ATOM(cur);
90965 +       return need_commit;
90966 +}
90967 +
90968 +static int
90969 +atom_should_commit_asap(const txn_atom * atom)
90970 +{
90971 +       unsigned int captured;
90972 +       unsigned int pinnedpages;
90973 +       /* Heuristic: true when the atom's captured nodes pin too much memory or it has flushed more than 100 times. */
90974 +       assert("nikita-3309", atom != NULL);
90975 +       /* NOTE(review): the shift happens before the multiply, so the estimate is 0 for fewer than 2^PAGE_CACHE_SHIFT captured nodes - confirm intended */
90976 +       captured = (unsigned) atom->capture_count;
90977 +       pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode);
90978 +       /* thresholds: one eighth of totalram_pages, or an atom->flushed value above 100 */
90979 +       return
90980 +               (pinnedpages > (totalram_pages >> 3)) ||
90981 +               (atom->flushed > 100);
90982 +}
90983 +
90984 +static jnode * find_first_dirty_in_list (capture_list_head * head, int flags)
90985 +{
90986 +       /* Return the first jnode on @head that may be flushed, or NULL if there is none. */
90987 +       jnode * candidate;
90988 +       /* with JNODE_FLUSH_COMMIT set no node is skipped, so the very first one is returned */
90989 +       const int take_any = (flags & JNODE_FLUSH_COMMIT) != 0;
90990 +
90991 +       for_all_type_safe_list(capture, head, candidate) {
90992 +               if (take_any)
90993 +                       return candidate;
90994 +               /* otherwise pass over jnodes which have "heard banshee" and those with active I/O */
90995 +               if (!JF_ISSET(candidate, JNODE_HEARD_BANSHEE) &&
90996 +                   !JF_ISSET(candidate, JNODE_WRITEBACK))
90997 +                       return candidate;
90998 +       }
90999 +       return NULL;
91000 +}
91001 +
91002 +/* Called with @atom locked.  Return the first suitable dirty node from the atom's per-level dirty
91003 +   lists, trying levels 1 and up first and list #0 last; return NULL if no list yields a node. */
91004 +reiser4_internal jnode * find_first_dirty_jnode (txn_atom * atom, int flags)
91005 +{
91006 +       jnode *found;
91007 +       tree_level lvl;
91008 +
91009 +       assert("zam-753", spin_atom_is_locked(atom));
91010 +
91011 +       /* The flush starts from LEAF_LEVEL (=1). */
91012 +       for (lvl = 1; lvl <= REAL_MAX_ZTREE_HEIGHT; ++lvl) {
91013 +               if (!capture_list_empty(ATOM_DIRTY_LIST(atom, lvl))) {
91014 +                       found = find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, lvl), flags);
91015 +                       if (found != NULL)
91016 +                               return found;
91017 +               }
91018 +       }
91019 +
91020 +       /* znode-above-root is on the list #0. */
91021 +       found = find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
91022 +       return found;
91023 +}
91024 +
91025 +#if REISER4_COPY_ON_CAPTURE
91026 +
91027 +/* this spin lock is used to prevent races during steal on capture; in this file it is
91028 +   taken by dispatch_wb_list() and invalidate_list(). FIXME: should be per filesystem or even per atom */
91029 +spinlock_t scan_lock = SPIN_LOCK_UNLOCKED;
91030 +
91031 +/* Scan atom->writeback_nodes list and dispatch jnodes according to their state:
91032 + * move dirty and !writeback jnodes to @fq, clean and !writeback jnodes to atom's
91033 + * clean list; jnodes with JNODE_WRITEBACK set stay where they are. Atom must be locked. */
91034 +/* NOTE: doing that in the end IO handler would require special spinlocks which
91035 + * disable interrupts in all places except the IO handler. That is expensive. */
91036 +static void dispatch_wb_list (txn_atom * atom, flush_queue_t * fq)
91037 +{
91038 +       jnode * cur;
91039 +       int total, moved; /* counters: only incremented, never read in this function */
91040 +
91041 +       assert("zam-905", spin_atom_is_locked(atom));
91042 +
91043 +       total = 0;
91044 +       moved = 0;
91045 +
91046 +       spin_lock(&scan_lock);
91047 +       cur = capture_list_front(ATOM_WB_LIST(atom));
91048 +       while (!capture_list_end(ATOM_WB_LIST(atom), cur)) {
91049 +               jnode * next;
91050 +
91051 +               total ++;
91052 +               JF_SET(cur, JNODE_SCANNED);
91053 +               next = capture_list_next(cur); /* successor is fetched while scan_lock is still held */
91054 +               if (!capture_list_end(ATOM_WB_LIST(atom), next))
91055 +                       JF_SET(next, JNODE_SCANNED);
91056 +               /* cur (and next, if any) now carry JNODE_SCANNED; scan_lock is dropped before taking the jnode lock */
91057 +               spin_unlock(&scan_lock);
91058 +
91059 +               LOCK_JNODE(cur);
91060 +               assert("vs-1441", NODE_LIST(cur) == WB_LIST);
91061 +               if (!JF_ISSET(cur, JNODE_WRITEBACK)) {
91062 +                       moved ++;
91063 +                       if (JF_ISSET(cur, JNODE_DIRTY)) {
91064 +                               queue_jnode(fq, cur);
91065 +                       } else {
91066 +                               /* move from writeback list to clean list */
91067 +                               capture_list_remove(cur);
91068 +                               capture_list_push_back(ATOM_CLEAN_LIST(atom), cur);
91069 +                               ON_DEBUG(count_jnode(atom, cur, WB_LIST, CLEAN_LIST, 1));
91070 +                       }
91071 +               }
91072 +               UNLOCK_JNODE(cur);
91073 +               /* re-take scan_lock before clearing the mark on cur and advancing to the saved successor */
91074 +               spin_lock(&scan_lock);
91075 +               JF_CLR(cur, JNODE_SCANNED);
91076 +               cur = next;
91077 +               assert("vs-1450", ergo(!capture_list_end(ATOM_WB_LIST(atom), cur),
91078 +                                      JF_ISSET(cur, JNODE_SCANNED) && NODE_LIST(cur) == WB_LIST));
91079 +       }
91080 +       spin_unlock(&scan_lock);
91081 +}
91082 +
91083 +#else
91084 +/* Walk atom's writeback list: queue dirty, !writeback jnodes on @fq; move clean, !writeback jnodes to atom's clean list */
91085 +static void dispatch_wb_list (txn_atom * atom, flush_queue_t * fq)
91086 +{
91087 +        jnode * cur;
91088 +
91089 +        assert("zam-905", atom_is_protected(atom));
91090 +
91091 +        cur = capture_list_front(ATOM_WB_LIST(atom));
91092 +        while (!capture_list_end(ATOM_WB_LIST(atom), cur)) {
91093 +                jnode * next = capture_list_next(cur); /* saved first: cur may be taken off this list below */
91094 +
91095 +                LOCK_JNODE(cur);
91096 +                if (!JF_ISSET(cur, JNODE_WRITEBACK)) { /* jnodes with JNODE_WRITEBACK set are left untouched */
91097 +                        if (JF_ISSET(cur, JNODE_DIRTY)) {
91098 +                                queue_jnode(fq, cur);
91099 +                        } else {
91100 +                                capture_list_remove(cur);
91101 +                                capture_list_push_back(ATOM_CLEAN_LIST(atom), cur);
91102 +                        }
91103 +                }
91104 +                UNLOCK_JNODE(cur);
91105 +
91106 +                cur = next;
91107 +        }
91108 +}
91109 +
91110 +#endif
91111 +
91112 +/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback
91113 + * jnodes to disk. Returns PTR_ERR() of the flush queue on failure to get one, else the result of write_fq(). */
91114 +static int submit_wb_list (void)
91115 +{
91116 +       int ret;
91117 +       flush_queue_t * fq;
91118 +
91119 +       fq = get_fq_for_current_atom(); /* the UNLOCK_ATOM() below suggests fq->atom comes back locked -- TODO confirm */
91120 +       if (IS_ERR(fq))
91121 +               return PTR_ERR(fq);
91122 +
91123 +       dispatch_wb_list(fq->atom, fq);
91124 +       UNLOCK_ATOM(fq->atom);
91125 +
91126 +       /* trace_mark(flush); */
91127 +       write_current_logf(WRITE_IO_LOG, "mark=flush");
91128 +
91129 +       ret = write_fq(fq, NULL, 1);
91130 +       fq_put(fq); /* done with the queue whether or not write_fq() succeeded */
91131 +
91132 +       return ret;
91133 +}
91134 +
91135 +#if 0
91136 +
91137 +/* when during system call inode is "captured" (by reiser4_mark_inode_dirty) - blocks grabbed for stat data update are
91138 +   moved to atom's flush_reserved bucket. On commit time (right before updating stat datas of all captured inodes) those
91139 +   blocks are moved to grabbed. This function is used to calculate number of blocks reserved for stat data update when
91140 +   those blocks get moved back and forward between buckets of grabbed and flush_reserved blocks */
91141 +static reiser4_block_nr reserved_for_sd_update(struct inode *inode)
91142 +{
91143 +       return inode_file_plugin(inode)->estimate.update(inode); /* delegates to the inode's file plugin */
91144 +}
91145 +/* Drain (*atom)->inodes: per entry, move its stat-data reservation to grabbed, put it on the clean list, call reiser4_update_sd() */
91146 +static void atom_update_stat_data(txn_atom **atom)
91147 +{
91148 +       jnode *j;
91149 +
91150 +       assert("vs-1241", spin_atom_is_locked(*atom));
91151 +       assert("vs-1616", capture_list_empty(&(*atom)->inodes)); /* NOTE(review): asserts that the list drained below is already empty -- confirm intent */
91152 +
91153 +       while (!capture_list_empty(&(*atom)->inodes)) {
91154 +               struct inode *inode;
91155 +
91156 +               j = capture_list_front(&((*atom)->inodes));
91157 +               /* the list links jnodes embedded in reiser4_inode as ->inode_jnode; recover the owning inode */
91158 +               inode = inode_by_reiser4_inode(container_of(j, reiser4_inode, inode_jnode));
91159 +
91160 +               /* move blocks grabbed for stat data update back from atom's
91161 +                * flush_reserved to grabbed */
91162 +               flush_reserved2grabbed(*atom, reserved_for_sd_update(inode));
91163 +
91164 +               capture_list_remove_clean(j);
91165 +               capture_list_push_back(ATOM_CLEAN_LIST(*atom), j);
91166 +               UNLOCK_ATOM(*atom); /* atom lock is not held across reiser4_update_sd() */
91167 +
91168 +               /* FIXME: it is not clear what to do if update sd fails. A warning will be issued (nikita-2221) */
91169 +               reiser4_update_sd(inode);
91170 +               *atom = get_current_atom_locked(); /* re-fetch and re-lock: caller's pointer is refreshed each pass */
91171 +       }
91172 +
91173 +       assert("vs-1231", capture_list_empty(&((*atom)->inodes)));
91174 +}
91175 +#endif
91176 +
91177 +/* Wait completion of all writes, re-submit atom writeback list if needed. Returns the first failing step's result, else that of the last wait. */
91178 +static int current_atom_complete_writes (void)
91179 +{
91180 +       int ret;
91181 +
91182 +       /* Each jnode from that list was modified and dirtied when it had i/o
91183 +        * request running already. After i/o completion we have to resubmit
91184 +        * them to disk again.*/
91185 +       ret = submit_wb_list();
91186 +       if (ret < 0) /* only negative results of submit_wb_list() are treated as failure */
91187 +               return ret;
91188 +
91189 +       /* Wait all i/o completion */
91190 +       ret = current_atom_finish_all_fq();
91191 +       if (ret) /* any non-zero result aborts here */
91192 +               return ret;
91193 +
91194 +       /* Scan wb list again; all i/o should be completed, we re-submit dirty
91195 +        * nodes to disk */
91196 +       ret = submit_wb_list();
91197 +       if(ret < 0)
91198 +               return ret;
91199 +
91200 +       /* Wait all nodes we just submitted */
91201 +       return current_atom_finish_all_fq();
91202 +}
91203 +
91204 +#define TOOMANYFLUSHES (1 << 13) /* flush iteration count above which commit_current_atom() may print its warning */
91205 +
91206 +/* Called with the atom locked and no open "active" transaction handles except
91207 +   ours, this function calls flush_current_atom() until all dirty nodes are
91208 +   processed.  Then it initiates commit processing.
91209 +
91210 +   Called by the single remaining open "active" txnh, which is closing. Other
91211 +   open txnhs belong to processes which wait atom commit in commit_txnh()
91212 +   routine. They are counted as "waiters" in atom->nr_waiters.  Therefore as
91213 +   long as we hold the atom lock none of the jnodes can be captured and/or
91214 +   locked.
91215 +
91216 +   Return value is an error code if commit fails (-E_REPEAT, with the atom unlocked, when atom_can_be_committed()
91217 +   refuses). Otherwise the result of reiser4_write_logs() is returned with *atom locked and in ASTAGE_DONE. */
91218 +static int commit_current_atom (long *nr_submitted, txn_atom ** atom)
91219 +{
91220 +       reiser4_super_info_data * sbinfo = get_current_super_private ();
91221 +       long ret;
91222 +       /* how many times jnode_flush() was called as a part of attempt to
91223 +        * commit this atom. */
91224 +       int  flushiters;
91225 +
91226 +       assert ("zam-888", atom != NULL && *atom != NULL);
91227 +       assert ("zam-886", spin_atom_is_locked(*atom));
91228 +       assert ("zam-887", get_current_context()->trans->atom == *atom);
91229 +       assert("jmacd-151", atom_isopen(*atom));
91230 +
91231 +       /* lock ordering: delete_sema and commit_sema are unordered */
91232 +       assert("nikita-3184",
91233 +              get_current_super_private()->delete_sema_owner != current);
91234 +
91235 +       ON_TRACE(TRACE_TXN, "atom %u trying to commit %u: CAPTURE_WAIT\n",
91236 +                (*atom)->atom_id, current->pid);
91237 +
91238 +       /* call reiser4_update_sd for all atom's inodes */
91239 +       /*atom_update_stat_data(atom);*/
91240 +       /* keep flushing for as long as flush_current_atom() answers -E_REPEAT */
91241 +       for (flushiters = 0 ;; ++ flushiters) {
91242 +               ret = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS | JNODE_FLUSH_COMMIT, nr_submitted, atom);
91243 +               if (ret != -E_REPEAT)
91244 +                       break;
91245 +
91246 +               /* if atom's dirty list contains one znode which is
91247 +                  HEARD_BANSHEE and is locked we have to allow lock owner to
91248 +                  continue and uncapture that znode */
91249 +               preempt_point();
91250 +
91251 +               *atom = get_current_atom_locked(); /* caller's pointer is refreshed before the next attempt */
91252 +               if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) {
91253 +                       warning("nikita-3176",
91254 +                               "Flushing like mad: %i", flushiters);
91255 +                       info_atom("atom", *atom);
91256 +                       DEBUGON(flushiters > (1 << 20));
91257 +               }
91258 +       }
91259 +
91260 +       if (ret) /* any non-zero result other than -E_REPEAT ends the commit attempt */
91261 +               return ret;
91262 +
91263 +       assert ("zam-882", spin_atom_is_locked(*atom));
91264 +
91265 +       if (!atom_can_be_committed(*atom)) {
91266 +               UNLOCK_ATOM(*atom);
91267 +               reiser4_stat_inc(txnmgr.restart.cannot_commit);
91268 +               return RETERR(-E_REPEAT);
91269 +       }
91270 +
91271 +       /* Up to this point we have been flushing and after flush is called we
91272 +          return -E_REPEAT.  Now we can commit.  We cannot return -E_REPEAT
91273 +          at this point, commit should be successful. */
91274 +       atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
91275 +       ON_DEBUG(((*atom)->committer = current));
91276 +
91277 +       ON_TRACE(TRACE_TXN, "commit atom %u: PRE_COMMIT\n", (*atom)->atom_id);
91278 +       ON_TRACE(TRACE_FLUSH, "everything flushed atom %u: PRE_COMMIT\n", (*atom)->atom_id);
91279 +
91280 +       UNLOCK_ATOM(*atom);
91281 +
91282 +       ret = current_atom_complete_writes();
91283 +       if (ret) /* NOTE(review): this return leaves the atom unlocked and already in ASTAGE_PRE_COMMIT -- confirm callers cope */
91284 +               return ret;
91285 +
91286 +       assert ("zam-906", capture_list_empty(ATOM_WB_LIST(*atom)));
91287 +
91288 +       ON_TRACE(TRACE_FLUSH, "everything written back atom %u\n",
91289 +                (*atom)->atom_id);
91290 +
91291 +       /* isolate critical code path which should be executed by only one
91292 +        * thread using tmgr semaphore */
91293 +       down(&sbinfo->tmgr.commit_semaphore);
91294 +
91295 +       ret = reiser4_write_logs(nr_submitted);
91296 +       if (ret < 0)
91297 +               reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
91298 +
91299 +       /* The atom->ovrwr_nodes list is processed under commit semaphore held
91300 +          because of bitmap nodes which are captured by special way in
91301 +          bitmap_pre_commit_hook(), that way does not include
91302 +          capture_fuse_wait() as a capturing of other nodes does -- the commit
91303 +          semaphore is used for transaction isolation instead. */
91304 +       invalidate_list(ATOM_OVRWR_LIST(*atom));
91305 +       up(&sbinfo->tmgr.commit_semaphore);
91306 +       /* the clean and writeback lists are emptied after the commit semaphore has been released */
91307 +       invalidate_list(ATOM_CLEAN_LIST(*atom));
91308 +       invalidate_list(ATOM_WB_LIST(*atom));
91309 +       assert("zam-927", capture_list_empty(&(*atom)->inodes));
91310 +
91311 +       LOCK_ATOM(*atom); /* not released again in this function: the atom is returned locked */
91312 +       atom_set_stage(*atom, ASTAGE_DONE);
91313 +       ON_DEBUG((*atom)->committer = 0);
91314 +
91315 +       /* Atom's state changes, so wake up everybody waiting for this
91316 +          event. */
91317 +       wakeup_atom_waiting_list(*atom);
91318 +
91319 +       /* Decrement the "until commit" reference, at least one txnh (the caller) is
91320 +          still open. */
91321 +       atomic_dec(&(*atom)->refcount);
91322 +
91323 +       assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
91324 +       assert("jmacd-1062", (*atom)->capture_count == 0);
91325 +       BUG_ON((*atom)->capture_count != 0);
91326 +       assert("jmacd-1071", spin_atom_is_locked(*atom));
91327 +
91328 +       ON_TRACE(TRACE_TXN, "commit atom finished %u refcount %d\n",
91329 +                (*atom)->atom_id, atomic_read(&(*atom)->refcount));
91330 +       return ret;
91331 +}
91332 +
91333 +/* TXN_TXNH */
91334 +
91335 +/* commit current atom and wait commit completion; atom and txn_handle should be
91336 + * locked before call, this function unlocks them and then calls txn_restart_current(). Always returns 0. */
91337 +static int force_commit_atom_nolock (txn_handle * txnh)
91338 +{
91339 +       txn_atom * atom;
91340 +
91341 +       assert ("zam-837", txnh != NULL);
91342 +       assert ("zam-835", spin_txnh_is_locked(txnh));
91343 +       assert ("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
91344 +
91345 +       atom = txnh->atom;
91346 +
91347 +       assert ("zam-834", atom != NULL);
91348 +       assert ("zam-836", spin_atom_is_locked(atom));
91349 +
91350 +       /* Set flags for atom and txnh: forcing atom commit and waiting for
91351 +        * commit completion */
91352 +       txnh->flags |= TXNH_WAIT_COMMIT;
91353 +       atom->flags |= ATOM_FORCE_COMMIT;
91354 +
91355 +       UNLOCK_TXNH(txnh);
91356 +       UNLOCK_ATOM(atom);
91357 +       /* both locks are released before restarting the current transaction */
91358 +       txn_restart_current();
91359 +       return 0;
91360 +}
91361 +
91362 +/* externally visible function which takes all necessary locks and commits
91363 + * current atom; returns 0 at once if the current transaction handle has no atom */
91364 +reiser4_internal int txnmgr_force_commit_current_atom (void)
91365 +{
91366 +       txn_handle * txnh = get_current_context()->trans;
91367 +       txn_atom * atom;
91368 +
91369 +       atom = txnh_get_atom(txnh); /* the UNLOCK_TXNH() below suggests txnh comes back locked even when NULL is returned -- TODO confirm */
91370 +
91371 +       if (atom == NULL) {
91372 +               UNLOCK_TXNH(txnh);
91373 +               return 0;
91374 +       }
91375 +
91376 +       return force_commit_atom_nolock(txnh);
91377 +}
91378 +
91379 +/* Called to force commit of any outstanding atoms.  @commit_all_atoms controls
91380 + * whether we commit all atoms, including new ones which are created after this
91381 + * function is called. Returns 0, or a non-zero result of force_commit_atom_nolock(). */
91382 +reiser4_internal int
91383 +txnmgr_force_commit_all (struct super_block *super, int commit_all_atoms)
91384 +{
91385 +       int ret;
91386 +       txn_atom *atom;
91387 +       txn_mgr *mgr;
91388 +       txn_handle *txnh;
91389 +       unsigned long start_time = jiffies; /* compared against atom->start_time when !commit_all_atoms */
91390 +       reiser4_context * ctx = get_current_context();
91391 +
91392 +       assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
91393 +       assert("nikita-3058", commit_check_locks());
91394 +
91395 +       txn_restart(ctx);
91396 +
91397 +       mgr = &get_super_private(super)->tmgr;
91398 +
91399 +       txnh = ctx->trans;
91400 +
91401 +again:
91402 +       /* the atom list is rescanned from its start after every forced commit or wait */
91403 +       spin_lock_txnmgr(mgr);
91404 +
91405 +       for_all_type_safe_list(atom, &mgr->atoms_list, atom) {
91406 +               LOCK_ATOM(atom);
91407 +
91408 +               /* Commit any atom which can be committed.  If @commit_all_atoms
91409 +                * is not set we commit only atoms which were created before
91410 +                * this call is started. */
91411 +               if (commit_all_atoms || time_before_eq(atom->start_time, start_time)) {
91412 +                       if (atom->stage <= ASTAGE_POST_COMMIT) {
91413 +                               spin_unlock_txnmgr(mgr);
91414 +
91415 +                               if (atom->stage < ASTAGE_PRE_COMMIT) {
91416 +                                       LOCK_TXNH(txnh);
91417 +                                       /* Add force-context txnh */
91418 +                                       capture_assign_txnh_nolock(atom, txnh);
91419 +                                       ret = force_commit_atom_nolock(txnh);
91420 +                                       if(ret)
91421 +                                               return ret;
91422 +                               } else
91423 +                                       /* wait atom commit */
91424 +                                       atom_wait_event(atom);
91425 +
91426 +                               goto again;
91427 +                       }
91428 +               }
91429 +
91430 +               UNLOCK_ATOM(atom);
91431 +       }
91432 +
91433 +#if REISER4_DEBUG
91434 +       if (commit_all_atoms) {
91435 +               reiser4_super_info_data * sbinfo = get_super_private(super);
91436 +               reiser4_spin_lock_sb(sbinfo);
91437 +               assert("zam-813", sbinfo->blocks_fake_allocated_unformatted == 0);
91438 +               assert("zam-812", sbinfo->blocks_fake_allocated == 0);
91439 +               reiser4_spin_unlock_sb(sbinfo);
91440 +       }
91441 +#endif
91442 +
91443 +       spin_unlock_txnmgr(mgr);
91444 +
91445 +       return 0;
91446 +}
91447 +
91448 +/* check whether commit_some_atoms() can commit @atom: its stage is before PRE_COMMIT, every open
91449 + * handle is a waiter (txnh_count == nr_waiters) and atom_should_commit() agrees. Locking is up to the caller */
91450 +static int atom_is_committable(txn_atom *atom)
91451 +{
91452 +       return
91453 +               atom->stage < ASTAGE_PRE_COMMIT &&
91454 +               atom->txnh_count == atom->nr_waiters &&
91455 +               atom_should_commit(atom);
91456 +}
91457 +
91458 +/* called periodically from ktxnmgrd to commit old atoms. Releases ktxnmgrd spin
91459 + * lock (mgr->daemon->guard) at exit. At most one atom is picked per call; always returns 0 */
91460 +reiser4_internal int
91461 +commit_some_atoms(txn_mgr * mgr)
91462 +{
91463 +       int ret = 0;
91464 +       txn_atom *atom;
91465 +       txn_atom *next_atom;
91466 +       txn_handle *txnh;
91467 +       reiser4_context *ctx;
91468 +
91469 +       ctx = get_current_context();
91470 +       assert("nikita-2444", ctx != NULL);
91471 +
91472 +       txnh = ctx->trans;
91473 +       spin_lock_txnmgr(mgr);
91474 +
91475 +       /* look for atom to commit */
91476 +       for_all_type_safe_list_safe(atom, &mgr->atoms_list, atom, next_atom) {
91477 +               /* first test without taking atom spin lock, whether it is
91478 +                * eligible for committing at all */
91479 +               if (atom_is_committable(atom)) {
91480 +                       /* now, take spin lock and re-check */
91481 +                       LOCK_ATOM(atom);
91482 +                       if (atom_is_committable(atom))
91483 +                               break; /* leave the loop with this atom still locked */
91484 +                       UNLOCK_ATOM(atom);
91485 +               }
91486 +       }
91487 +       /* non-zero ret means the loop ran off the end of the list without breaking */
91488 +       ret = atom_list_end(&mgr->atoms_list, atom);
91489 +       spin_unlock_txnmgr(mgr);
91490 +
91491 +       if (ret) {
91492 +               /* nothing found */
91493 +               spin_unlock(&mgr->daemon->guard);
91494 +               return 0;
91495 +       }
91496 +
91497 +       LOCK_TXNH(txnh);
91498 +
91499 +       /* Set the atom to force committing */
91500 +       atom->flags |= ATOM_FORCE_COMMIT;
91501 +
91502 +       /* Add force-context txnh */
91503 +       capture_assign_txnh_nolock(atom, txnh);
91504 +
91505 +       UNLOCK_TXNH(txnh);
91506 +       UNLOCK_ATOM(atom);
91507 +
91508 +       /* we are about to release daemon spin lock, notify daemon it
91509 +          has to rescan atoms */
91510 +       mgr->daemon->rescan = 1;
91511 +       spin_unlock(&mgr->daemon->guard);
91512 +       txn_restart(ctx);
91513 +       return 0;
91514 +}
91515 +
91516 +/* Calls jnode_flush for current atom if it exists; if not, just take another
91517 +   atom and call jnode_flush() for it.  If the current transaction handle
91518 +   already has an atom assigned (current atom) we have to close the current
91519 +   transaction prior to switching to another atom or doing something with the
91520 +   current atom. This code tries to flush the current atom.
91521 +
91522 +   flush_some_atom() is called as part of memory clearing process. It is
91523 +   invoked from balance_dirty_pages(), pdflushd, and entd.
91524 +
91525 +   If we can flush no nodes, atom is committed, because this frees memory.
91526 +   If atom is too large or too old it is committed also.
91527 +
91528 +   @flags and @nr_submitted go to flush_current_atom(); @wbc->nonblocking disables the atom_wait_event() pass and TXNH_WAIT_COMMIT. */
91529 +reiser4_internal int
91530 +flush_some_atom(long *nr_submitted, const struct writeback_control *wbc, int flags)
91531 +{
91532 +       reiser4_context *ctx = get_current_context();
91533 +       txn_handle *txnh = ctx->trans;
91534 +       txn_atom *atom;
91535 +       int ret;
91536 +       int ret1;
91537 +
91538 + repeat:
91539 +       if (txnh->atom == NULL) {
91540 +               /* current atom is not available, take first suitable one from txnmgr */
91541 +               txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
91542 +
91543 +               spin_lock_txnmgr(tmgr);
91544 +
91545 +               /* traverse the list of all atoms */
91546 +               for_all_type_safe_list(atom, &tmgr->atoms_list, atom) {
91547 +                       /* lock atom before checking its state */
91548 +                       LOCK_ATOM(atom);
91549 +
91550 +                       /* we need an atom which is not being committed and which has no
91551 +                        * flushers (jnode_flush() add one flusher at the beginning and
91552 +                        * subtract one at the end). */
91553 +                       if (atom->stage < ASTAGE_PRE_COMMIT && atom->nr_flushers == 0) {
91554 +                               LOCK_TXNH(txnh);
91555 +                               capture_assign_txnh_nolock(atom, txnh);
91556 +                               UNLOCK_TXNH(txnh);
91557 +
91558 +                               goto found;
91559 +                       }
91560 +
91561 +                       UNLOCK_ATOM(atom);
91562 +               }
91563 +
91564 +               /* Write throttling in case no atom can be
91565 +                * flushed/committed.  */
91566 +               if (!current_is_pdflush() && !wbc->nonblocking) {
91567 +                       for_all_type_safe_list(atom, &tmgr->atoms_list, atom) {
91568 +                               LOCK_ATOM(atom);
91569 +                               /* Repeat the check from the above. */
91570 +                               if (atom->stage < ASTAGE_PRE_COMMIT && atom->nr_flushers == 0) {
91571 +                                       LOCK_TXNH(txnh);
91572 +                                       capture_assign_txnh_nolock(atom, txnh);
91573 +                                       UNLOCK_TXNH(txnh);
91574 +
91575 +                                       goto found;
91576 +                               }
91577 +                               if (atom->stage <= ASTAGE_POST_COMMIT) {
91578 +                                       spin_unlock_txnmgr(tmgr);
91579 +                                       /* we just wait until atom's flusher
91580 +                                          makes a progress in flushing or
91581 +                                          committing the atom */
91582 +                                       atom_wait_event(atom);
91583 +                                       goto repeat;
91584 +                               }
91585 +                               UNLOCK_ATOM(atom);
91586 +                       }
91587 +               }
91588 +               spin_unlock_txnmgr(tmgr);
91589 +               return 0; /* no atom could be picked */
91590 +       found:
91591 +               spin_unlock_txnmgr(tmgr); /* the chosen atom is still locked here */
91592 +       } else
91593 +               atom = get_current_atom_locked();
91594 +
91595 +       ret = flush_current_atom(flags, nr_submitted, &atom);
91596 +       if (ret == 0) {
91597 +               if (*nr_submitted == 0 || atom_should_commit_asap(atom)) {
91598 +                       /* if early flushing could not make more nodes clean,
91599 +                        * or atom is too old/large,
91600 +                        * we force current atom to commit */
91601 +                       /* wait for commit completion but only if this
91602 +                        * wouldn't stall pdflushd and ent thread. */
91603 +                       if (!wbc->nonblocking && !ctx->entd)
91604 +                               txnh->flags |= TXNH_WAIT_COMMIT;
91605 +                       atom->flags |= ATOM_FORCE_COMMIT;
91606 +               }
91607 +               UNLOCK_ATOM(atom);
91608 +       } else if (ret == -E_REPEAT) {
91609 +               if (*nr_submitted == 0)
91610 +                       goto repeat;
91611 +               ret = 0; /* something was submitted, so -E_REPEAT is reported as success */
91612 +       }
91613 +
91614 +       ret1 = txn_end(ctx);
91615 +       assert("vs-1692", ret1 == 0); /* NOTE(review): if this assertion holds, the ret1 > 0 branch below never runs -- confirm which is intended */
91616 +       if (ret1 > 0)
91617 +               *nr_submitted += ret1;
91618 +       txn_begin(ctx);
91619 +
91620 +       return ret;
91621 +}
91622 +
91623 +#if REISER4_COPY_ON_CAPTURE
91624 +
91625 +/* Empty capture list @head: each node is passed to uncapture_block() and jput() (thereby removed from the transaction). */
91626 +void
91627 +invalidate_list(capture_list_head * head)
91628 +{
91629 +       txn_atom *atom;
91630 +
91631 +       spin_lock(&scan_lock);
91632 +       while (!capture_list_empty(head)) { /* presumably uncapture_block() unlinks the node from @head, else this would not terminate -- confirm */
91633 +               jnode *node;
91634 +
91635 +               node = capture_list_front(head);
91636 +               JF_SET(node, JNODE_SCANNED);
91637 +               spin_unlock(&scan_lock);
91638 +               /* scan_lock is dropped before the atom and jnode locks are taken */
91639 +               atom = node->atom;
91640 +               LOCK_ATOM(atom);
91641 +               LOCK_JNODE(node);
91642 +               if (JF_ISSET(node, JNODE_CC) && node->pg) {
91643 +                       /* corresponding page_cache_get is in swap_jnode_pages */
91644 +                       assert("vs-1448", test_and_clear_bit(PG_arch_1, &node->pg->flags)); /* NOTE(review): side effect inside assert() -- confirm it is not needed when asserts are off */
91645 +                       page_cache_release(node->pg);
91646 +               }
91647 +               uncapture_block(node); /* no UNLOCK_JNODE() follows; presumably uncapture_block() drops the jnode lock -- TODO confirm */
91648 +               UNLOCK_ATOM(atom);
91649 +               JF_CLR(node, JNODE_SCANNED);
91650 +               jput(node);
91651 +
91652 +               spin_lock(&scan_lock);
91653 +       }
91654 +       spin_unlock(&scan_lock);
91655 +}
91656 +
91657 +#else
91658 +
91659 +/* Empty capture list @head: each node is locked, passed to uncapture_block() and jput() (thereby removed from the transaction). */
91660 +void
91661 +invalidate_list(capture_list_head * head)
91662 +{
91663 +       while (!capture_list_empty(head)) { /* presumably uncapture_block() unlinks the node from @head, else this would not terminate -- confirm */
91664 +               jnode *node;
91665 +
91666 +               node = capture_list_front(head);
91667 +               LOCK_JNODE(node);
91668 +               uncapture_block(node); /* no UNLOCK_JNODE() follows; presumably uncapture_block() drops the jnode lock -- TODO confirm */
91669 +               jput(node);
91670 +       }
91671 +}
91672 +
91673 +#endif
91674 +/* Prepare @wlinks for the current thread: record its lock stack, clean both list links, clear both callbacks */
91675 +static void
91676 +init_wlinks(txn_wait_links * wlinks)
91677 +{
91678 +       wlinks->_lock_stack = get_current_lock_stack();
91679 +       fwaitfor_list_clean(wlinks);
91680 +       fwaiting_list_clean(wlinks);
91681 +       wlinks->waitfor_cb = NULL;
91682 +       wlinks->waiting_cb = NULL;
91683 +}
91684 +
91685 +/* Put the caller on @atom's waitfor list, unlock the atom and sleep until woken up; must be called with @atom locked */
91686 +reiser4_internal void atom_wait_event(txn_atom * atom)
91687 +{
91688 +       txn_wait_links _wlinks; /* lives on this stack frame; unlinked again before returning */
91689 +
91690 +       assert("zam-744", spin_atom_is_locked(atom));
91691 +       assert("nikita-3156",
91692 +              lock_stack_isclean(get_current_lock_stack()) ||
91693 +              atom->nr_running_queues > 0);
91694 +
91695 +       init_wlinks(&_wlinks);
91696 +       fwaitfor_list_push_back(&atom->fwaitfor_list, &_wlinks);
91697 +       atomic_inc(&atom->refcount); /* reference taken before unlocking and sleeping; given back by atom_dec_and_unlock() below */
91698 +       UNLOCK_ATOM(atom);
91699 +
91700 +       /* assert("nikita-3056", commit_check_locks()); */
91701 +       prepare_to_sleep(_wlinks._lock_stack);
91702 +       go_to_sleep(_wlinks._lock_stack, ADD_TO_SLEPT_IN_WAIT_EVENT);
91703 +
91704 +       LOCK_ATOM (atom);
91705 +       fwaitfor_list_remove(&_wlinks);
91706 +       atom_dec_and_unlock (atom); /* presumably drops the reference and the atom lock -- confirm against its definition */
91707 +}
91708 +/* Switch locked @atom to @stage (asserted never to go backwards); waiters are notified only if the stage actually changed */
91709 +reiser4_internal void
91710 +atom_set_stage(txn_atom *atom, txn_stage stage)
91711 +{
91712 +       assert("nikita-3535", atom != NULL);
91713 +       assert("nikita-3538", spin_atom_is_locked(atom));
91714 +       assert("nikita-3536", ASTAGE_FREE <= stage && stage <= ASTAGE_INVALID);
91715 +       /* Excelsior! */
91716 +       assert("nikita-3537", stage >= atom->stage);
91717 +       if (atom->stage != stage) {
91718 +               atom->stage = stage;
91719 +               atom_send_event(atom);
91720 +       }
91721 +}
91722 +
91723 +/* wake all threads which wait for an event on @atom (via wakeup_atom_waitfor_list()); @atom must be locked */
91724 +reiser4_internal void
91725 +atom_send_event(txn_atom * atom)
91726 +{
91727 +       assert("zam-745", spin_atom_is_locked(atom));
91728 +       wakeup_atom_waitfor_list(atom);
91729 +}
91730 +
91731 +/* Informs txn manager code that owner of this txn_handle should wait atom commit completion (for
91732 +   example, because it does fsync(2)). Returns non-zero iff TXNH_WAIT_COMMIT is set in h->flags */
91733 +static int
91734 +should_wait_commit(txn_handle * h)
91735 +{
91736 +       return h->flags & TXNH_WAIT_COMMIT;
91737 +}
91738 +/* State handed to try_commit_txnh() on each of its repeated calls from commit_txnh() */
91739 +typedef struct commit_data {
91740 +       txn_atom    *atom; /* refreshed by try_commit_txnh() from txnh_get_atom(txnh) */
91741 +       txn_handle  *txnh; /* the transaction handle being closed */
91742 +       long         nr_written;
91743 +       /* as an optimization we start committing atom by first trying to
91744 +        * flush it few times without switching into ASTAGE_CAPTURE_WAIT. This
91745 +        * allows to reduce stalls due to other threads waiting for atom in
91746 +        * ASTAGE_CAPTURE_WAIT stage. ->preflush is counter of these
91747 +        * preliminary flushes. */
91748 +       int          preflush;
91749 +       /* have we waited on atom. */
91750 +       int          wait; /* when set, try_commit_txnh() decrements atom->nr_waiters and clears it */
91751 +       int          failed; /* when set, try_commit_txnh() returns 0 without trying to commit */
91752 +       int          wake_ktxnmgrd_up; /* set by try_commit_txnh() in its TXNH_DONT_COMMIT branch */
91753 +} commit_data;
91754 +
91755 +/*
91756 + * Called from commit_txnh() repeatedly, until either error happens, or atom
91757 + * commits successfully.
91758 + */
91759 +static int
91760 +try_commit_txnh(commit_data *cd)
91761 +{
91762 +       int result;
91763 +
91764 +       assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
91765 +
91766 +       /* Get the atom and txnh locked. */
91767 +       cd->atom = txnh_get_atom(cd->txnh);
91768 +       assert("jmacd-309", cd->atom != NULL);
91769 +       UNLOCK_TXNH(cd->txnh);
91770 +
91771 +       if (cd->wait) {
91772 +               cd->atom->nr_waiters --;
91773 +               cd->wait = 0;
91774 +       }
91775 +
91776 +       if (cd->atom->stage == ASTAGE_DONE)
91777 +               return 0;
91778 +
91779 +       ON_TRACE(TRACE_TXN,
91780 +                "commit_txnh: atom %u failed %u; txnh_count %u; should_commit %u\n",
91781 +                cd->atom->atom_id, cd->failed, cd->atom->txnh_count,
91782 +                atom_should_commit(cd->atom));
91783 +
91784 +       if (cd->failed)
91785 +               return 0;
91786 +
91787 +       if (atom_should_commit(cd->atom)) {
91788 +               /* if atom is  _very_ large schedule it for  commit as soon as
91789 +                * possible. */
91790 +               if (atom_should_commit_asap(cd->atom)) {
91791 +                       /*
91792 +                        * When atom is in PRE_COMMIT or later stage following
91793 +                        * invariant (encoded   in    atom_can_be_committed())
91794 +                        * holds:  there is exactly one non-waiter transaction
91795 +                        * handle opened  on this atom.  When  thread wants to
91796 +                        * wait  until atom  commits (for  example  sync()) it
91797 +                        * waits    on    atom  event     after     increasing
91798 +                        * atom->nr_waiters (see blow  in  this  function). It
91799 +                        * cannot be guaranteed that atom is already committed
91800 +                        * after    receiving event,  so     loop has   to  be
91801 +                        * re-started. But  if  atom switched into  PRE_COMMIT
91802 +                        * stage and became  too  large, we cannot  change its
91803 +                        * state back   to CAPTURE_WAIT (atom  stage can  only
91804 +                        * increase monotonically), hence this check.
91805 +                        */
91806 +                       if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
91807 +                               atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
91808 +                       cd->atom->flags |= ATOM_FORCE_COMMIT;
91809 +               }
91810 +               if (cd->txnh->flags & TXNH_DONT_COMMIT) {
91811 +                       /*
91812 +                        * this  thread (transaction  handle  that is) doesn't
91813 +                        * want to commit  atom. Notify waiters that handle is
91814 +                        * closed. This can happen, for  example, when we  are
91815 +                        * under  VFS directory lock  and don't want to commit
91816 +                        * atom  right   now to  avoid  stalling other threads
91817 +                        * working in the same directory.
91818 +                        */
91819 +
91820 +                       /* Wake  the ktxnmgrd up if  the ktxnmgrd is needed to
91821 +                        * commit this  atom: no  atom  waiters  and only  one
91822 +                        * (our) open transaction handle. */
91823 +                       cd->wake_ktxnmgrd_up =
91824 +                               cd->atom->txnh_count == 1 &&
91825 +                               cd->atom->nr_waiters == 0;
91826 +                       atom_send_event(cd->atom);
91827 +                       result = 0;
91828 +               } else if (!atom_can_be_committed(cd->atom)) {
91829 +                       if (should_wait_commit(cd->txnh)) {
91830 +                               /* sync(): wait for commit */
91831 +                               cd->atom->nr_waiters++;
91832 +                               cd->wait = 1;
91833 +                               atom_wait_event(cd->atom);
91834 +                               reiser4_stat_inc(txnmgr.restart.should_wait);
91835 +                               result = RETERR(-E_REPEAT);
91836 +                       } else {
91837 +                               result = 0;
91838 +                       }
91839 +               } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
91840 +                       /*
91841 +                        * optimization: flush  atom without switching it into
91842 +                        * ASTAGE_CAPTURE_WAIT.
91843 +                        *
91844 +                        * But don't  do this for  ktxnmgrd, because  ktxnmgrd
91845 +                        * should never block on atom fusion.
91846 +                        */
91847 +                       result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
91848 +                                                   &cd->nr_written, &cd->atom);
91849 +                       if (result == 0) {
91850 +                               UNLOCK_ATOM(cd->atom);
91851 +                               cd->preflush = 0;
91852 +                               reiser4_stat_inc(txnmgr.restart.flush);
91853 +                               result = RETERR(-E_REPEAT);
91854 +                       } else  /* Atoms wasn't flushed
91855 +                                * completely. Rinse. Repeat. */
91856 +                               -- cd->preflush;
91857 +               } else {
91858 +                       /* We change   atom state  to   ASTAGE_CAPTURE_WAIT to
91859 +                          prevent atom fusion and count  ourself as an active
91860 +                          flusher */
91861 +                       atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
91862 +                       cd->atom->flags |= ATOM_FORCE_COMMIT;
91863 +
91864 +                       result = commit_current_atom(&cd->nr_written, &cd->atom);
91865 +                       if (result != 0 && result != -E_REPEAT)
91866 +                               cd->failed = 1;
91867 +               }
91868 +       } else
91869 +               result = 0;
91870 +
91871 +       assert("jmacd-1027", ergo(result == 0, spin_atom_is_locked(cd->atom)));
91872 +       /* perfectly valid assertion, except that when atom/txnh is not locked
91873 +        * fusion can take place, and cd->atom points nowhere. */
91874 +       /*
91875 +         assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
91876 +       */
91877 +       return result;
91878 +}
91879 +
91880 +/* Called to commit a transaction handle.  This decrements the atom's number of open
91881 +   handles and if it is the last handle to commit and the atom should commit, initiates
91882 +   atom commit.  NOTE: currently always returns 0; cd.nr_written is not returned. */
91883 +static long
91884 +commit_txnh(txn_handle * txnh)
91885 +{
91886 +       commit_data cd;
91887 +       assert("umka-192", txnh != NULL);
91888 +
91889 +       xmemset(&cd, 0, sizeof cd);
91890 +       cd.txnh = txnh;
91891 +       cd.preflush = 10; /* upper bound on preliminary flush attempts in try_commit_txnh() */
91892 +
91893 +       /* calls try_commit_txnh() until it returns 0 (atom committed, commit
91894 +        * not needed/wanted by this handle, or a commit error was recorded) */
91895 +       while (try_commit_txnh(&cd) != 0)
91896 +               preempt_point();
91897 +
91898 +       assert("nikita-3171", spin_txnh_is_not_locked(txnh));
91899 +       LOCK_TXNH(txnh);
91900 +
91901 +       cd.atom->txnh_count -= 1;
91902 +       txnh->atom = NULL;
91903 +
91904 +       txnh_list_remove(txnh);
91905 +
91906 +       ON_TRACE(TRACE_TXN, "close txnh atom %u refcount %d\n",
91907 +                cd.atom->atom_id, atomic_read(&cd.atom->refcount));
91908 +
91909 +       UNLOCK_TXNH(txnh);
91910 +       atom_dec_and_unlock(cd.atom);
91911 +       /* if we don't want to do a commit (TXNH_DONT_COMMIT is set, probably
91912 +        * because it takes time) by current thread, we do that work
91913 +        * asynchronously by ktxnmgrd daemon. */
91914 +       if (cd.wake_ktxnmgrd_up)
91915 +               ktxnmgrd_kick(&get_current_super_private()->tmgr);
91916 +
91917 +       return 0;
91918 +}
91919 +
91920 +/* TRY_CAPTURE */
91921 +
91922 +/* This routine attempts a single block-capture request.  It may return -E_REPEAT if some
91923 +   condition indicates that the request should be retried, and it may block if the
91924 +   txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
91925 +
91926 +   This routine encodes the basic logic of block capturing described by:
91927 +
91928 +     http://namesys.com/v4/v4.html
91929 +
91930 +   Our goal here is to ensure that any two blocks that contain dependent modifications
91931 +   should commit at the same time.  This function enforces this discipline by initiating
91932 +   fusion whenever a transaction handle belonging to one atom requests to read or write a
91933 +   block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
91934 +
91935 +   In addition, this routine handles the initial assignment of atoms to blocks and
91936 +   transaction handles.  These are possible outcomes of this function:
91937 +
91938 +   1. The block and handle are already part of the same atom: return immediate success
91939 +
91940 +   2. The block is assigned but the handle is not: call capture_assign_txnh to assign
91941 +      the handle to the block's atom.
91942 +
91943 +   3. The handle is assigned but the block is not: call capture_assign_block to assign
91944 +      the block to the handle's atom.
91945 +
91946 +   4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
91947 +      to fuse atoms.
91948 +
91949 +   5. Neither block nor handle are assigned: create a new atom and assign them both.
91950 +
91951 +   6. A read request for a non-captured block: return immediate success.
91952 +
91953 +   This function acquires and releases the handle's spinlock.  This function is called
91954 +   under the jnode lock and if the return value is 0, it returns with the jnode lock still
91955 +   held.  If the return is -E_REPEAT or some other error condition, the jnode lock is
91956 +   released.  The external interface (try_capture) manages re-acquiring the jnode lock
91957 +   in the failure case.
91958 +*/
91959 +
91960 +static int
91961 +try_capture_block(txn_handle * txnh, jnode * node, txn_capture mode, txn_atom ** atom_alloc, int can_coc)
91962 +{
91963 +       int ret;
91964 +       txn_atom *block_atom;
91965 +       txn_atom *txnh_atom;
91966 +
91967 +       /* Should not call capture for READ_NONCOM requests, handled in try_capture. */
91968 +       assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
91969 +
91970 +       /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree == node->tree somewhere. */
91971 +
91972 +       assert("umka-194", txnh != NULL);
91973 +       assert("umka-195", node != NULL);
91974 +
91975 +       /* The jnode is already locked!  Being called from try_capture().  NOTE(review): the label below duplicates "jmacd-567" above. */
91976 +       assert("jmacd-567", spin_jnode_is_locked(node));
91977 +
91978 +       block_atom = node->atom;
91979 +
91980 +       /* Get txnh spinlock, this allows us to compare txn_atom pointers but it doesn't
91981 +          let us touch the atoms themselves. */
91982 +       LOCK_TXNH(txnh);
91983 +
91984 +       txnh_atom = txnh->atom;
91985 +
91986 +       if (REISER4_STATS) {
91987 +               if (block_atom != NULL && txnh_atom != NULL)
91988 +                       if (block_atom == txnh_atom)
91989 +                               reiser4_stat_inc(txnmgr.capture_equal);
91990 +                       else
91991 +                               reiser4_stat_inc(txnmgr.capture_both);
91992 +               else if (block_atom != NULL && txnh_atom == NULL)
91993 +                       reiser4_stat_inc(txnmgr.capture_block);
91994 +               else if (block_atom == NULL && txnh_atom != NULL)
91995 +                       reiser4_stat_inc(txnmgr.capture_txnh);
91996 +               else
91997 +                       reiser4_stat_inc(txnmgr.capture_none);
91998 +       }
91999 +
92000 +       if (txnh_atom != NULL && block_atom == txnh_atom) {
92001 +               UNLOCK_TXNH(txnh);
92002 +               return 0;
92003 +       }
92004 +       /* NIKITA-HANS: nothing */
92005 +       if (txnh_atom != NULL) {
92006 +               /* It is time to perform deadlock prevention check over the
92007 +                  node we want to capture.  It is possible this node was
92008 +                  locked for read without capturing it. The optimization
92009 +                  which allows to do it helps us in keeping atoms independent
92010 +                  as long as possible but it may cause lock/fuse deadlock
92011 +                  problems.
92012 +
92013 +                  A number of similar deadlock situations with locked but not
92014 +                  captured nodes were found.  In each situation there are two
92015 +                  or more threads: one of them does flushing while another
92016 +                  one does routine balancing or tree lookup.  The flushing
92017 +                  thread (F) sleeps in long term locking request for node
92018 +                  (N), another thread (A) sleeps in trying to capture some
92019 +                  node already belonging to the atom of F, F has a state which
92020 +                  prevents immediate fusion.
92021 +
92022 +                  Deadlocks of this kind cannot happen if node N was properly
92023 +                  captured by thread A. The F thread fuses atoms before
92024 +                  locking therefore current atom of thread F and current atom
92025 +                  of thread A become the same atom and thread A may proceed.
92026 +                  This does not work if node N was not captured because the
92027 +                  fusion of atoms does not happen.
92028 +
92029 +                  The following scheme solves the deadlock: If
92030 +                  longterm_lock_znode locks and does not capture a znode,
92031 +                  that znode is marked as MISSED_IN_CAPTURE.  A node marked
92032 +                  this way is processed by the code below which restores the
92033 +                  missed capture and fuses current atoms of all the node lock
92034 +                  owners by calling the fuse_not_fused_lock_owners()
92035 +                  function.
92036 +               */
92037 +
92038 +               if (            // txnh_atom->stage >= ASTAGE_CAPTURE_WAIT &&
92039 +                          jnode_is_znode(node) && znode_is_locked(JZNODE(node))
92040 +                          && JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
92041 +                       JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
92042 +
92043 +                       ret = fuse_not_fused_lock_owners(txnh, JZNODE(node));
92044 +
92045 +                       if (ret) {
92046 +                               JF_SET(node, JNODE_MISSED_IN_CAPTURE); /* restore the mark so the check is redone on retry */
92047 +
92048 +                               assert("zam-687", spin_txnh_is_not_locked(txnh));
92049 +                               assert("zam-688", spin_jnode_is_not_locked(node));
92050 +
92051 +                               return ret;
92052 +                       }
92053 +
92054 +                       assert("zam-701", spin_txnh_is_locked(txnh));
92055 +                       assert("zam-702", spin_jnode_is_locked(node));
92056 +               }
92057 +       }
92058 +
92059 +       if (block_atom != NULL) {
92060 +               /* The block has already been assigned to an atom. */
92061 +
92062 +               /* case (block_atom == txnh_atom) is already handled above */
92063 +               if (txnh_atom == NULL) {
92064 +
92065 +                       /* The txnh is unassigned, try to assign it. */
92066 +                       ret = capture_assign_txnh(node, txnh, mode, can_coc);
92067 +                       if (ret != 0) {
92068 +                               /* E_REPEAT or otherwise */
92069 +                               assert("jmacd-6129", spin_txnh_is_not_locked(txnh));
92070 +                               assert("jmacd-6130", spin_jnode_is_not_locked(node));
92071 +                               return ret;
92072 +                       }
92073 +
92074 +                       /* Either the txnh is now assigned to the block's atom or the read-request was
92075 +                          granted because the block is committing.  Locks still held. */
92076 +               } else {
92077 +                       if (mode & TXN_CAPTURE_DONT_FUSE) {
92078 +                               UNLOCK_TXNH(txnh);
92079 +                               UNLOCK_JNODE(node);
92080 +                               /* we are in a "no-fusion" mode and @node is
92081 +                                * already part of transaction. */
92082 +                               return RETERR(-E_NO_NEIGHBOR);
92083 +                       }
92084 +                       /* In this case, both txnh and node belong to different atoms.  This function
92085 +                          returns -E_REPEAT on successful fusion, 0 on the fall-through case. */
92086 +                       ret = capture_init_fusion(node, txnh, mode, can_coc);
92087 +                       if (ret != 0) {
92088 +                               assert("jmacd-6131", spin_txnh_is_not_locked(txnh));
92089 +                               assert("jmacd-6132", spin_jnode_is_not_locked(node));
92090 +                               return ret;
92091 +                       }
92092 +
92093 +                       /* The fall-through case is read request for committing block.  Locks still
92094 +                          held. */
92095 +               }
92096 +
92097 +       } else if ((mode & TXN_CAPTURE_WTYPES) != 0) {
92098 +
92099 +               /* In this case, the block has no atom yet and the txnh wishes exclusive access. */
92100 +
92101 +               if (txnh_atom != NULL) {
92102 +                       /* The txnh is already assigned: add the page to its atom. */
92103 +                       ret = capture_assign_block(txnh, node);
92104 +                       if (ret != 0) {
92105 +                               /* E_REPEAT or otherwise */
92106 +                               assert("jmacd-6133", spin_txnh_is_not_locked(txnh));
92107 +                               assert("jmacd-6134", spin_jnode_is_not_locked(node));
92108 +                               return ret;
92109 +                       }
92110 +
92111 +                       /* Success: Locks are still held. */
92112 +
92113 +               } else {
92114 +
92115 +                       /* In this case, neither txnh nor page are assigned to an atom. */
92116 +                       block_atom = atom_begin_andlock(atom_alloc, node, txnh);
92117 +
92118 +                       if (!IS_ERR(block_atom)) {
92119 +                               /* Assign both, release atom lock. */
92120 +                               assert("jmacd-125", block_atom->stage == ASTAGE_CAPTURE_FUSE);
92121 +
92122 +                               capture_assign_txnh_nolock(block_atom, txnh);
92123 +                               capture_assign_block_nolock(block_atom, node);
92124 +
92125 +                               UNLOCK_ATOM(block_atom);
92126 +                       } else {
92127 +                               /* all locks are released already */
92128 +                               return PTR_ERR(block_atom);
92129 +                       }
92130 +
92131 +                       /* Success: Locks are still held. */
92132 +               }
92133 +
92134 +       } else {
92135 +               /* The jnode is uncaptured and it's a read request -- fine. */
92136 +               assert("jmacd-411", CAPTURE_TYPE(mode) == TXN_CAPTURE_READ_ATOMIC);
92137 +       }
92138 +
92139 +       /* Successful case: both jnode and txnh are still locked. */
92140 +       assert("jmacd-740", spin_txnh_is_locked(txnh));
92141 +       assert("jmacd-741", spin_jnode_is_locked(node));
92142 +
92143 +       /* Release txnh lock, return with the jnode still locked. */
92144 +       UNLOCK_TXNH(txnh);
92145 +
92146 +       return 0;
92147 +}
92148 +
92149 +reiser4_internal txn_capture
92150 +build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
92151 +{
92152 +       txn_capture cap_mode;
92153 +
92154 +       assert("nikita-3187", spin_jnode_is_locked(node));
92155 +
92156 +       /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
92157 +
92158 +       if (lock_mode == ZNODE_WRITE_LOCK) {
92159 +               cap_mode = TXN_CAPTURE_WRITE;
92160 +       } else if (node->atom != NULL) {
92161 +               cap_mode = TXN_CAPTURE_WRITE; /* non-write lock of an already captured node is captured as a write too */
92162 +       } else if (0 && /* txnh->mode == TXN_READ_FUSING && */
92163 +                  jnode_get_level(node) == LEAF_LEVEL) {
92164 +               /* NOTE-NIKITA TXN_READ_FUSING is not currently used */
92165 +               /* We only need a READ_FUSING capture at the leaf level.  This
92166 +                  is because the internal levels of the tree (twigs included)
92167 +                  are redundant from the point of the user that asked for a
92168 +                  read-fusing transcrash.  The user only wants to read-fuse
92169 +                  atoms due to reading uncommitted data that another user has
92170 +                  written.  It is the file system that reads/writes the
92171 +                  internal tree levels, the user only reads/writes leaves. */
92172 +               cap_mode = TXN_CAPTURE_READ_ATOMIC;
92173 +       } else {
92174 +               /* In this case (non-write lock of a node with no atom; the leaf
92175 +                * branch above is disabled by "0 &&") there's no reason to capture. */
92176 +               /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
92177 +
92178 +               /* Mark this node as "MISSED".  It helps in further deadlock
92179 +                * analysis */
92180 +               JF_SET(node, JNODE_MISSED_IN_CAPTURE);
92181 +               return 0; /* 0 means "nothing to capture": try_capture() returns success at once */
92182 +       }
92183 +
92184 +       cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING |
92185 +                             TXN_CAPTURE_DONT_FUSE));
92186 +       assert("nikita-3186", cap_mode != 0);
92187 +       return cap_mode;
92188 +}
92189 +
92190 +/* This is an external interface to try_capture_block(), it calls
92191 +   try_capture_block() repeatedly as long as -E_REPEAT is returned.
92192 +
92193 +   @node:         node to capture,
92194 +   @lock_mode:    read or write lock is used in capture mode calculation,
92195 +   @flags:        see txn_capture flags enumeration,
92196 +   @can_coc     : can copy-on-capture
92197 +
92198 +   @return: 0 - node was successfully captured, -E_REPEAT - capture request
92199 +            cannot be processed immediately as it was requested in flags,
92200 +           < 0 - other errors.
92201 +*/
92202 +reiser4_internal int
92203 +try_capture(jnode * node,  znode_lock_mode lock_mode,
92204 +           txn_capture flags, int can_coc)
92205 +{
92206 +       txn_atom    *atom_alloc = NULL;
92207 +       txn_capture cap_mode;
92208 +       txn_handle * txnh = get_current_context()->trans;
92209 +#if REISER4_COPY_ON_CAPTURE
92210 +       int coc_enabled = 1;
92211 +#endif
92212 +       int ret;
92213 +
92214 +       assert("jmacd-604", spin_jnode_is_locked(node));
92215 +
92216 +repeat:
92217 +       cap_mode = build_capture_mode(node, lock_mode, flags);
92218 +       if (cap_mode == 0)
92219 +               return 0;
92220 +
92221 +       /* Repeat try_capture as long as -E_REPEAT is returned. */
92222 +#if REISER4_COPY_ON_CAPTURE
92223 +       ret = try_capture_block(txnh, node, cap_mode, &atom_alloc, can_coc && coc_enabled);
92224 +       coc_enabled = 1; /* COC is only ever disabled for a single attempt (see -E_WAIT below) */
92225 +#else
92226 +       ret = try_capture_block(txnh, node, cap_mode, &atom_alloc, can_coc);
92227 +#endif
92228 +       /* Regardless of non_blocking:
92229 +
92230 +          If ret == 0 then jnode is still locked.
92231 +          If ret != 0 then jnode is unlocked.
92232 +       */
92233 +       assert("nikita-2674", ergo(ret == 0, spin_jnode_is_locked(node)));
92234 +       assert("nikita-2675", ergo(ret != 0, spin_jnode_is_not_locked(node)));
92235 +
92236 +       assert("nikita-2974", spin_txnh_is_not_locked(txnh));
92237 +
92238 +       if (ret == -E_REPEAT) {
92239 +               /* E_REPEAT implies all locks were released, therefore we need
92240 +                  to take the jnode's lock again. */
92241 +               LOCK_JNODE(node);
92242 +
92243 +               /* Although this may appear to be a busy loop, it is not.
92244 +                  There are several conditions that cause E_REPEAT to be
92245 +                  returned by the call to try_capture_block, all cases
92246 +                  indicating some kind of state change that means you should
92247 +                  retry the request and will get a different result.  In some
92248 +                  cases this could be avoided with some extra code, but
92249 +                  generally it is done because the necessary locks were
92250 +                  released as a result of the operation and repeating is the
92251 +                  simplest thing to do (less bug potential).  The cases are:
92252 +                  atom fusion returns E_REPEAT after it completes (jnode and
92253 +                  txnh were unlocked); race conditions in assign_block,
92254 +                  assign_txnh, and init_fusion return E_REPEAT (trylock
92255 +                  failure); after going to sleep in capture_fuse_wait
92256 +                  (request was blocked but may now succeed).  I'm not quite
92257 +                  sure how capture_copy works yet, but it may also return
92258 +                  E_REPEAT.  When the request is legitimately blocked, the
92259 +                  requestor goes to sleep in fuse_wait, so this is not a busy
92260 +                  loop. */
92261 +               /* NOTE-NIKITA: still don't understand:
92262 +
92263 +                  try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
92264 +
92265 +                  looks like busy loop?
92266 +               */
92267 +               goto repeat;
92268 +       }
92269 +
92270 +#if REISER4_COPY_ON_CAPTURE
92271 +       if (ret == -E_WAIT) {
92272 +               reiser4_stat_inc(coc.coc_wait);
92273 +               /* disable COC for the next loop iteration */
92274 +               coc_enabled = 0;
92275 +               LOCK_JNODE(node);
92276 +               goto repeat;
92277 +       }
92278 +#endif
92279 +
92280 +       /* free extra atom object that was possibly allocated by
92281 +          try_capture_block().
92282 +
92283 +          Do this before acquiring jnode spin lock to
92284 +          minimize time spent under lock. --nikita */
92285 +       if (atom_alloc != NULL) {
92286 +               kmem_cache_free(_atom_slab, atom_alloc);
92287 +       }
92288 +
92289 +       if (ret != 0) {
92290 +               if (ret == -E_BLOCK) {
92291 +                       assert("nikita-3360", cap_mode & TXN_CAPTURE_NONBLOCKING);
92292 +                       ret = -E_REPEAT; /* callers see a would-block as -E_REPEAT */
92293 +               }
92294 +
92295 +               /* Failure means jnode is not locked.  FIXME_LATER_JMACD May
92296 +                  want to fix the above code to avoid releasing the lock and
92297 +                  re-acquiring it, but there are cases where failure occurs
92298 +                  when the lock is not held, and those cases would need to be
92299 +                  modified to re-take the lock. */
92300 +               LOCK_JNODE(node);
92301 +       }
92302 +
92303 +       /* Jnode is still locked. */
92304 +       assert("jmacd-760", spin_jnode_is_locked(node));
92305 +       return ret;
92306 +}
92307 +
92308 +/* NOTE(review): this paragraph appears to describe try_capture() above, not the
92309 +   function below.  It sets up a call to try_capture_block and repeats as long as
92310 +   -E_REPEAT is returned by that routine.  The txn_capture request mode is computed there
92311 +   depending on the transaction handle's type and the lock request.  It is called from the
92312 +   depths of the lock manager with the jnode lock held and always returns with it held.
92313 +*/
92314 +
92315 +/* Fuse all 'active' atoms of lock owners of @node into @txnh's atom.  Returns 0 with txnh and znode spin locks still held, or -E_REPEAT with both released. */
92316 +static int
92317 +fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
92318 +{
92319 +       lock_handle *lh;
92320 +       int repeat = 0;
92321 +       txn_atom *atomh = txnh->atom;
92322 +
92323 +/*     assert ("zam-689", znode_is_rlocked (node));*/
92324 +       assert("zam-690", spin_znode_is_locked(node));
92325 +       assert("zam-691", spin_txnh_is_locked(txnh));
92326 +       assert("zam-692", atomh != NULL);
92327 +
92328 +       RLOCK_ZLOCK(&node->lock);
92329 +
92330 +       if (!spin_trylock_atom(atomh)) {
92331 +               repeat = 1;
92332 +               goto fail;
92333 +       }
92334 +
92335 +       /* inspect list of lock owners */
92336 +       for_all_type_safe_list(owners, &node->lock.owners, lh) {
92337 +               reiser4_context *ctx;
92338 +               txn_atom *atomf;
92339 +
92340 +               ctx = get_context_by_lock_stack(lh->owner);
92341 +
92342 +               if (ctx == get_current_context())
92343 +                       continue; /* skip ourselves */
92344 +
92345 +               if (!spin_trylock_txnh(ctx->trans)) {
92346 +                       repeat = 1;
92347 +                       continue;
92348 +               }
92349 +
92350 +               atomf = ctx->trans->atom;
92351 +
92352 +               if (atomf == NULL) {
92353 +                       capture_assign_txnh_nolock(atomh, ctx->trans); /* owner has no atom yet: attach its txnh to ours */
92354 +                       UNLOCK_TXNH(ctx->trans);
92355 +
92356 +                       reiser4_wake_up(lh->owner);
92357 +                       continue;
92358 +               }
92359 +
92360 +               if (atomf == atomh) {
92361 +                       UNLOCK_TXNH(ctx->trans);
92362 +                       continue;
92363 +               }
92364 +
92365 +               if (!spin_trylock_atom(atomf)) {
92366 +                       UNLOCK_TXNH(ctx->trans);
92367 +                       repeat = 1;
92368 +                       continue;
92369 +               }
92370 +
92371 +               UNLOCK_TXNH(ctx->trans);
92372 +
92373 +               if (atomf == atomh || atomf->stage > ASTAGE_CAPTURE_WAIT) {
92374 +                       UNLOCK_ATOM(atomf);
92375 +                       continue;
92376 +               }
92377 +               // repeat = 1;
92378 +
92379 +               reiser4_wake_up(lh->owner);
92380 +
92381 +               UNLOCK_TXNH(txnh);
92382 +               RUNLOCK_ZLOCK(&node->lock);
92383 +               spin_unlock_znode(node);
92384 +
92385 +               /* @atomf is "small" and @atomh is "large", by
92386 +                  definition. Small atom is destroyed and large is unlocked
92387 +                  inside capture_fuse_into()
92388 +               */
92389 +               capture_fuse_into(atomf, atomh);
92390 +
92391 +               reiser4_stat_inc(txnmgr.restart.fuse_lock_owners_fused);
92392 +               return RETERR(-E_REPEAT);
92393 +       }
92394 +
92395 +       UNLOCK_ATOM(atomh);
92396 +
92397 +       if (repeat) {
92398 +fail:
92399 +               UNLOCK_TXNH(txnh);
92400 +               RUNLOCK_ZLOCK(&node->lock);
92401 +               spin_unlock_znode(node);
92402 +               reiser4_stat_inc(txnmgr.restart.fuse_lock_owners);
92403 +               return RETERR(-E_REPEAT);
92404 +       }
92405 +
92406 +       RUNLOCK_ZLOCK(&node->lock);
92407 +       return 0;
92408 +}
92409 +
92410 +/* This is the interface to capture unformatted nodes via their struct page
92411 +   reference. Currently it is only used in reiser4_invalidatepage */
92412 +reiser4_internal int
92413 +try_capture_page_to_invalidate(struct page *pg)
92414 +{
92415 +       int ret;
92416 +       jnode *node;
92417 +
92418 +       assert("umka-292", pg != NULL);
92419 +       assert("nikita-2597", PageLocked(pg));
92420 +
92421 +       if (IS_ERR(node = jnode_of_page(pg))) {
92422 +               return PTR_ERR(node);
92423 +       }
92424 +
92425 +       LOCK_JNODE(node);
92426 +       unlock_page(pg); /* page lock is dropped around try_capture() and retaken before return */
92427 +
92428 +       ret = try_capture(node, ZNODE_WRITE_LOCK, 0, 0/* no copy on capture */);
92429 +       UNLOCK_JNODE(node);
92430 +       jput(node); /* release the jnode obtained from jnode_of_page() */
92431 +       lock_page(pg);
92432 +       return ret;
92433 +}
92434 +
92435 +/* This informs the transaction manager when a node is deleted.  Add the block to the
92436 +   atom's delete set and uncapture the block.
92437 +
92438 +VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
92439 +explanations.  find all the functions that use it, and unless there is some very
92440 +good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
92441 +move the loop to inside the function.
92442 +
92443 +VS-FIXME-HANS: can this code be at all streamlined?  In particular, can you lock and unlock the jnode fewer times?
92444 +   Preconditions (asserted): @pg is not NULL and is locked. The page lock may be dropped and re-taken while waiting for running flush queues; the page is locked again whenever this function returns. */
92445 +reiser4_internal void
92446 +uncapture_page(struct page *pg)
92447 +{
92448 +       jnode *node;
92449 +       txn_atom *atom;
92450 +
92451 +       assert("umka-199", pg != NULL);
92452 +       assert("nikita-3155", PageLocked(pg));
92453 +
92454 +       reiser4_clear_page_dirty(pg);
92455 +
92456 +       reiser4_wait_page_writeback(pg);
92457 +
92458 +       node = (jnode *) (pg->private);
92459 +       if (node == NULL) /* no jnode attached to the page: nothing to uncapture */
92460 +               return;
92461 +
92462 +       LOCK_JNODE(node);
92463 +
92464 +       eflush_del(node, 1/* page is locked */);
92465 +       /*assert ("zam-815", !JF_ISSET(node, JNODE_EFLUSH));*/
92466 +
92467 +       atom = jnode_get_atom(node);
92468 +       if (atom == NULL) { /* node is not captured by any atom */
92469 +               assert("jmacd-7111", !jnode_is_dirty(node));
92470 +               UNLOCK_JNODE (node);
92471 +               return;
92472 +       }
92473 +
92474 +       /* We can remove jnode from transaction even if it is on flush queue
92475 +        * prepped list, we only need to be sure that flush queue is not being
92476 +        * written by write_fq().  write_fq() does not use atom spin lock for
92477 +        * protection of the prepped nodes list, instead write_fq() increments
92478 +        * atom's nr_running_queues counters for the time when prepped list is
92479 +        * not protected by spin lock.  Here we check this counter if we want
92480 +        * to remove jnode from flush queue and, if the counter is not zero,
92481 +        * wait for all write_fq() calls for this atom to complete. This is
92482 +        * not significant overhead. */
92483 +       while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
92484 +               UNLOCK_JNODE(node);
92485 +               /*
92486 +                * at this moment we want to wait for "atom event", viz. wait
92487 +                * until @node can be removed from flush queue. But
92488 +                * atom_wait_event() cannot be called with page locked, because
92489 +                * it deadlocks with jnode_extent_write(). Unlock page, after
92490 +                * making sure (through page_cache_get()) that it cannot be
92491 +                * released from memory.
92492 +                */
92493 +               page_cache_get(pg);
92494 +               unlock_page(pg);
92495 +               atom_wait_event(atom);
92496 +               lock_page(pg);
92497 +               /*
92498 +                * page may have been detached by ->writepage()->releasepage().
92499 +                */
92500 +               reiser4_wait_page_writeback(pg);
92501 +               LOCK_JNODE(node);
92502 +               eflush_del(node, 1);
92503 +               page_cache_release(pg);
92504 +               atom = jnode_get_atom(node);
92505 +/* VS-FIXME-HANS: improve the commenting in this function */
92506 +               if (atom == NULL) { /* node left its atom while we waited: nothing more to do */
92507 +                       UNLOCK_JNODE(node);
92508 +                       return;
92509 +               }
92510 +       }
92511 +       uncapture_block(node);
92512 +       UNLOCK_ATOM(atom);
92513 +       jput(node);
92514 +}
92515 +
92516 +/* this is used in extent's kill hook to uncapture and unhash jnodes attached to inode's tree of jnodes. Preconditions (asserted): @node is spin locked and has no page attached. If the node has no atom it is only unlocked; otherwise it is passed to uncapture_block(), the atom is unlocked and jput() is called on the node. */
92517 +reiser4_internal void
92518 +uncapture_jnode(jnode *node)
92519 +{
92520 +       txn_atom *atom;
92521 +
92522 +       assert("vs-1462", spin_jnode_is_locked(node));
92523 +       assert("", node->pg == 0);
92524 +
92525 +       eflush_del(node, 0);
92526 +       /*jnode_make_clean(node);*/
92527 +       atom = jnode_get_atom(node);
92528 +       if (atom == NULL) { /* not captured: a node without an atom must not be dirty */
92529 +               assert("jmacd-7111", !jnode_is_dirty(node));
92530 +               UNLOCK_JNODE (node);
92531 +               return;
92532 +       }
92533 +
92534 +       uncapture_block(node);
92535 +       UNLOCK_ATOM(atom);
92536 +       jput(node);
92537 +}
92538 +
92539 +/* No-locking version of assign_txnh.  Sets the transaction handle's atom pointer,
92540 +   increases atom refcount and txnh_count, adds to txnh_list.  Preconditions (asserted): both @txnh and @atom are spin locked, @txnh is not yet assigned to any atom, and @atom is open. */
92541 +static void
92542 +capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh)
92543 +{
92544 +       assert("umka-200", atom != NULL);
92545 +       assert("umka-201", txnh != NULL);
92546 +
92547 +       assert("jmacd-822", spin_txnh_is_locked(txnh));
92548 +       assert("jmacd-823", spin_atom_is_locked(atom));
92549 +       assert("jmacd-824", txnh->atom == NULL);
92550 +       assert("nikita-3540", atom_isopen(atom));
92551 +
92552 +       atomic_inc(&atom->refcount);
92553 +
92554 +       ON_TRACE(TRACE_TXN, "assign txnh atom %u refcount %d\n", atom->atom_id, atomic_read(&atom->refcount));
92555 +
92556 +       txnh->atom = atom;
92557 +       txnh_list_push_back(&atom->txnh_list, txnh);
92558 +       atom->txnh_count += 1;
92559 +}
92560 +
92561 +/* No-locking version of assign_block.  Sets the block's atom pointer, references the
92562 +   block, adds it to the atom's clean list, increments capture_count.  Preconditions (asserted): @node and @atom are spin locked, @node has no atom, is not on a capture list and is not dirty. */
92563 +static void
92564 +capture_assign_block_nolock(txn_atom * atom, jnode * node)
92565 +{
92566 +       assert("umka-202", atom != NULL);
92567 +       assert("umka-203", node != NULL);
92568 +       assert("jmacd-321", spin_jnode_is_locked(node));
92569 +       assert("umka-295", spin_atom_is_locked(atom));
92570 +       assert("jmacd-323", node->atom == NULL);
92571 +       BUG_ON(!capture_list_is_clean(node));
92572 +       assert("nikita-3470", !jnode_is_dirty(node));
92573 +
92574 +       /* Pointer from jnode to atom is not counted in atom->refcount. */
92575 +       node->atom = atom;
92576 +
92577 +       capture_list_push_back(ATOM_CLEAN_LIST(atom), node);
92578 +       atom->capture_count += 1;
92579 +       /* reference to jnode is acquired by atom. */
92580 +       jref(node);
92581 +
92582 +       ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
92583 +
92584 +       LOCK_CNT_INC(t_refs);
92585 +
92586 +       ON_TRACE(TRACE_TXN, "capture %p for atom %u (captured %u)\n", node, atom->atom_id, atom->capture_count);
92587 +}
92588 +/* Set JNODE_CCED on @node; it is a BUG if the bit is already set.  Compiled only when REISER4_COPY_ON_CAPTURE is non-zero. */
92589 +#if REISER4_COPY_ON_CAPTURE
92590 +static void
92591 +set_cced_bit(jnode *node)
92592 +{
92593 +       BUG_ON(JF_ISSET(node, JNODE_CCED));
92594 +       JF_SET(node, JNODE_CCED);
92595 +}
92596 +#endif
92597 +/* Clear JNODE_CCED on @node (no check whether the bit was set). */
92598 +static void
92599 +clear_cced_bits(jnode *node)
92600 +{
92601 +       JF_CLR(node, JNODE_CCED);
92602 +}
92603 +/* Return the result of testing JNODE_CCED on @node (non-zero when the bit is set). */
92604 +int
92605 +is_cced(const jnode *node)
92606 +{
92607 +       return JF_ISSET(node, JNODE_CCED);
92608 +}
92609 +
92610 +/* common code for dirtying both unformatted jnodes and formatted znodes.  Preconditions (asserted): @node and @atom are spin locked and @node is not dirty yet. */
92611 +static void
92612 +do_jnode_make_dirty(jnode * node, txn_atom * atom)
92613 +{
92614 +       assert("zam-748", spin_jnode_is_locked(node));
92615 +       assert("zam-750", spin_atom_is_locked(atom));
92616 +       assert("jmacd-3981", !jnode_is_dirty(node));
92617 +
92618 +       JF_SET(node, JNODE_DIRTY);
92619 +
92620 +       get_current_context()->nr_marked_dirty ++;
92621 +
92622 +       /* We grab2flush_reserve one additional block only if node was
92623 +          not CREATED and jnode_flush did not sort it into either the
92624 +          relocate set or the overwrite set. If node is in overwrite or
92625 +          relocate set we assume that atom's flush reserved counter was
92626 +          already adjusted. */
92627 +       if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
92628 +           && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
92629 +           && !jnode_is_cluster_page(node)) {
92630 +               assert("vs-1093", !blocknr_is_fake(&node->blocknr));
92631 +               assert("vs-1506", *jnode_get_block(node) != 0);
92632 +               grabbed2flush_reserved_nolock(atom, (__u64)1);
92633 +               JF_SET(node, JNODE_FLUSH_RESERVED);
92634 +       }
92635 +
92636 +       if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
92637 +               /* If the atom is not set yet, it will be added to the appropriate list in
92638 +                  capture_assign_block_nolock. */
92639 +               /* Sometimes a node is set dirty before being captured -- the case for new
92640 +                  jnodes.  In that case the jnode will be added to the appropriate list
92641 +                  in capture_assign_block_nolock. Another reason not to re-link jnode is
92642 +                  that jnode is on a flush queue (see flush.c for details) */
92643 +
92644 +               int level = jnode_get_level(node);
92645 +
92646 +               assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
92647 +               assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
92648 +               assert("nikita-2607", 0 <= level);
92649 +               assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
92650 +
92651 +               capture_list_remove(node); /* move the node from its current list to the per-level dirty list */
92652 +               capture_list_push_back(ATOM_DIRTY_LIST(atom, level), node);
92653 +               ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), DIRTY_LIST, 1));
92654 +
92655 +               /*
92656 +                * JNODE_CCED bit protects clean copy (page created by
92657 +                * copy-on-capture) from being evicted from the memory. This
92658 +                * is necessary, because otherwise jload() would load obsolete
92659 +                * disk block (up-to-date original is still in memory). But
92660 +                * once jnode is dirtied, it cannot be released without
92661 +                * storing its content on the disk, so protection is no longer
92662 +                * necessary.
92663 +                */
92664 +               clear_cced_bits(node);
92665 +       }
92666 +}
92667 +
92668 +/* Set the dirty status for this (spin locked) jnode.  Does nothing if the node is already dirty; otherwise the node's atom is obtained (asserted non-NULL), the node is dirtied through do_jnode_make_dirty() and the atom is unlocked.  The jnode stays locked on return. */
92669 +reiser4_internal void
92670 +jnode_make_dirty_locked(jnode * node)
92671 +{
92672 +       assert("umka-204", node != NULL);
92673 +       assert("zam-7481", spin_jnode_is_locked(node));
92674 +
92675 +       if (REISER4_DEBUG && rofs_jnode(node)) { /* debug builds only: warn, but still proceed */
92676 +               warning("nikita-3365", "Dirtying jnode on rofs");
92677 +               dump_stack();
92678 +       }
92679 +
92680 +       /* Fast check for already dirty node */
92681 +       if (!jnode_is_dirty(node)) {
92682 +               txn_atom * atom;
92683 +
92684 +               atom = jnode_get_atom (node);
92685 +               assert("vs-1094", atom);
92686 +               /* Check jnode dirty status again because node spin lock might
92687 +                * be released inside jnode_get_atom(). */
92688 +               if (likely(!jnode_is_dirty(node)))
92689 +                       do_jnode_make_dirty(node, atom);
92690 +               UNLOCK_ATOM (atom);
92691 +       }
92692 +}
92693 +
92694 +/* Set the dirty status for this znode.  Preconditions (asserted): @z is write locked, is either above root or loaded, and does not have JNODE_EFLUSH set.  If the znode has a page, the page is also marked dirty and z->version is refreshed. */
92695 +reiser4_internal void
92696 +znode_make_dirty(znode * z)
92697 +{
92698 +       jnode *node;
92699 +       struct page *page;
92700 +
92701 +       assert("umka-204", z != NULL);
92702 +       assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
92703 +       assert("nikita-3291", !ZF_ISSET(z, JNODE_EFLUSH));
92704 +       assert("nikita-3560", znode_is_write_locked(z));
92705 +
92706 +       node = ZJNODE(z);
92707 +
92708 +       LOCK_JNODE(node);
92709 +       jnode_make_dirty_locked(node);
92710 +       page = jnode_page(node);
92711 +       if (page != NULL) {
92712 +               /* this is useful assertion (allows one to check that no
92713 +                * modifications are lost due to update of in-flight page),
92714 +                * but it requires locking on page to check PG_writeback
92715 +                * bit. */
92716 +               /* assert("nikita-3292",
92717 +                      !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
92718 +               page_cache_get(page); /* hold a page reference across the unlocked section below */
92719 +               ON_DEBUG_MODIFY(znode_set_checksum(ZJNODE(z), 1));
92720 +               /* jnode lock is not needed for the rest of
92721 +                * znode_make_dirty(). */
92722 +               UNLOCK_JNODE(node);
92723 +               /* reiser4 file write code calls set_page_dirty for
92724 +                * unformatted nodes, for formatted nodes we do it here. */
92725 +               set_page_dirty_internal(page, 0);
92726 +               page_cache_release(page);
92727 +               /* bump version counter in znode */
92728 +               z->version = znode_build_version(jnode_get_tree(node));
92729 +       } else {
92730 +               /* only the znode above root is expected to have no page */
92731 +               assert("zam-596", znode_above_root(JZNODE(node)));
92732 +               UNLOCK_JNODE(node);
92733 +       }
92733 +
92734 +       assert("nikita-1900", znode_is_write_locked(z));
92735 +       assert("jmacd-9777", node->atom != NULL);
92736 +}
92737 +/* Try to get @atom committed.  A NULL @atom yields 0.  Otherwise @atom is expected to be spin locked by the caller (capture_assign_txnh_nolock() asserts it, the last branch unlocks it).  Before ASTAGE_PRE_COMMIT: attach the current context's transaction handle and return force_commit_atom_nolock()'s result.  From ASTAGE_PRE_COMMIT up to (not including) ASTAGE_POST_COMMIT: wait for an atom event and return -E_REPEAT so the caller retries.  Otherwise: unlock the atom and return 0. */
92738 +reiser4_internal int
92739 +sync_atom(txn_atom *atom)
92740 +{
92741 +       int result;
92742 +       txn_handle *txnh;
92743 +
92744 +       txnh = get_current_context()->trans;
92745 +
92746 +       result = 0;
92747 +       if (atom != NULL) {
92748 +               if (atom->stage < ASTAGE_PRE_COMMIT) {
92749 +                       LOCK_TXNH(txnh);
92750 +                       capture_assign_txnh_nolock(atom, txnh);
92751 +                       result = force_commit_atom_nolock(txnh);
92752 +               } else if (atom->stage < ASTAGE_POST_COMMIT) {
92753 +                       /* wait atom commit */
92754 +                       atom_wait_event(atom);
92755 +                       /* try once more */
92756 +                       result = RETERR(-E_REPEAT);
92757 +               } else
92758 +                       UNLOCK_ATOM(atom);
92759 +       }
92760 +       return result;
92761 +}
92762 +
92763 +#if REISER4_DEBUG
92764 +
92765 +void check_fq(const txn_atom *atom);
92766 +
92767 +/* move jnode from one list to another (debug-only bookkeeping of the atom's per-list counters);
92768 +   call this after atom->capture_count is updated.  @old_list must equal NODE_LIST(@node) (asserted); on return the node's list tag is @new_list. */
92769 +void
92770 +count_jnode(txn_atom *atom, jnode *node, atom_list old_list, atom_list new_list, int check_lists)
92771 +{
92772 +#if REISER4_COPY_ON_CAPTURE
92773 +       assert("", spin_atom_is_locked(atom));
92774 +#else
92775 +       assert("zam-1018", atom_is_protected(atom));
92776 +#endif
92777 +       assert("", spin_jnode_is_locked(node));
92778 +       assert("", NODE_LIST(node) == old_list);
92779 +
92780 +       switch(NODE_LIST(node)) { /* decrement the counter of the list being left */
92781 +       case NOT_CAPTURED:
92782 +               break;
92783 +       case DIRTY_LIST:
92784 +               assert("", atom->dirty > 0);
92785 +               atom->dirty --;
92786 +               break;
92787 +       case CLEAN_LIST:
92788 +               assert("", atom->clean > 0);
92789 +               atom->clean --;
92790 +               break;
92791 +       case FQ_LIST:
92792 +               assert("", atom->fq > 0);
92793 +               atom->fq --;
92794 +               break;
92795 +       case WB_LIST:
92796 +               assert("", atom->wb > 0);
92797 +               atom->wb --;
92798 +               break;
92799 +       case OVRWR_LIST:
92800 +               assert("", atom->ovrwr > 0);
92801 +               atom->ovrwr --;
92802 +               break;
92803 +       case PROTECT_LIST:
92804 +               /* protect list is an intermediate atom's list to which jnodes
92805 +                  get put from dirty list before disk space is allocated for
92806 +                  them. From this list jnodes can either go to flush queue list
92807 +                  or back to dirty list */
92808 +               assert("", atom->protect > 0);
92809 +               assert("", new_list == FQ_LIST || new_list == DIRTY_LIST);
92810 +               atom->protect --;
92811 +               break;
92812 +       default:
92813 +               impossible("", "");
92814 +       }
92815 +
92816 +       switch(new_list) { /* increment the counter of the list being entered */
92817 +       case NOT_CAPTURED:
92818 +               break;
92819 +       case DIRTY_LIST:
92820 +               atom->dirty ++;
92821 +               break;
92822 +       case CLEAN_LIST:
92823 +               atom->clean ++;
92824 +               break;
92825 +       case FQ_LIST:
92826 +               atom->fq ++;
92827 +               break;
92828 +       case WB_LIST:
92829 +               atom->wb ++;
92830 +               break;
92831 +       case OVRWR_LIST:
92832 +               atom->ovrwr ++;
92833 +               break;
92834 +       case PROTECT_LIST:
92835 +               assert("", old_list == DIRTY_LIST);
92836 +               atom->protect ++;
92837 +               break;
92838 +       default:
92839 +               impossible("", "");
92840 +       }
92841 +       ASSIGN_NODE_LIST(node, new_list);
92842 +       if (0 && check_lists) { /* NOTE: full list walk is disabled by the constant 0, so @check_lists currently has no effect */
92843 +               int count;
92844 +               tree_level level;
92845 +               jnode *node;
92846 +
92847 +               count = 0;
92848 +
92849 +               /* flush queue list */
92850 +               /*check_fq(atom);*/
92851 +
92852 +               /* dirty list */
92853 +               count = 0;
92854 +               for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
92855 +                       for_all_type_safe_list(capture, ATOM_DIRTY_LIST(atom, level), node)
92856 +                               count ++;
92857 +               }
92858 +               if (count != atom->dirty)
92859 +                       warning("", "dirty counter %d, real %d\n", atom->dirty, count);
92860 +
92861 +               /* clean list */
92862 +               count = 0;
92863 +               for_all_type_safe_list(capture, ATOM_CLEAN_LIST(atom), node)
92864 +                       count ++;
92865 +               if (count != atom->clean)
92866 +                       warning("", "clean counter %d, real %d\n", atom->clean, count);
92867 +
92868 +               /* wb list */
92869 +               count = 0;
92870 +               for_all_type_safe_list(capture, ATOM_WB_LIST(atom), node)
92871 +                       count ++;
92872 +               if (count != atom->wb)
92873 +                       warning("", "wb counter %d, real %d\n", atom->wb, count);
92874 +
92875 +               /* overwrite list */
92876 +               count = 0;
92877 +               for_all_type_safe_list(capture, ATOM_OVRWR_LIST(atom), node)
92878 +                       count ++;
92879 +
92880 +               if (count != atom->ovrwr)
92881 +                       warning("", "ovrwr counter %d, real %d\n", atom->ovrwr, count);
92882 +       }
92883 +       assert("vs-1624", atom->num_queued == atom->fq);
92884 +       if (atom->capture_count != atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq + atom->protect) { /* print the counters before the assertion below fires */
92885 +               printk("count %d, dirty %d clean %d ovrwr %d wb %d fq %d protect %d\n", atom->capture_count, atom->dirty, atom->clean, atom->ovrwr, atom->wb, atom->fq, atom->protect);
92886 +               assert("vs-1622",
92887 +                      atom->capture_count == atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq + atom->protect);
92888 +       }
92889 +}
92890 +
92891 +#endif
92892 +
92893 +/* Make node OVRWR and put it on atom->overwrite_nodes list, atom lock and jnode
92894 + * lock should be taken before calling this function.  Asserted: @node is dirty, has a real (non-fake) block number, an atom, and neither JNODE_RELOC nor JNODE_FLUSH_QUEUED set. */
92895 +reiser4_internal void jnode_make_wander_nolock (jnode * node)
92896 +{
92897 +       txn_atom * atom;
92898 +
92899 +       assert("nikita-2431", node != NULL);
92900 +       assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
92901 +       assert("nikita-3153", jnode_is_dirty(node));
92902 +       assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
92903 +       assert("nikita-3367", !blocknr_is_fake(jnode_get_block(node)));
92904 +
92905 +       atom = node->atom;
92906 +
92907 +       assert("zam-895", atom != NULL);
92908 +       assert("zam-894", atom_is_protected(atom));
92909 +
92910 +       JF_SET(node, JNODE_OVRWR);
92911 +       capture_list_remove_clean(node);
92912 +       capture_list_push_back(ATOM_OVRWR_LIST(atom), node);
92913 +       /*XXXX*/ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
92914 +}
92915 +
92916 +/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside
92917 + * this function: the jnode is locked, its atom is obtained via jnode_get_atom() (asserted non-NULL), and both are unlocked before returning. */
92918 +reiser4_internal void jnode_make_wander (jnode * node)
92919 +{
92920 +       txn_atom * atom;
92921 +
92922 +       LOCK_JNODE(node);
92923 +       atom = jnode_get_atom(node);
92924 +       assert ("zam-913", atom != NULL);
92925 +       assert ("zam-914", !JF_ISSET(node, JNODE_RELOC));
92926 +
92927 +       jnode_make_wander_nolock(node);
92928 +       UNLOCK_ATOM(atom);
92929 +       UNLOCK_JNODE(node);
92930 +}
92931 +
92932 +/* this just sets RELOC bit (via jnode_set_reloc()); @fq is not used in the body.  Asserted: @node is spin locked, dirty, has a non-fake block number, and has none of JNODE_RELOC, JNODE_OVRWR, JNODE_FLUSH_QUEUED set. */
92933 +static void
92934 +jnode_make_reloc_nolock(flush_queue_t *fq, jnode *node)
92935 +{
92936 +       assert("vs-1480", spin_jnode_is_locked(node));
92937 +       assert ("zam-916", jnode_is_dirty(node));
92938 +       assert ("zam-917", !JF_ISSET(node, JNODE_RELOC));
92939 +       assert ("zam-918", !JF_ISSET(node, JNODE_OVRWR));
92940 +       assert ("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
92941 +       assert ("nikita-3367", !blocknr_is_fake(jnode_get_block(node)));
92942 +
92943 +       jnode_set_reloc(node);
92944 +}
92945 +
92946 +/* Make znode RELOC and put it on flush queue @fq.  Takes the jnode lock and the node's atom (asserted non-NULL) itself and releases both before returning. */
92947 +reiser4_internal void znode_make_reloc (znode *z, flush_queue_t * fq)
92948 +{
92949 +       jnode *node;
92950 +       txn_atom * atom;
92951 +
92952 +       node = ZJNODE(z);
92953 +       LOCK_JNODE(node);
92954 +
92955 +       atom = jnode_get_atom(node);
92956 +       assert ("zam-919", atom != NULL);
92957 +
92958 +       jnode_make_reloc_nolock(fq, node);
92959 +       queue_jnode(fq, node);
92960 +
92961 +       UNLOCK_ATOM(atom);
92962 +       UNLOCK_JNODE(node);
92963 +
92964 +}
92965 +
92966 +/* Make unformatted node RELOC and mark it queued on @fq (mark_jnode_queued()).  No locks are taken here; jnode_make_reloc_nolock() asserts that @node is already spin locked. */
92967 +reiser4_internal void
92968 +unformatted_make_reloc(jnode *node, flush_queue_t * fq)
92969 +{
92970 +       assert("vs-1479", jnode_is_unformatted(node));
92971 +
92972 +       jnode_make_reloc_nolock(fq, node);
92973 +       mark_jnode_queued(fq, node);
92974 +}
92975 +/* Acquire the spin lock of @atom.  Returns 0 if the trylock succeeded (jnode and txnh locks untouched).  Returns 1 if the lock was busy: in that case atom->refcount was incremented, @node and @txnh were unlocked, and the atom lock was then taken by blocking on it. */
92976 +static int
92977 +trylock_wait(txn_atom *atom, txn_handle * txnh, jnode * node)
92978 +{
92979 +       if (unlikely(!spin_trylock_atom(atom))) {
92980 +               atomic_inc(&atom->refcount);
92981 +
92982 +               UNLOCK_JNODE(node);
92983 +               UNLOCK_TXNH(txnh);
92984 +
92985 +               LOCK_ATOM(atom);
92986 +               /* caller should eliminate extra reference by calling
92987 +                * atom_dec_and_unlock() for this atom. */
92988 +               return 1;
92989 +       } else
92990 +               return 0;
92991 +}
92992 +
92993 +/*
92994 + * in transaction manager jnode spin lock and transaction handle spin lock
92995 + * nest within atom spin lock. During capturing we are in a situation when
92996 + * jnode and transaction handle spin locks are held and we want to manipulate
92997 + * atom's data (capture lists, and txnh list) to add node and/or handle to the
92998 + * atom. Releasing jnode (or txnh) spin lock at this point is unsafe, because
92999 + * concurrent fusion can render assumption made by capture so far (about
93000 + * ->atom pointers in jnode and txnh) invalid. Initial code used try-lock and
93001 + * if atom was busy returned -E_REPEAT to the top level. This can lead to the
93002 + * busy loop if atom is locked for long enough time. Function below tries to
93003 + * throttle this loop.
93004 + *
93005 + * Returns 0 with the atom locked (jnode and txnh still locked) when the trylock succeeds; otherwise waits for the atom lock via trylock_wait(), drops it again with atom_dec_and_unlock() and returns -E_REPEAT (jnode and txnh are then unlocked). */
93006 +/* ZAM-FIXME-HANS: how feasible would it be to use our hi-lo priority locking
93007 +   mechanisms/code for this as well? Does that make any sense? */
93008 +/* ANSWER(Zam): I am not sure that I understand your proposal right, but the idea
93009 +   might be in inventing spin_lock_lopri() which should be a complex loop with
93010 +   "release lock" messages check like we have in the znode locking.  I think we
93011 +   should not substitute spin locks by more complex busy loops.  Once it was
93012 +   done that way in try_capture_block() where spin lock waiting was spread in a
93013 +   busy loop  through several functions.  The proper solution should be in
93014 +   making spin lock contention rare. */
93015 +static int
93016 +trylock_throttle(txn_atom *atom, txn_handle * txnh, jnode * node)
93017 +{
93018 +       assert("nikita-3224", atom != NULL);
93019 +       assert("nikita-3225", txnh != NULL);
93020 +       assert("nikita-3226", node != NULL);
93021 +
93022 +       assert("nikita-3227", spin_txnh_is_locked(txnh));
93023 +       assert("nikita-3229", spin_jnode_is_locked(node));
93024 +
93025 +       if (unlikely(trylock_wait(atom, txnh, node) != 0)) {
93026 +               atom_dec_and_unlock(atom);
93027 +               reiser4_stat_inc(txnmgr.restart.trylock_throttle);
93028 +               return RETERR(-E_REPEAT);
93029 +       } else
93030 +               return 0;
93031 +}
93032 +
93033 +/* This function assigns a block to an atom, but first it must obtain the atom lock.  If
93034 +   the atom lock is busy, it returns -E_REPEAT to avoid deadlock with a fusing atom.  Since
93035 +   the transaction handle is currently open, we know the atom must also be open.  The atom is taken from txnh->atom (asserted non-NULL).  Returns 0 on success, with the atom unlocked again. */
93036 +static int
93037 +capture_assign_block(txn_handle * txnh, jnode * node)
93038 +{
93039 +       txn_atom *atom;
93040 +       int       result;
93041 +
93042 +       assert("umka-206", txnh != NULL);
93043 +       assert("umka-207", node != NULL);
93044 +
93045 +       atom = txnh->atom;
93046 +
93047 +       assert("umka-297", atom != NULL);
93048 +
93049 +       result = trylock_throttle(atom, txnh, node);
93050 +       if (result != 0) {
93051 +               /* this avoids busy loop, but we return -E_REPEAT anyway to
93052 +                * simplify things. */
93053 +               reiser4_stat_inc(txnmgr.restart.assign_block);
93054 +               return result;
93055 +       } else {
93056 +               assert("jmacd-19", atom_isopen(atom));
93057 +
93058 +               /* Add page to capture list. */
93059 +               capture_assign_block_nolock(atom, node);
93060 +
93061 +               /* Success holds onto jnode & txnh locks.  Unlock atom. */
93062 +               UNLOCK_ATOM(atom);
93063 +               return 0;
93064 +       }
93065 +}
93066 +
93067 +/* This function assigns a handle to an atom, but first it must obtain the atom lock.  If
93068 +   the atom lock is busy, it waits for it and returns -E_REPEAT only if the node/txnh state changed meanwhile.  Unlike
93069 +   capture_assign_block, the atom may be closed but we cannot know this until the atom is
93070 +   locked.  If the atom is closed and the request is to read, it is as if the block is
93071 +   unmodified and the request is satisfied without actually assigning the transaction
93072 +   handle.  If the atom is closed and the handle requests to write the block, then
93073 +   initiate copy-on-capture.
93074 +*/
93075 +static int
93076 +capture_assign_txnh(jnode * node, txn_handle * txnh, txn_capture mode, int can_coc)
93077 +{
93078 +       txn_atom *atom;
93079 +
93080 +       assert("umka-208", node != NULL);
93081 +       assert("umka-209", txnh != NULL);
93082 +
93083 +       atom = node->atom;
93084 +
93085 +       assert("umka-298", atom != NULL);
93086 +
93087 +       /*
93088 +        * optimization: this code went through three evolution stages. Main
93089 +        * driving force of evolution here is lock ordering:
93090 +        *
93091 +        * at the entry to this function following pre-conditions are met:
93092 +        *
93093 +        *     1. txnh and node are both spin locked,
93094 +        *
93095 +        *     2. node belongs to atom, and
93096 +        *
93097 +        *     3. txnh does not.
93098 +        *
93099 +        * What we want to do here is to acquire spin lock on node's atom and
93100 +        * modify it somehow depending on its ->stage. In the simplest case,
93101 +        * where ->stage is ASTAGE_CAPTURE_FUSE, txnh should be added to
93102 +        * atom's list. Problem is that atom spin lock nests outside of jnode
93103 +        * and transaction handle ones. So, we cannot just LOCK_ATOM here.
93104 +        *
93105 +        * Solutions tried here:
93106 +        *
93107 +        *     1. spin_trylock(atom), return -E_REPEAT on failure.
93108 +        *
93109 +        *     2. spin_trylock(atom). On failure to acquire lock, increment
93110 +        *     atom->refcount, release all locks, and spin on atom lock. Then
93111 +        *     decrement ->refcount, unlock atom and return -E_REPEAT.
93112 +        *
93113 +        *     3. like previous one, but before unlocking atom, re-acquire
93114 +        *     spin locks on node and txnh and re-check whether function
93115 +        *     pre-condition are still met. Continue boldly if they are.
93116 +        *
93117 +        */
93118 +       if (trylock_wait(atom, txnh, node) != 0) {
93119 +               LOCK_JNODE(node);
93120 +               LOCK_TXNH(txnh);
93121 +               /* NOTE-NIKITA is it at all possible that current txnh
93122 +                * spontaneously changes ->atom from NULL to non-NULL? */
93123 +               if (node->atom == NULL ||
93124 +                   txnh->atom != NULL || atom != node->atom) {
93125 +                       /* something changed. Caller has to re-decide */
93126 +                       UNLOCK_TXNH(txnh);
93127 +                       UNLOCK_JNODE(node);
93128 +                       atom_dec_and_unlock(atom);
93129 +                       reiser4_stat_inc(txnmgr.restart.assign_txnh);
93130 +                       return RETERR(-E_REPEAT);
93131 +               } else {
93132 +                       /* atom still has a jnode on its list (node->atom ==
93133 +                        * atom), it means atom is not fused or finished
93134 +                        * (committed), we can safely decrement its refcount
93135 +                        * because it is not a last reference. */
93136 +                       atomic_dec(&atom->refcount);
93137 +                       assert("zam-990", atomic_read(&atom->refcount) > 0);
93138 +               }
93139 +       }
93140 +
93141 +       if (atom->stage == ASTAGE_CAPTURE_WAIT &&
93142 +           (atom->txnh_count != 0 ||
93143 +            atom_should_commit(atom) || atom_should_commit_asap(atom))) {
93144 +               /* We don't fuse with the atom in ASTAGE_CAPTURE_WAIT only if
93145 +                * there is open transaction handler.  It makes sense: those
93146 +                * atoms should not wait ktxnmgrd to flush and commit them.
93147 +                * And, it solves deadlocks with loop back devices (reiser4 over
93148 +                * loopback over reiser4), when ktxnmgrd is busy committing one
93149 +                * atom (above the loop back device) and can't flush an atom
93150 +                * below the loopback. */
93151 +
93152 +               /* The atom could be blocking requests--this is the first chance we've had
93153 +                  to test it.  Since this txnh is not yet assigned, the fuse_wait logic
93154 +                  is not to avoid deadlock, its just waiting.  Releases all three locks
93155 +                  and returns E_REPEAT. */
93156 +
93157 +               return capture_fuse_wait(node, txnh, atom, NULL, mode);
93158 +
93159 +       } else if (atom->stage > ASTAGE_CAPTURE_WAIT) {
93160 +
93161 +               /* The block is involved with a committing atom. */
93162 +               if (CAPTURE_TYPE(mode) == TXN_CAPTURE_READ_ATOMIC) {
93163 +
93164 +                       /* A read request for a committing block can be satisfied w/o
93165 +                          COPY-ON-CAPTURE. */
93166 +
93167 +                       /* Success holds onto the jnode & txnh lock.  Continue to unlock
93168 +                          atom below. */
93169 +
93170 +               } else {
93171 +
93172 +                       /* Perform COPY-ON-CAPTURE.  Copy and try again.  This function
93173 +                          releases all three locks. */
93174 +                       return capture_copy(node, txnh, atom, NULL, mode, can_coc);
93175 +               }
93176 +
93177 +       } else {
93178 +
93179 +               assert("jmacd-160", atom->stage == ASTAGE_CAPTURE_FUSE ||
93180 +                      (atom->stage == ASTAGE_CAPTURE_WAIT && atom->txnh_count == 0));
93181 +
93182 +               /* Add txnh to active list. */
93183 +               capture_assign_txnh_nolock(atom, txnh);
93184 +
93185 +               /* Success holds onto the jnode & txnh lock.  Continue to unlock atom
93186 +                  below. */
93187 +       }
93188 +
93189 +       /* Unlock the atom */
93190 +       UNLOCK_ATOM(atom);
93191 +       return 0;
93192 +}
93193 +/* Write-lock the uber znode, reserve one block for the super block and dirty the znode. */
93194 +reiser4_internal int
93195 +capture_super_block(struct super_block *s)
93196 +{
93197 +       int result;
93198 +       znode *uber;
93199 +       lock_handle lh;
93200 +
93201 +       init_lh(&lh);
93202 +       result = get_uber_znode(get_tree(s),
93203 +                               ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
93204 +       if (result)
93205 +               return result;
93206 +
93207 +       uber = lh.node;
93208 +       /* Grab one block for the super block; dirty the uber znode only on success. */
93209 +       result = reiser4_grab_space_force((__u64)1, BA_RESERVED);
93210 +       if (result == 0)
93211 +               znode_make_dirty(uber);
93212 +
93213 +       /* Release the lock handle on success and on failure alike; returning
93214 +        * early after a failed grab would leave the uber znode write-locked. */
93215 +       done_lh(&lh);
93216 +       return result;
93217 +}
93218 +
93219 +/* Wake the handles on @atom's WAITFOR list, skipping those whose callback says no */
93220 +static void
93221 +wakeup_atom_waitfor_list(txn_atom * atom)
93222 +{
93223 +       txn_wait_links *wlinks;
93224 +
93225 +       assert("umka-210", atom != NULL);
93226 +
93227 +       /* atom is locked; a waiter without a callback is always woken */
93228 +       for_all_type_safe_list(fwaitfor, &atom->fwaitfor_list, wlinks) {
93229 +               int unconditional = (wlinks->waitfor_cb == NULL);
93230 +
93231 +               if (unconditional || wlinks->waitfor_cb(atom, wlinks))
93232 +                       reiser4_wake_up(wlinks->_lock_stack);
93233 +       }
93234 +}
93235 +
93236 +/* Wake the handles on @atom's WAITING list, skipping those whose callback says no */
93237 +static void
93238 +wakeup_atom_waiting_list(txn_atom * atom)
93239 +{
93240 +       txn_wait_links *wlinks;
93241 +
93242 +       assert("umka-211", atom != NULL);
93243 +
93244 +       /* atom is locked; a waiter without a callback is always woken */
93245 +       for_all_type_safe_list(fwaiting, &atom->fwaiting_list, wlinks) {
93246 +               int unconditional = (wlinks->waiting_cb == NULL);
93247 +
93248 +               if (unconditional || wlinks->waiting_cb(atom, wlinks))
93249 +                       reiser4_wake_up(wlinks->_lock_stack);
93250 +       }
93251 +}
93252 +
93253 +/* waitfor callback installed by capture_fuse_wait() to avoid "spurious wake-ups" */
93254 +static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
93255 +{
93256 +       assert("nikita-3330", atom != NULL);
93257 +       assert("nikita-3331", spin_atom_is_locked(atom));
93258 +
93259 +       /* @wlinks is not used.  Non-zero means "wake the waiter": the atom left CAPTURE_WAIT, or: */
93260 +       /* atom->txnh_count == 1 is for waking waiters up if we are releasing
93261 +        * last transaction handle. */
93262 +       return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1;
93263 +}
93264 +
93265 +/* The general purpose of this function is to wait on the first of two possible events.
93266 +   The situation is that a handle (and its atom atomh) is blocked trying to capture a
93267 +   block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state.  The
93268 +   handle's atom (atomh) is not in the CAPTURE_WAIT state.  However, atomh could fuse with
93269 +   another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
93270 +   needs to unblock the handle to avoid deadlock.  When the txnh is unblocked it will
93271 +   proceed and fuse the two atoms in the CAPTURE_WAIT state.
93272 +
93273 +   In other words, if either atomh or atomf changes state, the handle will be awakened,
93274 +   thus there are two lists per atom: WAITING and WAITFOR.
93275 +
93276 +   This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to
93277 +   close when the handle is not assigned to an atom of its own.
93278 +
93279 +   Locking: entered with JNODE_LOCK, TXNH_LOCK and both atom locks held; all of them are
93280 +   released.  Returns -E_BLOCK for TXN_CAPTURE_NONBLOCKING requests, the non-zero result
93281 +   of prepare_to_sleep() if there is one, and -E_REPEAT after sleeping. */
93282 +static int
93283 +capture_fuse_wait(jnode * node, txn_handle * txnh, txn_atom * atomf, txn_atom * atomh, txn_capture mode)
93284 +{
93285 +       int ret;
93286 +
93287 +       /* Wait links; set up by init_wlinks() below once we know we will sleep. */
93288 +       txn_wait_links wlinks;
93289 +
93290 +       assert("umka-212", node != NULL);
93291 +       assert("umka-213", txnh != NULL);
93292 +       assert("umka-214", atomf != NULL);
93293 +
93294 +       /* We do not need the node lock. */
93295 +       UNLOCK_JNODE(node);
93296 +
93297 +       if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
93298 +               /* Trace while atomf is still locked: this path takes no
93299 +                * reference on it, so it must not be read after unlocking. */
93300 +               ON_TRACE(TRACE_TXN, "thread %u nonblocking on atom %u\n", current->pid, atomf->atom_id);
93301 +               UNLOCK_TXNH(txnh);
93302 +               UNLOCK_ATOM(atomf);
93303 +
93304 +               if (atomh) {
93305 +                       UNLOCK_ATOM(atomh);
93306 +               }
93307 +               reiser4_stat_inc(txnmgr.restart.fuse_wait_nonblock);
93308 +               return RETERR(-E_BLOCK);
93309 +       }
93310 +
93311 +       init_wlinks(&wlinks);
93312 +
93313 +       /* Add txnh to atomf's waitfor list, pin atomf with a reference, unlock atomf. */
93314 +       fwaitfor_list_push_back(&atomf->fwaitfor_list, &wlinks);
93315 +       wlinks.waitfor_cb = wait_for_fusion;
93316 +       atomic_inc(&atomf->refcount);
93317 +       UNLOCK_ATOM(atomf);
93318 +
93319 +       if (atomh) {
93320 +               /* Add txnh to atomh's waiting list, pin atomh, unlock atomh. */
93321 +               fwaiting_list_push_back(&atomh->fwaiting_list, &wlinks);
93322 +               atomic_inc(&atomh->refcount);
93323 +               UNLOCK_ATOM(atomh);
93324 +       }
93325 +
93326 +       ON_TRACE(TRACE_TXN, "thread %u waitfor %u waiting %u\n", current->pid,
93327 +                atomf->atom_id, atomh ? atomh->atom_id : 0);
93328 +
93329 +       /* Go to sleep. */
93330 +       UNLOCK_TXNH(txnh);
93331 +
93332 +       ret = prepare_to_sleep(wlinks._lock_stack);
93333 +       if (ret != 0) {
93334 +               ON_TRACE(TRACE_TXN, "thread %u deadlock blocking on atom %u\n", current->pid, atomf->atom_id);
93335 +       } else {
93336 +               go_to_sleep(wlinks._lock_stack, ADD_TO_SLEPT_IN_WAIT_ATOM);
93337 +
93338 +               reiser4_stat_inc(txnmgr.restart.fuse_wait_slept);
93339 +               ret = RETERR(-E_REPEAT);
93340 +               ON_TRACE(TRACE_TXN, "thread %u wakeup %u waiting %u\n",
93341 +                        current->pid, atomf->atom_id, atomh ? atomh->atom_id : 0);
93342 +       }
93343 +
93344 +       /* Remove from the waitfor list and drop the reference taken above. */
93345 +       LOCK_ATOM(atomf);
93346 +       fwaitfor_list_remove(&wlinks);
93347 +       atom_dec_and_unlock(atomf);
93348 +
93349 +       if (atomh) {
93350 +               /* Remove from the waiting list and drop the reference. */
93351 +               LOCK_ATOM(atomh);
93352 +               fwaiting_list_remove(&wlinks);
93353 +               atom_dec_and_unlock(atomh);
93354 +       }
93355 +
93356 +       assert("nikita-2186", ergo(ret, spin_jnode_is_not_locked(node)));
93357 +       return ret;
93358 +}
93359 +/* Fuse node->atom (atomf) and txnh->atom (atomh), or wait / copy-on-capture instead; called with jnode, txnh and both atoms locked. */
93360 +static inline int
93361 +capture_init_fusion_locked(jnode * node, txn_handle * txnh, txn_capture mode, int can_coc)
93362 +{
93363 +       txn_atom *atomf;
93364 +       txn_atom *atomh;
93365 +
93366 +       assert("umka-216", txnh != NULL);
93367 +       assert("umka-217", node != NULL);
93368 +
93369 +       atomh = txnh->atom;
93370 +       atomf = node->atom;
93371 +
93372 +       /* The txnh atom must still be open (since the txnh is active)...  the node atom may
93373 +          be in some later stage (checked next). */
93374 +       assert("jmacd-20", atom_isopen(atomh));
93375 +
93376 +       /* If the node atom is in the CAPTURE_WAIT stage then we should wait, except to
93377 +          avoid deadlock we still must fuse if the txnh atom is also in CAPTURE_WAIT. */
93378 +       if (atomf->stage == ASTAGE_CAPTURE_WAIT &&
93379 +           atomh->stage != ASTAGE_CAPTURE_WAIT &&
93380 +           (atomf->txnh_count != 0 ||
93381 +            atom_should_commit(atomf) || atom_should_commit_asap(atomf))) {
93382 +               /* see comment in capture_assign_txnh() about the
93383 +                * "atomf->txnh_count != 0" condition. */
93384 +               /* This unlocks all four locks and returns E_REPEAT. */
93385 +               return capture_fuse_wait(node, txnh, atomf, atomh, mode);
93386 +
93387 +       } else if (atomf->stage > ASTAGE_CAPTURE_WAIT) {
93388 +
93389 +               /* The block is involved with a committing atom. */
93390 +               if (CAPTURE_TYPE(mode) == TXN_CAPTURE_READ_ATOMIC) {
93391 +                       /* A read request for a committing block can be satisfied w/o
93392 +                          COPY-ON-CAPTURE.  Success (0) holds onto the jnode & txnh
93393 +                          locks; only the two atom locks are dropped. */
93394 +                       UNLOCK_ATOM(atomf);
93395 +                       UNLOCK_ATOM(atomh);
93396 +                       return 0;
93397 +               } else {
93398 +                       /* Perform COPY-ON-CAPTURE.  Copy and try again.  This function
93399 +                          releases all four locks. */
93400 +                       return capture_copy(node, txnh, atomf, atomh, mode, can_coc);
93401 +               }
93402 +       }
93403 +
93404 +       /* Because atomf's stage <= CAPTURE_WAIT */
93405 +       assert("jmacd-175", atom_isopen(atomf));
93406 +
93407 +       /* If we got here it is because atomh is in CAPTURE_WAIT, or because atomf is
93408 +          not in CAPTURE_WAIT, or because atomf has no transaction handles. */
93409 +       assert("jmacd-176", (atomh->stage == ASTAGE_CAPTURE_WAIT || atomf->stage != ASTAGE_CAPTURE_WAIT) || atomf->txnh_count == 0);
93410 +
93411 +       /* Now release the txnh and jnode locks: only holding the atoms at this point. */
93412 +       UNLOCK_TXNH(txnh);
93413 +       UNLOCK_JNODE(node);
93414 +
93415 +       /* Decide which should be kept and which should be merged: the atom with the smaller pointer count is fused into the other. */
93416 +       if (atom_pointer_count(atomf) < atom_pointer_count(atomh)) {
93417 +               capture_fuse_into(atomf, atomh);
93418 +       } else {
93419 +               capture_fuse_into(atomh, atomf);
93420 +       }
93421 +
93422 +       /* Atoms are unlocked in capture_fuse_into.  No locks held. */
93423 +       reiser4_stat_inc(txnmgr.restart.init_fusion_fused);
93424 +       return RETERR(-E_REPEAT);
93425 +}
93426 +
93427 +/* Entry point for fusing the atom of @node with the atom of @txnh.  Both atom
93428 + * locks are needed, and they are taken with trylock only.  When both are
93429 + * obtained the real work is delegated to capture_init_fusion_locked(): it puts
93430 + * the request to sleep if the node's atom is in the CAPTURE_WAIT stage while
93431 + * the handle's atom is not, copy-on-captures the node if its atom is
93432 + * committing, and otherwise fuses the atom with fewer pointers into the other
93433 + * one.  When a trylock fails, jnode and txnh are unlocked and -E_REPEAT is
93434 + * returned. */
93435 +static int
93436 +capture_init_fusion(jnode * node, txn_handle * txnh, txn_capture mode, int can_coc)
93437 +{
93438 +       if (!likely(spin_trylock_atom(node->atom))) {
93439 +               /* node's atom is busy */
93440 +               reiser4_stat_inc(txnmgr.restart.init_fusion_atomf);
93441 +       } else if (!likely(spin_trylock_atom(txnh->atom))) {
93442 +               /* handle's atom is busy: back out of the first lock */
93443 +               UNLOCK_ATOM(node->atom);
93444 +               reiser4_stat_inc(txnmgr.restart.init_fusion_atomh);
93445 +       } else {
93446 +               /* both atoms locked */
93447 +               return capture_init_fusion_locked(node, txnh, mode, can_coc);
93448 +       }
93449 +
93450 +       UNLOCK_JNODE(node);
93451 +       UNLOCK_TXNH(txnh);
93452 +       return RETERR(-E_REPEAT);
93453 +}
93454 +/* Points every jnode on @small_head at the atom @large, then splices @small_head into
93455 +   @large_head.  Returns the number of jnodes that were on @small_head. */
93456 +static int
93457 +capture_fuse_jnode_lists(txn_atom * large, capture_list_head * large_head, capture_list_head * small_head)
93458 +{
93459 +       int count = 0;
93460 +       jnode *node;
93461 +
93462 +       assert("umka-218", large != NULL);
93463 +       assert("umka-219", large_head != NULL);
93464 +       assert("umka-220", small_head != NULL);
93465 +       /* small atom should be locked also (not checked: it is not passed in). */
93466 +       assert("zam-968", spin_atom_is_locked(large));
93467 +
93468 +       /* For every jnode on small's capture list... */
93469 +       for_all_type_safe_list(capture, small_head, node) {
93470 +               count += 1;
93471 +
93472 +               /* With the jnode lock held, update atom pointer. */
93473 +               UNDER_SPIN_VOID(jnode, node, node->atom = large);
93474 +       }
93475 +
93476 +       /* Splice the lists. */
93477 +       capture_list_splice(large_head, small_head);
93478 +
93479 +       return count;
93480 +}
93481 +
93482 +/* Points every txn handle on @small_head at the atom @large, then splices @small_head into
93483 +   @large_head.  Returns the number of handles that were on @small_head. */
93484 +/* Audited by: umka (2002.06.13) */
93485 +static int
93486 +capture_fuse_txnh_lists(txn_atom * large, txnh_list_head * large_head, txnh_list_head * small_head)
93487 +{
93488 +       int count = 0;
93489 +       txn_handle *txnh;
93490 +
93491 +       assert("umka-221", large != NULL);
93492 +       assert("umka-222", large_head != NULL);
93493 +       assert("umka-223", small_head != NULL);
93494 +
93495 +       /* Adjust every txnh to the new atom. */
93496 +       for_all_type_safe_list(txnh, small_head, txnh) {
93497 +               count += 1;
93498 +
93499 +               /* With the txnh lock held, update atom pointer. */
93500 +               UNDER_SPIN_VOID(txnh, txnh, txnh->atom = large);
93501 +       }
93502 +
93503 +       /* Splice the txn_handle list. */
93504 +       txnh_list_splice(large_head, small_head);
93505 +
93506 +       return count;
93507 +}
93508 +
93509 +/* This function fuses two atoms.  The captured nodes and handles belonging to SMALL are
93510 +   added to LARGE and their ->atom pointers are all updated.  The associated counts are
93511 +   updated as well, and any waiting handles belonging to either are awakened.  Finally the
93512 +   smaller atom's refcount is decremented.  Both atoms must be locked and open on entry;
93513 +   both are unlocked on return. */
93514 +static void
93515 +capture_fuse_into(txn_atom * small, txn_atom * large)
93516 +{
93517 +       int level;
93518 +       unsigned zcount = 0;
93519 +       unsigned tcount = 0;
93520 +       protected_jnodes *prot_list;
93521 +
93522 +       assert("umka-224", small != NULL);
93523 +       assert("umka-225", large != NULL);
93524 +
93525 +       assert("umka-299", spin_atom_is_locked(large));
93526 +       assert("umka-300", spin_atom_is_locked(small));
93527 +
93528 +       assert("jmacd-201", atom_isopen(small));
93529 +       assert("jmacd-202", atom_isopen(large));
93530 +
93531 +       ON_TRACE(TRACE_TXN, "fuse atom %u into %u\n", small->atom_id, large->atom_id);
93532 +
93533 +       /* Splice and update the per-level dirty jnode lists */
93534 +       for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
93535 +               zcount += capture_fuse_jnode_lists(large, ATOM_DIRTY_LIST(large, level), ATOM_DIRTY_LIST(small, level));
93536 +       }
93537 +
93538 +       /* Splice and update the clean, overwrite, writeback and inode jnode lists and the txnh list */
93539 +       zcount += capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large), ATOM_CLEAN_LIST(small));
93540 +       zcount += capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large), ATOM_OVRWR_LIST(small));
93541 +       zcount += capture_fuse_jnode_lists(large, ATOM_WB_LIST(large), ATOM_WB_LIST(small));
93542 +       zcount += capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
93543 +       tcount += capture_fuse_txnh_lists(large, &large->txnh_list, &small->txnh_list);
93544 +
93545 +       for_all_type_safe_list(prot, &small->protected, prot_list) {
93546 +               jnode *node;
93547 +
93548 +               for_all_type_safe_list(capture, &prot_list->nodes, node) {
93549 +                       zcount += 1;
93550 +
93551 +                       LOCK_JNODE(node);
93552 +                       assert("nikita-3375", node->atom == small);
93553 +                       /* With the jnode lock held, update atom pointer. */
93554 +                       node->atom = large;
93555 +                       UNLOCK_JNODE(node);
93556 +               }
93557 +       }
93558 +       /* Splice the lists of lists. */
93559 +       prot_list_splice(&large->protected, &small->protected);
93560 +
93561 +       /* Check our accounting: every node of small is either on a list walked above or queued. */
93562 +       assert("jmacd-1063", zcount + small->num_queued == small->capture_count);
93563 +       assert("jmacd-1065", tcount == small->txnh_count);
93564 +
93565 +       /* sum numbers of waiters threads */
93566 +       large->nr_waiters += small->nr_waiters;
93567 +       small->nr_waiters = 0;
93568 +
93569 +       /* splice flush queues */
93570 +       fuse_fq(large, small);
93571 +
93572 +       /* update counter of jnode on every atom' list */
93573 +       ON_DEBUG(large->dirty += small->dirty;
93574 +                small->dirty = 0;
93575 +                large->clean += small->clean;
93576 +                small->clean = 0;
93577 +                large->ovrwr += small->ovrwr;
93578 +                small->ovrwr = 0;
93579 +                large->wb += small->wb;
93580 +                small->wb = 0;
93581 +                large->fq += small->fq;
93582 +                small->fq = 0;
93583 +                large->protect += small->protect;
93584 +                small->protect = 0;
93585 +               );
93586 +
93587 +       /* count flushers in result atom */
93588 +       large->nr_flushers += small->nr_flushers;
93589 +       small->nr_flushers = 0;
93590 +
93591 +       /* update counts of flushed nodes */
93592 +       large->flushed += small->flushed;
93593 +       small->flushed = 0;
93594 +
93595 +       /* Transfer list counts to large. */
93596 +       large->txnh_count += small->txnh_count;
93597 +       large->capture_count += small->capture_count;
93598 +
93599 +       /* Move all txnh references from small to large. */
93600 +       atomic_add(small->txnh_count, &large->refcount);
93601 +       atomic_sub(small->txnh_count, &small->refcount);
93602 +
93603 +       /* Reset small counts */
93604 +       small->txnh_count = 0;
93605 +       small->capture_count = 0;
93606 +
93607 +       /* Assign the oldest start_time, merge flags. */
93608 +       large->start_time = min(large->start_time, small->start_time);
93609 +       large->flags |= small->flags;
93610 +
93611 +       /* Merge blocknr sets. */
93612 +       blocknr_set_merge(&small->delete_set, &large->delete_set);
93613 +       blocknr_set_merge(&small->wandered_map, &large->wandered_map);
93614 +
93615 +       /* Merge allocated/deleted file counts */
93616 +       large->nr_objects_deleted += small->nr_objects_deleted;
93617 +       large->nr_objects_created += small->nr_objects_created;
93618 +
93619 +       small->nr_objects_deleted = 0;
93620 +       small->nr_objects_created = 0;
93621 +
93622 +       /* Merge allocated blocks counts */
93623 +       large->nr_blocks_allocated += small->nr_blocks_allocated;
93624 +
93625 +       large->nr_running_queues += small->nr_running_queues;
93626 +       small->nr_running_queues = 0;
93627 +
93628 +       /* Merge blocks reserved for overwrite set. */
93629 +       large->flush_reserved += small->flush_reserved;
93630 +       small->flush_reserved = 0;
93631 +
93632 +       if (large->stage < small->stage) {
93633 +               /* Large only needs to notify if it has changed state. */
93634 +               atom_set_stage(large, small->stage);
93635 +               wakeup_atom_waiting_list(large);
93636 +       }
93637 +
93638 +       atom_set_stage(small, ASTAGE_INVALID);
93639 +
93640 +       /* Notify any waiters--small needs to unload its wait lists.  Waiters
93641 +          actually remove themselves from the list before returning from the
93642 +          fuse_wait function. */
93643 +       wakeup_atom_waiting_list(small);
93644 +
93645 +       /* Unlock atoms; small also loses the reference held by the caller. */
93646 +       UNLOCK_ATOM(large);
93647 +       atom_dec_and_unlock(small);
93648 +}
93649 +/* Give @list an empty node list and link it at the front of the current atom's ->protected list. */
93650 +reiser4_internal void
93651 +protected_jnodes_init(protected_jnodes *list)
93652 +{
93653 +       txn_atom *atom;
93654 +
93655 +       assert("nikita-3376", list != NULL);
93656 +
93657 +       atom = get_current_atom_locked();
93658 +       prot_list_push_front(&atom->protected, list);
93659 +       capture_list_init(&list->nodes);
93660 +       UNLOCK_ATOM(atom);
93661 +}
93662 +/* Unlink @list, whose node list must already be empty, while the current atom is locked. */
93663 +reiser4_internal void
93664 +protected_jnodes_done(protected_jnodes *list)
93665 +{
93666 +       txn_atom *atom;
93667 +
93668 +       assert("nikita-3379", capture_list_empty(&list->nodes));
93669 +
93670 +       atom = get_current_atom_locked();
93671 +       prot_list_remove(list);
93672 +       UNLOCK_ATOM(atom);
93673 +}
93674 +
93675 +/* TXNMGR STUFF */
93676 +
93677 +#if REISER4_COPY_ON_CAPTURE
93678 +
93679 +/* Copy on capture steals a jnode (J) from a capture list. It may replace J with
93680 +   a special newly created jnode (CCJ) to which J's page gets attached. J in its
93681 +   turn gets a newly created copy of the page.
93682 +   Or, it may merely take J off the capture list if J was never dirtied.
93683 +
93684 +   The problem with this replacement is that capture lists are being continuously
93685 +   scanned.
93686 +   Races between replacement and scanning are avoided with one global spin lock
93687 +   (scan_lock) and the JNODE_SCANNED state of a jnode. Replacement (in capture copy)
93688 +   proceeds with scan_lock held, and only if the jnode is not in JNODE_SCANNED state.
93689 +   That state gets set with scan_lock held whenever scanning is working with that
93690 +   jnode.
93691 +*/
93692 +
93693 +/* remove @node's page from its mapping's tree and insert @new_page at the same index */
93694 +static void
93695 +replace_page_in_mapping(jnode *node, struct page *new_page)
93696 +{
93697 +       struct address_space *mapping;
93698 +       unsigned long index;
93699 +
93700 +       mapping = jnode_get_mapping(node);
93701 +       index = jnode_get_index(node);
93702 +
93703 +       spin_lock(&mapping->page_lock);
93704 +
93705 +       /* delete old page from the mapping, after checking that it really is the page stored at @index */
93706 +       assert("vs-1416", radix_tree_lookup(&mapping->page_tree, index) == node->pg);
93707 +       assert("vs-1428", node->pg->mapping == mapping);
93708 +       __remove_from_page_cache(node->pg);
93709 +
93710 +       /* insert new page into mapping; the insertion is required to succeed */
93711 +       check_me("vs-1411",
93712 +                radix_tree_insert(&mapping->page_tree, index, new_page) == 0);
93713 +
93714 +       /* this resembles add_to_page_cache: take a page reference for the mapping */
93715 +       page_cache_get(new_page);
93716 +       ___add_to_page_cache(new_page, mapping, index);
93717 +
93718 +       spin_unlock(&mapping->page_lock);
93719 +       lru_cache_add(new_page);
93720 +}
93721 +
93722 +/* attach page of @node to @copy, @new_page to @node; the old page is also inserted into the "cc" inode's mapping */
93723 +static void
93724 +swap_jnode_pages(jnode *node, jnode *copy, struct page *new_page)
93725 +{
93726 +       /* attach old page to new jnode; @copy also inherits @node's block number */
93727 +       assert("vs-1414", jnode_by_page(node->pg) == node);
93728 +       copy->pg = node->pg;
93729 +       copy->data = page_address(copy->pg);
93730 +       jnode_set_block(copy, jnode_get_block(node));
93731 +       copy->pg->private = (unsigned long)copy;
93732 +
93733 +       /* attach new page to jnode, taking a page reference for it */
93734 +       assert("vs-1412", !PagePrivate(new_page));
93735 +       page_cache_get(new_page);
93736 +       node->pg = new_page;
93737 +       node->data = page_address(new_page);
93738 +       new_page->private = (unsigned long)node;
93739 +       SetPagePrivate(new_page);
93740 +
93741 +       {
93742 +               /* insert old page to new mapping */
93743 +               struct address_space *mapping;
93744 +               unsigned long index;
93745 +
93746 +               mapping = get_current_super_private()->cc->i_mapping;
93747 +               index = (unsigned long)copy; /* the address of @copy serves as page index */
93748 +               spin_lock(&mapping->page_lock);
93749 +
93750 +               /* insert old page into new (fake) mapping. No page_cache_get
93751 +                  because page reference counter was not decreased on removing
93752 +                  it from old mapping */
93753 +               assert("vs-1416", radix_tree_lookup(&mapping->page_tree, index) == NULL);
93754 +               check_me("vs-1418", radix_tree_insert(&mapping->page_tree, index, copy->pg) == 0);
93755 +               ___add_to_page_cache(copy->pg, mapping, index);
93756 +               ON_DEBUG(set_bit(PG_arch_1, &(copy->pg)->flags));
93757 +
93758 +               /* corresponding page_cache_release is in invalidate_list */
93759 +               page_cache_get(copy->pg);
93760 +               spin_unlock(&mapping->page_lock);
93761 +       }
93762 +}
93763 +
93764 +/* make a capture-copied jnode look as if jload had been called for it: jref() it, bump d_count, set JNODE_PARSED */
93765 +static void
93766 +fake_jload(jnode *node)
93767 +{
93768 +       jref(node);
93769 +       atomic_inc(&node->d_count);
93770 +       JF_SET(node, JNODE_PARSED);
93771 +}
93772 +
93773 +/* May @node be copy-on-captured out of @atom?  For now any suspicious node is refused: WRITEBACK, FLUSH_QUEUED, DIRTY or a changed atom give -E_WAIT, SCANNED gives -E_REPEAT; 0 means OK.  Jnode lock and scan_lock must be held. */
93774 +static int
93775 +check_capturable(const jnode *node, const txn_atom *atom)
93776 +{
93777 +       assert("vs-1429", spin_jnode_is_locked(node));
93778 +       assert("vs-1487", check_spin_is_locked(&scan_lock));
93779 +
93780 +       if (JF_ISSET(node, JNODE_WRITEBACK)) {
93781 +               reiser4_stat_inc(coc.writeback);
93782 +               return RETERR(-E_WAIT);
93783 +       }
93784 +       if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
93785 +               reiser4_stat_inc(coc.flush_queued);
93786 +               return RETERR(-E_WAIT);
93787 +       }
93788 +       if (JF_ISSET(node, JNODE_DIRTY)) {
93789 +               reiser4_stat_inc(coc.dirty);
93790 +               return RETERR(-E_WAIT);
93791 +       }
93792 +       if (JF_ISSET(node, JNODE_SCANNED)) {
93793 +               reiser4_stat_inc(coc.scan_race);
93794 +               return RETERR(-E_REPEAT);
93795 +       }
93796 +       if (node->atom != atom) {
93797 +               reiser4_stat_inc(coc.atom_changed);
93798 +               return RETERR(-E_WAIT);
93799 +       }
93800 +       return 0; /* OK */
93801 +}
93802 +/* Clear the DIRTY, RELOC, OVRWR, CREATED, WRITEBACK and REPACK flags of @node, take it off the capture list and detach it from its atom. */
93803 +static void
93804 +remove_from_capture_list(jnode *node)
93805 +{
93806 +       ON_DEBUG_MODIFY(znode_set_checksum(node, 1));
93807 +       JF_CLR(node, JNODE_DIRTY);
93808 +       JF_CLR(node, JNODE_RELOC);
93809 +       JF_CLR(node, JNODE_OVRWR);
93810 +       JF_CLR(node, JNODE_CREATED);
93811 +       JF_CLR(node, JNODE_WRITEBACK);
93812 +       JF_CLR(node, JNODE_REPACK);
93813 +
93814 +       capture_list_remove_clean(node);
93815 +       node->atom->capture_count --;
93816 +       atomic_dec(&node->x_count);
93817 +       /*XXXX*/ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
93818 +       node->atom = 0;
93819 +}
93820 +
93821 +/* insert new jnode (@copy) to capture list instead of old one (@node); @copy inherits @node's state bits and atom */
93822 +static void
93823 +replace_on_capture_list(jnode *node, jnode *copy)
93824 +{
93825 +       assert("vs-1415", node->atom);
93826 +       assert("vs-1489", !capture_list_is_clean(node));
93827 +       assert("vs-1493", JF_ISSET(copy, JNODE_CC) && JF_ISSET(copy, JNODE_HEARD_BANSHEE));
93828 +
93829 +       copy->state |= node->state;
93830 +
93831 +       /* insert cc-jnode @copy into capture list before old jnode @node */
93832 +       capture_list_insert_before(node, copy);
93833 +       jref(copy);
93834 +       copy->atom = node->atom;
93835 +       node->atom->capture_count ++;
93836 +       /*XXXX*/ON_DEBUG(count_jnode(node->atom, copy, NODE_LIST(copy), NODE_LIST(node), 1));
93837 +
93838 +       /* remove old jnode from capture list; this also resets node->atom */
93839 +       remove_from_capture_list(node);
93840 +}
93841 +
93842 +/* when capture request is made for a node which is captured but was never dirtied, copy on capture will merely
93843 +   uncapture it.  Entered with @atom and @node locked; unlocks both and returns the check_capturable() result */
93844 +static int
93845 +copy_on_capture_clean(jnode *node, txn_atom *atom)
93846 +{
93847 +       int result;
93848 +
93849 +       assert("vs-1625", spin_atom_is_locked(atom));
93850 +       assert("vs-1432", spin_jnode_is_locked(node));
93851 +       assert("vs-1627", !JF_ISSET(node, JNODE_WRITEBACK));
93852 +
93853 +       spin_lock(&scan_lock);
93854 +       result = check_capturable(node, atom);
93855 +       if (result == 0) {
93856 +               /* remove jnode from capture list */
93857 +               remove_from_capture_list(node);
93858 +               reiser4_stat_inc(coc.ok_clean);
93859 +       }
93860 +       spin_unlock(&scan_lock);
93861 +       UNLOCK_JNODE(node);
93862 +       UNLOCK_ATOM(atom);
93863 +
93864 +       return result;
93865 +}
93866 +/* Lock both jnodes, the one at the lower address first. */
93867 +static void
93868 +lock_two_nodes(jnode *node1, jnode *node2)
93869 +{
93870 +       jnode *first;
93871 +       jnode *second;
93872 +
93873 +       first = (node1 > node2) ? node2 : node1;
93874 +       second = (node1 > node2) ? node1 : node2;
93875 +       LOCK_JNODE(first);
93876 +       LOCK_JNODE(second);
93877 +}
93878 +
93879 +/* capture request is made for node which does not have page. In most cases this is "uber" znode.
93880 +   Entered with @atom and @node locked, returns with both unlocked; 0, jclone() error, or check result / -E_REPEAT */
93881 +static int
93882 +copy_on_capture_nopage(jnode *node, txn_atom *atom)
93883 +{
93884 +       int result;
93885 +       jnode *copy;
93886 +
93887 +       assert("vs-1432", spin_atom_is_locked(atom));
93888 +       assert("vs-1432", spin_jnode_is_locked(node));
93889 +
93890 +       jref(node);
93891 +       UNLOCK_JNODE(node);
93892 +       UNLOCK_ATOM(atom);
93893 +       assert("nikita-3475", schedulable());
93894 +       copy = jclone(node);
93895 +       if (IS_ERR(copy)) {
93896 +               jput(node);
93897 +               return PTR_ERR(copy);
93898 +       }
93899 +
93900 +       LOCK_ATOM(atom);
93901 +       lock_two_nodes(node, copy);
93902 +       spin_lock(&scan_lock);
93903 +
93904 +       /* the locks were dropped around jclone(), so re-validate @node and re-check that it still has no page */
93905 +       result = check_capturable(node, atom);
93906 +       if (result == 0) {
93907 +               if (jnode_page(node) == NULL) {
93908 +                       replace_on_capture_list(node, copy);
93909 +#if REISER4_STATS
93910 +                       if (znode_above_root(JZNODE(node)))
93911 +                               reiser4_stat_inc(coc.ok_uber);
93912 +                       else
93913 +                               reiser4_stat_inc(coc.ok_nopage);
93914 +#endif
93915 +               } else
93916 +                       result = RETERR(-E_REPEAT);
93917 +       }
93918 +
93919 +       spin_unlock(&scan_lock);
93920 +       UNLOCK_JNODE(node);
93921 +       UNLOCK_JNODE(copy);
93922 +       UNLOCK_ATOM(atom);
93923 +       assert("nikita-3476", schedulable());
93924 +       jput(copy);
93925 +       assert("nikita-3477", schedulable());
93926 +       jput(node);
93927 +       assert("nikita-3478", schedulable());
93928 +       ON_TRACE(TRACE_CAPTURE_COPY, "nopage\n");
93929 +       return result;
93930 +}
93930 +
93931 +static int /* copy @page into @new_page if @node is still capturable; returns check_capturable()'s result */
93932 +handle_coc(jnode *node, jnode *copy, struct page *page, struct page *new_page,
93933 +          txn_atom *atom)
93934 +{
93935 +       char *to;
93936 +       char *from;
93937 +       int   result;
93938 +
93939 +       to = kmap(new_page);
93940 +       lock_page(page);
93941 +       from = kmap(page);
93942 +       /*
93943 +        * FIXME(zam): one preloaded radix tree node may not be enough for two
93944 +        * insertions (one in replace_page_in_mapping(), another one in
93945 +        * swap_jnode_pages()). radix_tree_delete() might not help: an empty
93946 +        * radix tree node is freed and its space may not be re-used on insert.
93947 +        * NOTE(review): radix_tree_preload()'s return value is ignored here.
93948 +        */
93949 +       radix_tree_preload(GFP_KERNEL);
93950 +       LOCK_ATOM(atom);
93951 +       lock_two_nodes(node, copy);
93952 +       spin_lock(&scan_lock);
93953 +
93954 +       result = check_capturable(node, atom);
93955 +       if (result == 0) {
93956 +               /* if node was jloaded by get_overwrite_set, we have to jrelse
93957 +                  it here, because we remove jnode from atom's capture list -
93958 +                  put_overwrite_set will not jrelse it */
93959 +               int was_jloaded;
93960 +
93961 +               was_jloaded = JF_ISSET(node, JNODE_JLOADED_BY_GET_OVERWRITE_SET);
93962 +
93963 +               replace_page_in_mapping(node, new_page);
93964 +               swap_jnode_pages(node, copy, new_page);
93965 +               replace_on_capture_list(node, copy);
93966 +               /* statistics: account by the RELOC/OVRWR state of @copy */
93967 +               if (JF_ISSET(copy, JNODE_RELOC)) {
93968 +                       reiser4_stat_inc(coc.ok_reloc);
93969 +               } else if (JF_ISSET(copy, JNODE_OVRWR)) {
93970 +                       reiser4_stat_inc(coc.ok_ovrwr);
93971 +               } else
93972 +                       impossible("", "");
93973 +
93974 +               memcpy(to, from, PAGE_CACHE_SIZE); /* duplicate the old page's contents into new_page */
93975 +               SetPageUptodate(new_page);
93976 +               if (was_jloaded)
93977 +                       fake_jload(copy);
93978 +               else
93979 +                       kunmap(page);
93980 +
93981 +               assert("vs-1419", page_count(new_page) >= 3);
93982 +               spin_unlock(&scan_lock);
93983 +               UNLOCK_JNODE(node);
93984 +               UNLOCK_JNODE(copy);
93985 +               UNLOCK_ATOM(atom);
93986 +               radix_tree_preload_end();
93987 +               unlock_page(page);
93988 +
93989 +               if (was_jloaded) {
93990 +                       jrelse_tail(node);
93991 +                       assert("vs-1494", JF_ISSET(node, JNODE_JLOADED_BY_GET_OVERWRITE_SET));
93992 +                       JF_CLR(node, JNODE_JLOADED_BY_GET_OVERWRITE_SET);
93993 +               } else
93994 +                       kunmap(new_page);
93995 +
93996 +               jput(copy); /* success: node, copy and both pages are released here, not by the caller */
93997 +               jrelse(node);
93998 +               jput(node);
93999 +               page_cache_release(page);
94000 +               page_cache_release(new_page);
94001 +               ON_TRACE(TRACE_CAPTURE_COPY, "copy on capture done\n");
94002 +       } else { /* not capturable: drop locks and mappings, release new_page only */
94003 +               spin_unlock(&scan_lock);
94004 +               UNLOCK_JNODE(node);
94005 +               UNLOCK_JNODE(copy);
94006 +               UNLOCK_ATOM(atom);
94007 +               radix_tree_preload_end();
94008 +               kunmap(page);
94009 +               unlock_page(page);
94010 +               kunmap(new_page);
94011 +               page_cache_release(new_page);
94012 +       }
94013 +       return result;
94014 +}
94015 +
94016 +static int
94017 +real_copy_on_capture(jnode *node, txn_atom *atom)
94018 +{
94019 +       int result;
94020 +       jnode *copy;
94021 +       struct page *page;
94022 +       struct page *new_page;
94023 +
94024 +       assert("vs-1432", spin_jnode_is_locked(node));
94025 +       assert("vs-1490", !JF_ISSET(node, JNODE_EFLUSH));
94026 +       assert("vs-1491", node->pg);
94027 +       assert("vs-1492", jprivate(node->pg) == node);
94028 +
94029 +       page = node->pg;
94030 +       page_cache_get(page);
94031 +       jref(node);
94032 +       UNLOCK_JNODE(node);
94033 +       UNLOCK_ATOM(atom);
94034 +
94035 +       /* prevent node from eflushing */
94036 +       result = jload(node);
94037 +       if (!result) {
94038 +               copy = jclone(node);
94039 +               if (likely(!IS_ERR(copy))) {
94040 +                       new_page = alloc_page(GFP_KERNEL);
94041 +                       if (new_page) {
94042 +                               result = handle_coc(node,
94043 +                                                   copy, page, new_page, atom);
94044 +                               if (result == 0)
94045 +                                       return 0;
94046 +                       } else
94047 +                               result = RETERR(-ENOMEM);
94048 +                       jput(copy);
94049 +               }
94050 +               jrelse(node);
94051 +       }
94052 +
94053 +       jput(node);
94054 +       page_cache_release(page);
94055 +       return result;
94056 +}
94057 +
94058 +/* create new jnode, create new page, jload old jnode, copy data, detach old
94059 +   page from old jnode, attach new page to old jnode, attach old page to new
94060 +   jnode. Returns 0 if copy on capture succeeded, E_REPEAT to have
94061 +   capture_fuse_wait called. On failure the CCED mark set here is cleared */
94062 +static int
94063 +create_copy_and_replace(jnode *node, txn_atom *atom)
94064 +{
94065 +       int result;
94066 +       struct inode *inode; /* inode for which filemap_nopage is blocked */
94067 +
94068 +       assert("jmacd-321", spin_jnode_is_locked(node));
94069 +       assert("umka-295", spin_atom_is_locked(atom));
94070 +       assert("vs-1381", node->atom == atom);
94071 +       assert("vs-1409", atom->stage > ASTAGE_CAPTURE_WAIT && atom->stage < ASTAGE_DONE);
94072 +       assert("vs-1410", jnode_is_znode(node) || jnode_is_unformatted(node));
94073 +
94074 +
94075 +       if (JF_ISSET(node, JNODE_CCED)) {
94076 +               /* node is under copy on capture already */
94077 +               reiser4_stat_inc(coc.coc_race);
94078 +               UNLOCK_JNODE(node);
94079 +               UNLOCK_ATOM(atom);
94080 +               return RETERR(-E_WAIT);
94081 +       }
94082 +
94083 +       /* measure how often suspicious (WRITEBACK, DIRTY, FLUSH_QUEUED) appear
94084 +          here. For most often case we can return EAGAIN right here and avoid
94085 +          all the preparations made for copy on capture */
94086 +       ON_TRACE(TRACE_CAPTURE_COPY, "copy_on_capture: node %p, atom %p..", node, atom);
94087 +       if (JF_ISSET(node, JNODE_EFLUSH)) {
94088 +               UNLOCK_JNODE(node);
94089 +               UNLOCK_ATOM(atom);
94090 +
94091 +               reiser4_stat_inc(coc.eflush);
94092 +               ON_TRACE(TRACE_CAPTURE_COPY, "eflushed\n");
94093 +               result = jload(node);
94094 +               if (result)
94095 +                       return RETERR(result);
94096 +               jrelse(node);
94097 +               return RETERR(-E_REPEAT);
94098 +       }
94099 +
94100 +       set_cced_bit(node);
94101 +
94102 +       if (jnode_is_unformatted(node)) {
94103 +               /* to capture_copy unformatted node we have to take care of its
94104 +                  page mappings. Page gets unmapped here and concurrent
94105 +                  mappings are blocked on reiser4 inode's coc_sem in reiser4's
94106 +                  filemap_nopage */
94107 +               struct page *page;
94108 +
94109 +               inode = mapping_jnode(node)->host;
94110 +               page = jnode_page(node);
94111 +               assert("vs-1640", inode != NULL);
94112 +               assert("vs-1641", page != NULL);
94113 +               assert("vs-1642", page->mapping != NULL);
94114 +               UNLOCK_JNODE(node);
94115 +               UNLOCK_ATOM(atom);
94116 +
94117 +               down_write(&reiser4_inode_data(inode)->coc_sem);
94118 +               lock_page(page);
94119 +               pte_chain_lock(page);
94120 +
94121 +               if (page_mapped(page)) {
94122 +                       result = try_to_unmap(page);
94123 +                       if (result == SWAP_AGAIN) {
94124 +                               result = RETERR(-E_REPEAT);
94125 +                       } else if (result == SWAP_FAIL)
94126 +                               result = RETERR(-E_WAIT);
94127 +                       else {
94128 +                               assert("vs-1643", result == SWAP_SUCCESS);
94129 +                               result = 0;
94130 +                       }
94131 +                       if (result != 0) {
94132 +                               pte_chain_unlock(page); /* unlock in reverse order, as on the success path */
94133 +                               unlock_page(page);
94134 +                               clear_cced_bits(node); /* undo set_cced_bit() like the common failure exit does */
94135 +                               up_write(&reiser4_inode_data(inode)->coc_sem);
94136 +                               return result;
94137 +                       }
94138 +               }
94139 +               pte_chain_unlock(page);
94140 +               unlock_page(page);
94141 +               LOCK_ATOM(atom);
94142 +               LOCK_JNODE(node);
94143 +       } else
94144 +               inode = NULL;
94145 +
94146 +       if (!JF_ISSET(node, JNODE_OVRWR) && !JF_ISSET(node, JNODE_RELOC)) {
94147 +               /* clean node can be made available for capturing. Just take
94148 +                  care to preserve atom list during uncapturing */
94149 +               ON_TRACE(TRACE_CAPTURE_COPY, "clean\n");
94150 +               result = copy_on_capture_clean(node, atom);
94151 +       } else if (!node->pg) {
94152 +               ON_TRACE(TRACE_CAPTURE_COPY, "uber\n");
94153 +               result = copy_on_capture_nopage(node, atom);
94154 +       } else
94155 +               result = real_copy_on_capture(node, atom);
94156 +       if (result != 0)
94157 +               clear_cced_bits(node);
94158 +       assert("vs-1626", spin_atom_is_not_locked(atom));
94159 +
94160 +       if (inode != NULL)
94161 +               up_write(&reiser4_inode_data(inode)->coc_sem);
94162 +
94163 +       return result;
94164 +}
94165 +#endif /* REISER4_COPY_ON_CAPTURE */
94166 +
94167 +/* Perform copy-on-capture of a block, or fall back to capture_fuse_wait(). */
94168 +static int
94169 +capture_copy(jnode * node, txn_handle * txnh, txn_atom * atomf, txn_atom * atomh, txn_capture mode, int can_coc)
94170 +{
94171 +#if REISER4_COPY_ON_CAPTURE
94172 +       reiser4_stat_inc(coc.calls);
94173 +
94174 +       /* do not copy on capture in ent thread to avoid deadlock on coc semaphore */
94175 +       if (can_coc && get_current_context()->entd == 0) {
94176 +               int result;
94177 +
94178 +               ON_TRACE(TRACE_TXN, "capture_copy\n");
94179 +
94180 +               /* The txnh and its (possibly NULL) atom's locks are not needed
94181 +                  at this point. */
94182 +               UNLOCK_TXNH(txnh);
94183 +               if (atomh != NULL)
94184 +                       UNLOCK_ATOM(atomh);
94185 +
94186 +               /* create a copy of node, detach node from atom and attach its copy
94187 +                  instead */
94188 +               atomic_inc(&atomf->refcount); /* presumably pins atomf while its lock is dropped; released below */
94189 +               result = create_copy_and_replace(node, atomf);
94190 +               assert("nikita-3474", schedulable());
94191 +               preempt_point();
94192 +               LOCK_ATOM(atomf);
94193 +               atom_dec_and_unlock(atomf);
94194 +               preempt_point();
94195 +
94196 +               if (result == 0) {
94197 +                       if (jnode_is_znode(node)) {
94198 +                               znode *z;
94199 +
94200 +                               z = JZNODE(node);
94201 +                               z->version = znode_build_version(jnode_get_tree(node));
94202 +                       }
94203 +                       result = RETERR(-E_REPEAT); /* a successful copy is still reported as -E_REPEAT */
94204 +               }
94205 +               return result;
94206 +       }
94207 +
94208 +       reiser4_stat_inc(coc.forbidden);
94209 +       return capture_fuse_wait(node, txnh, atomf, atomh, mode);
94210 +#else
94211 +       ON_TRACE(TRACE_TXN, "capture_copy: fuse wait\n");
94212 +
94213 +       return capture_fuse_wait(node, txnh, atomf, atomh, mode);
94214 +
94215 +#endif
94216 +}
94217 +
94218 +/* Release a block from the atom, reversing the effects of being captured.
94219 +   The atom's reference to the jnode is not released here because spin-locks
94220 +   are held. Currently this is only called when the atom commits.
94221 +
94222 +   NOTE: this function does not release a (journal) reference to jnode
94223 +   due to locking optimizations, you should call jput() somewhere after
94224 +   calling uncapture_block(). The jnode spin lock is dropped on return. */
94225 +reiser4_internal void uncapture_block(jnode * node)
94226 +{
94227 +       txn_atom * atom;
94228 +
94229 +       assert("umka-226", node != NULL);
94230 +       atom = node->atom;
94231 +       assert("umka-228", atom != NULL);
94232 +
94233 +       assert("jmacd-1021", node->atom == atom);
94234 +       assert("jmacd-1022", spin_jnode_is_locked(node));
94235 +#if REISER4_COPY_ON_CAPTURE
94236 +       assert("jmacd-1023", spin_atom_is_locked(atom));
94237 +#else
94238 +       assert("jmacd-1023", atom_is_protected(atom));
94239 +#endif
94240 +
94241 +       /*ON_TRACE (TRACE_TXN, "un-capture %p from atom %u (captured %u)\n",
94242 +        * node, atom->atom_id, atom->capture_count); */
94243 +
94244 +       ON_DEBUG_MODIFY(znode_set_checksum(node, 1));
94245 +       JF_CLR(node, JNODE_DIRTY); /* drop every per-capture state flag from the jnode */
94246 +       JF_CLR(node, JNODE_RELOC);
94247 +       JF_CLR(node, JNODE_OVRWR);
94248 +       JF_CLR(node, JNODE_CREATED);
94249 +       JF_CLR(node, JNODE_WRITEBACK);
94250 +       JF_CLR(node, JNODE_REPACK);
94251 +       clear_cced_bits(node);
94252 +#if REISER4_DEBUG
94253 +       node->written = 0;
94254 +#endif
94255 +
94256 +       capture_list_remove_clean(node);
94257 +       if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
94258 +               assert("zam-925", atom_isopen(atom));
94259 +               assert("vs-1623", NODE_LIST(node) == FQ_LIST);
94260 +               ON_DEBUG(atom->num_queued --);
94261 +               JF_CLR(node, JNODE_FLUSH_QUEUED);
94262 +       }
94263 +       atom->capture_count -= 1;
94264 +       ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
94265 +       node->atom = NULL;
94266 +
94267 +       UNLOCK_JNODE(node); /* the jnode lock taken by the caller is released here */
94268 +       LOCK_CNT_DEC(t_refs);
94269 +}
94270 +
94271 +/* Unconditional insert of jnode into atom's overwrite list. Currently used in
94272 +   bitmap-based allocator code for adding modified bitmap blocks to the
94273 +   transaction. @atom and @node are spin locked; a jnode reference is taken */
94274 +reiser4_internal void
94275 +insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node)
94276 +{
94277 +       assert("zam-538", spin_atom_is_locked(atom) || atom->stage >= ASTAGE_PRE_COMMIT);
94278 +       assert("zam-539", spin_jnode_is_locked(node));
94279 +       assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
94280 +       assert("zam-543", node->atom == NULL);
94281 +       assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
94282 +
94283 +       capture_list_push_front(ATOM_OVRWR_LIST(atom), node);
94284 +       jref(node);
94285 +       node->atom = atom;
94286 +       atom->capture_count++;
94287 +       ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
94288 +}
94289 +
94290 +/* return 1 if two dirty jnodes belong to one atom, 0 otherwise. @j1 != @j2 */
94291 +reiser4_internal int
94292 +jnodes_of_one_atom(jnode * j1, jnode * j2)
94293 +{
94294 +       int ret = 0;
94295 +       int finish = 0;
94296 +
94297 +       assert("zam-9003", j1 != j2);
94298 +       /*assert ("zam-9004", jnode_check_dirty (j1)); */
94299 +       assert("zam-9005", jnode_check_dirty(j2));
94300 +
94301 +       do { /* retry until both jnode locks were held at the same time */
94302 +               LOCK_JNODE(j1);
94303 +               assert("zam-9001", j1->atom != NULL);
94304 +               if (spin_trylock_jnode(j2)) { /* only a trylock on j2 while j1 is held */
94305 +                       assert("zam-9002", j2->atom != NULL);
94306 +                       ret = (j2->atom == j1->atom);
94307 +                       finish = 1;
94308 +
94309 +                       UNLOCK_JNODE(j2);
94310 +               }
94311 +               UNLOCK_JNODE(j1);
94312 +       } while (!finish);
94313 +
94314 +       return ret;
94315 +}
94316 +
94317 +/* when an atom becomes this big, commit it as soon as possible. This value
94318 + * was found to be most effective by testing. @super is not used. */
94319 +reiser4_internal unsigned int
94320 +txnmgr_get_max_atom_size(struct super_block *super UNUSED_ARG)
94321 +{
94322 +       return nr_free_pagecache_pages() / 2; /* half of what nr_free_pagecache_pages() reports */
94323 +}
94324 +
94325 +
94326 +#if REISER4_DEBUG_OUTPUT
94327 +
94328 +reiser4_internal void /* printk a one-line summary of @atom, or "no atom" when @atom is NULL */
94329 +info_atom(const char *prefix, const txn_atom * atom)
94330 +{
94331 +       if (atom == NULL) {
94332 +               printk("%s: no atom\n", prefix);
94333 +               return;
94334 +       }
94335 +
94336 +       printk("%s: refcount: %i id: %u flags: %x txnh_count: %u" /* __u32 fields are printed as unsigned */
94337 +              " capture_count: %u stage: %x start: %lu, flushed: %u\n", prefix,
94338 +              atomic_read(&atom->refcount), atom->atom_id, atom->flags, atom->txnh_count,
94339 +              atom->capture_count, atom->stage, atom->start_time, atom->flushed);
94340 +}
94341 +
94342 +
94343 +reiser4_internal void /* dump @atom: its summary, then the jnodes on its dirty (per level) and clean lists */
94344 +print_atom(const char *prefix, txn_atom * atom)
94345 +{
94346 +       jnode *pos_in_atom;
94347 +       char list[32]; /* holds "capture level <n>" built by sprintf() below */
94348 +       int level;
94349 +
94350 +       assert("umka-229", atom != NULL);
94351 +
94352 +       info_atom(prefix, atom);
94353 +
94354 +       for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
94355 +
94356 +               sprintf(list, "capture level %d", level);
94357 +
94358 +               for (pos_in_atom = capture_list_front(ATOM_DIRTY_LIST(atom, level));
94359 +                    !capture_list_end(ATOM_DIRTY_LIST(atom, level), pos_in_atom);
94360 +                    pos_in_atom = capture_list_next(pos_in_atom)) {
94361 +
94362 +                       info_jnode(list, pos_in_atom);
94363 +                       printk("\n");
94364 +               }
94365 +       }
94366 +
94367 +       for_all_type_safe_list(capture, ATOM_CLEAN_LIST(atom), pos_in_atom) {
94368 +               info_jnode("clean", pos_in_atom);
94369 +               printk("\n");
94370 +       }
94371 +}
94372 +#endif
94373 +
94374 +static int count_deleted_blocks_actor ( /* blocknr_set_iterator() callback: accumulates a block count in *data */
94375 +       txn_atom *atom, const reiser4_block_nr * a, const reiser4_block_nr *b, void * data)
94376 +{
94377 +       reiser4_block_nr *counter = data;
94378 +
94379 +       assert ("zam-995", data != NULL);
94380 +       assert ("zam-996", a != NULL);
94381 +       if (b == NULL) /* no second value: count one block, otherwise add *b */
94382 +               *counter += 1;
94383 +       else
94384 +               *counter += *b;
94385 +       return 0;
94386 +}
94387 +reiser4_internal reiser4_block_nr txnmgr_count_deleted_blocks (void) /* sums delete_set counts over all atoms of the current sb */
94388 +{
94389 +       reiser4_block_nr result;
94390 +       txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
94391 +       txn_atom * atom;
94392 +
94393 +       result = 0;
94394 +
94395 +       spin_lock_txnmgr(tmgr); /* held while walking tmgr->atoms_list */
94396 +       for_all_type_safe_list(atom, &tmgr->atoms_list, atom) {
94397 +               LOCK_ATOM(atom);
94398 +               blocknr_set_iterator(atom, &atom->delete_set,
94399 +                                    count_deleted_blocks_actor, &result, 0);
94400 +               UNLOCK_ATOM(atom);
94401 +       }
94402 +       spin_unlock_txnmgr(tmgr);
94403 +
94404 +       return result;
94405 +}
94406 +
94407 +/* Make Linus happy.
94408 +   Local variables:
94409 +   c-indentation-style: "K&R"
94410 +   mode-name: "LC"
94411 +   c-basic-offset: 8
94412 +   tab-width: 8
94413 +   fill-column: 80
94414 +   End:
94415 +*/
94416 diff -rupN linux-2.6.8-rc3/fs/reiser4/txnmgr.h linux-2.6.8-rc3-a/fs/reiser4/txnmgr.h
94417 --- linux-2.6.8-rc3/fs/reiser4/txnmgr.h 1970-01-01 03:00:00.000000000 +0300
94418 +++ linux-2.6.8-rc3-a/fs/reiser4/txnmgr.h       2004-08-05 21:20:53.362601593 +0400
94419 @@ -0,0 +1,659 @@
94420 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
94421 + * reiser4/README */
94422 +
94423 +/* data-types and function declarations for transaction manager. See txnmgr.c
94424 + * for details. */
94425 +
94426 +#ifndef __REISER4_TXNMGR_H__
94427 +#define __REISER4_TXNMGR_H__
94428 +
94429 +#include "forward.h"
94430 +#include "spin_macros.h"
94431 +#include "dformat.h"
94432 +#include "type_safe_list.h"
94433 +
94434 +#include <linux/fs.h>
94435 +#include <linux/mm.h>
94436 +#include <linux/types.h>
94437 +#include <linux/spinlock.h>
94438 +#include <asm/atomic.h>
94439 +#include <asm/semaphore.h>
94440 +
94441 +/* LIST TYPES */
94442 +
94443 +/* list of all atoms controlled by a single transaction manager (that is, file
94444 + * system) */
94445 +TYPE_SAFE_LIST_DECLARE(atom);
94446 +/* list of transaction handles attached to a given atom */
94447 +TYPE_SAFE_LIST_DECLARE(txnh);
94448 +
94449 +/*
94450 + * ->fwaitfor and ->fwaiting lists.
94451 + *
94452 + * Each atom has one of each of these lists: one for its own handles waiting
94453 + * on another atom and one for reverse mapping.  Used to prevent deadlock in
94454 + * the ASTAGE_CAPTURE_WAIT state.
94455 + *
94456 + * A thread that needs to wait for a given atom attaches itself to the atom's
94457 + * ->fwaitfor list. This is done in atom_wait_event() (and in
94458 + * capture_fuse_wait()). All threads waiting on this list are woken up
94459 + * whenever an "event" occurs for this atom: it changes stage, commits, its
94460 + * flush queue is released, etc. This is used, in particular, to implement
94461 + * sync(), where a thread has to wait until the atom commits.
94462 + */
94463 +TYPE_SAFE_LIST_DECLARE(fwaitfor);
94464 +
94465 +/*
94466 + * This list is used to wait for atom fusion (in capture_fuse_wait()). Threads
94467 + * waiting on this list are woken up if the atom commits or is fused into another.
94468 + *
94469 + * See capture_fuse_wait() for more comments.
94470 + */
94471 +TYPE_SAFE_LIST_DECLARE(fwaiting);
94472 +
94473 +/* The transaction's list of captured jnodes */
94474 +TYPE_SAFE_LIST_DECLARE(capture);
94475 +#if REISER4_DEBUG
94476 +TYPE_SAFE_LIST_DECLARE(inode_jnodes);
94477 +#endif
94478 +
94479 +TYPE_SAFE_LIST_DECLARE(blocknr_set);   /* Used for the transaction's delete set
94480 +                                * and wandered mapping. */
94481 +
94482 +/* list of flush queues attached to a given atom */
94483 +TYPE_SAFE_LIST_DECLARE(fq);
94484 +
94485 +/* list of lists of jnodes that threads take into exclusive ownership during
94486 + * allocate-on-flush. */
94487 +TYPE_SAFE_LIST_DECLARE(prot);
94488 +
94489 +/* TYPE DECLARATIONS */
94490 +
94491 +/* This enumeration describes the possible types of a capture request (try_capture).
94492 +   A capture request dynamically assigns a block to the calling thread's transaction
94493 +   handle. */
94494 +typedef enum {
94495 +       /* A READ_ATOMIC request indicates that a block will be read and that the caller's
94496 +          atom should fuse in order to ensure that the block commits atomically with the
94497 +          caller. */
94498 +       TXN_CAPTURE_READ_ATOMIC = (1 << 0),
94499 +
94500 +       /* A READ_NONCOM request indicates that a block will be read and that the caller is
94501 +          willing to read a non-committed block without causing atoms to fuse. */
94502 +       TXN_CAPTURE_READ_NONCOM = (1 << 1),
94503 +
94504 +       /* A READ_MODIFY request indicates that a block will be read but that the caller
94505 +          wishes for the block to be captured as if it will be written.  This capture request
94506 +          mode is not currently used, but eventually it will be useful for preventing
94507 +          deadlock in read-modify-write cycles. */
94508 +       TXN_CAPTURE_READ_MODIFY = (1 << 2),
94509 +
94510 +       /* A WRITE capture request indicates that a block will be modified and that atoms
94511 +          should fuse to make the commit atomic. */
94512 +       TXN_CAPTURE_WRITE = (1 << 3),
94513 +
94514 +       /* CAPTURE_TYPES is a mask of the four capture types above, used to separate the
94515 +          exclusive type designation from extra bits that may be supplied -- see
94516 +          below. */
94517 +       TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
94518 +                            TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
94519 +
94520 +       /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
94521 +          indicate modification will occur. */
94522 +       TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
94523 +
94524 +       /* An option to try_capture, NONBLOCKING indicates that the caller would
94525 +          prefer not to sleep waiting for an aging atom to commit. */
94526 +       TXN_CAPTURE_NONBLOCKING = (1 << 4),
94527 +
94528 +       /* An option to try_capture to prevent atom fusion; only simple capturing is allowed */
94529 +       TXN_CAPTURE_DONT_FUSE = (1 << 5),
94530 +
94531 +       /* if it is set - copy on capture is allowed (currently commented out) */
94532 +       /*TXN_CAPTURE_CAN_COC = (1 << 6)*/
94533 +
94534 +           /* This macro selects only the exclusive capture request types, stripping out any
94535 +              options that were supplied (i.e., NONBLOCKING). */
94536 +#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
94537 +} txn_capture;
94538 +
94539 +/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING, the only
94540 +   difference is in the handling of read requests.  A WRITE_FUSING transaction handle
94541 +   defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSING
94542 +   transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
94543 +typedef enum {
94544 +       TXN_WRITE_FUSING = (1 << 0),
94545 +       TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING,  /* READ implies WRITE */
94546 +} txn_mode;
94547 +
94548 +/* Every atom has a stage, which is one of these exclusive values: */
94549 +typedef enum {
94550 +       /* Initially an atom is free. */
94551 +       ASTAGE_FREE = 0,
94552 +
94553 +       /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
94554 +          blocks and fuse with other atoms. */
94555 +       ASTAGE_CAPTURE_FUSE = 1,
94556 +
94557 +       /* We need to have a ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk where X > 1. */
94558 +
94559 +       /* When an atom reaches a certain age it must do all it can to commit.  An atom in
94560 +          the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
94561 +          atoms in the CAPTURE_FUSE stage. */
94562 +       ASTAGE_CAPTURE_WAIT = 2,
94563 +
94564 +       /* Waiting for I/O before commit.  Copy-on-capture (see
94565 +          http://namesys.com/v4/v4.html). */
94566 +       ASTAGE_PRE_COMMIT = 3,
94567 +
94568 +       /* Post-commit overwrite I/O.  Steal-on-capture. */
94569 +       ASTAGE_POST_COMMIT = 4,
94570 +
94571 +       /* An atom that waits for the last reference to it to be removed so
94572 +        * that it can be deleted from memory  */
94573 +       ASTAGE_DONE = 5,
94574 +
94575 +       /* invalid atom. */
94576 +       ASTAGE_INVALID = 6,
94577 +
94578 +} txn_stage;
94579 +
94580 +/* Bit flags that may be set in the txn_atom->flags field. */
94581 +typedef enum {
94582 +       /* Indicates that the atom should commit as soon as possible. */
94583 +       ATOM_FORCE_COMMIT = (1 << 0)
94584 +} txn_flags;
94585 +
94586 +/* Flags for controlling commit_txnh */
94587 +typedef enum {
94588 +       /* Wait for atom commit completion in commit_txnh */
94589 +       TXNH_WAIT_COMMIT = 0x2,
94590 +       /* Don't commit the atom when this handle is closed */
94591 +       TXNH_DONT_COMMIT = 0x4
94592 +} txn_handle_flags_t;
94593 +
94594 +/* TYPE DEFINITIONS */
94595 +
94596 +/* A note on lock ordering: the handle & jnode spinlock protects reading of their ->atom
94597 +   fields, so typically an operation on the atom through either of these objects must (1)
94598 +   lock the object, (2) read the atom pointer, (3) lock the atom.
94599 +
94600 +   During atom fusion, the process holds locks on both atoms at once.  Then, it iterates
94601 +   through the list of handles and pages held by the smaller of the two atoms.  For each
94602 +   handle and page referencing the smaller atom, the fusing process must: (1) lock the
94603 +   object, and (2) update the atom pointer.
94604 +
94605 +   You can see that there is a conflict of lock ordering here, so the more-complex
94606 +   procedure should have priority, i.e., the fusing process has priority so that it is
94607 +   guaranteed to make progress and to avoid restarts.
94608 +
94609 +   This decision, however, means additional complexity for acquiring the atom lock in the
94610 +   first place.
94611 +
94612 +   The general original procedure followed in the code was:
94613 +
94614 +       TXN_OBJECT *obj = ...;
94615 +       TXN_ATOM   *atom;
94616 +
94617 +       spin_lock (& obj->_lock);
94618 +
94619 +       atom = obj->_atom;
94620 +
94621 +       if (! spin_trylock_atom (atom))
94622 +         {
94623 +           spin_unlock (& obj->_lock);
94624 +           RESTART OPERATION, THERE WAS A RACE;
94625 +         }
94626 +
94627 +       ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
94628 +
94629 +
94630 +   It has however been found that this wastes CPU a lot in a manner that is
94631 +   hard to profile. So, proper refcounting was added to atoms, and new
94632 +   standard locking sequence is like following:
94633 +
94634 +       TXN_OBJECT *obj = ...;
94635 +       TXN_ATOM   *atom;
94636 +
94637 +       spin_lock (& obj->_lock);
94638 +
94639 +       atom = obj->_atom;
94640 +
94641 +       if (! spin_trylock_atom (atom))
94642 +         {
94643 +           atomic_inc (& atom->refcount);
94644 +           spin_unlock (& obj->_lock);
94645 +           spin_lock (&atom->_lock);
94646 +           atomic_dec (& atom->refcount);
94647 +           // HERE atom is locked
94648 +           spin_unlock (&atom->_lock);
94649 +           RESTART OPERATION, THERE WAS A RACE;
94650 +         }
94651 +
94652 +       ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
94653 +
94654 +   (core of this is implemented in trylock_throttle() function)
94655 +
94656 +   See the jnode_get_atom() function for a common case.
94657 +
94658 +   As an additional (and important) optimization allowing to avoid restarts,
94659 +   it is possible to re-check required pre-conditions at the HERE point in
94660 +   code above and proceed without restarting if they are still satisfied.
94661 +*/
94662 +
94663 +/* A block number set consists of only the list head. */
94664 +struct blocknr_set {
94665 +       blocknr_set_list_head entries; /* list head type generated by TYPE_SAFE_LIST_DECLARE(blocknr_set) */
94666 +};
94667 +
94668 +/* An atomic transaction: this is the underlying system representation
94669 +   of a transaction, not the one seen by clients.
94670 +
94671 +   Invariants involving this data-type:
94672 +
94673 +      [sb-fake-allocated]
94674 +*/
94675 +struct txn_atom {
94676 +       /* The spinlock protecting the atom, held during fusion and various other state
94677 +          changes. */
94678 +       reiser4_spin_data alock;
94679 +
94680 +       /* The atom's reference counter. Incrementing it (when duplicating an
94681 +          existing reference, or when we are sure that some other reference
94682 +          exists) may be done without taking the spinlock; decrementing the
94683 +          ref. counter requires the spinlock to be held.
94684 +
94685 +          Each transaction handle counts in ->refcount. All jnodes count as
94686 +          one reference acquired in atom_begin_andlock(), released in
94687 +          commit_current_atom().
94688 +       */
94689 +       atomic_t refcount;
94690 +
94691 +       /* The atom_id identifies the atom in persistent records such as the log. */
94692 +       __u32 atom_id;
94693 +
94694 +       /* Flags holding any of the txn_flags enumerated values (e.g.,
94695 +          ATOM_FORCE_COMMIT). */
94696 +       __u32 flags;
94697 +
94698 +       /* Number of open handles. */
94699 +       __u32 txnh_count;
94700 +
94701 +       /* The number of znodes captured by this atom.  Equal to the sum of lengths of the
94702 +          dirty_nodes[level] and clean_nodes lists. */
94703 +       __u32 capture_count;
94704 +
94705 +#if REISER4_DEBUG
94706 +       int clean;
94707 +       int dirty;
94708 +       int ovrwr;
94709 +       int wb;
94710 +       int fq;
94711 +       int protect;
94712 +#endif
94713 +
94714 +       __u32 flushed;
94715 +
94716 +       /* Current transaction stage. */
94717 +       txn_stage stage;
94718 +
94719 +       /* Start time. */
94720 +       unsigned long start_time;
94721 +
94722 +       /* The atom's delete set. It collects block numbers of the nodes
94723 +          which were deleted during the transaction. */
94724 +       blocknr_set delete_set;
94725 +
94726 +       /* The atom's wandered_block mapping. */
94727 +       blocknr_set wandered_map;
94728 +
94729 +       /* The transaction's list of dirty captured nodes--per level.  Indexed
94730 +          by (level). dirty_nodes[0] is for znode-above-root */
94731 +       capture_list_head dirty_nodes1[REAL_MAX_ZTREE_HEIGHT + 1];
94732 +
94733 +       /* The transaction's list of clean captured nodes. */
94734 +       capture_list_head clean_nodes1;
94735 +
94736 +       /* The atom's overwrite set */
94737 +       capture_list_head ovrwr_nodes1;
94738 +
94739 +       /* nodes which are being written to disk */
94740 +       capture_list_head writeback_nodes1;
94741 +
94742 +       /* list of inodes */
94743 +       capture_list_head inodes;
94744 +
94745 +       /* List of handles associated with this atom. */
94746 +       txnh_list_head txnh_list;
94747 +
94748 +       /* Transaction list link: list of atoms in the transaction manager. */
94749 +       atom_list_link atom_link;
94750 +
94751 +       /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
94752 +       fwaitfor_list_head fwaitfor_list;
94753 +
94754 +       /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
94755 +       fwaiting_list_head fwaiting_list;
94756 +
94757 +       prot_list_head protected;
94758 +
94759 +       /* Numbers of objects which were deleted/created in this transaction,
94760 +          and thereby the numbers of object IDs which were released/allocated. */
94761 +       int nr_objects_deleted;
94762 +       int nr_objects_created;
94763 +       /* number of blocks allocated during the transaction */
94764 +       __u64 nr_blocks_allocated;
94765 +       /* All of the atom's flush queue objects are on this list */
94766 +       fq_list_head flush_queues;
94767 +#if REISER4_DEBUG
94768 +       /* number of flush queues for this atom. */
94769 +       int nr_flush_queues;
94770 +       /* Number of jnodes which were removed from atom's lists and put
94771 +          on flush_queue */
94772 +       int num_queued;
94773 +#endif
94774 +       /* number of threads which wait for this atom to complete commit */
94775 +       int nr_waiters;
94776 +       /* number of threads which do jnode_flush() over this atom */
94777 +       int nr_flushers;
94778 +       /* number of flush queues which are IN_USE and jnodes from fq->prepped
94779 +          are submitted to disk by the write_fq() routine. */
94780 +       int nr_running_queues;
94781 +       /* A counter of grabbed unformatted nodes, see a description of the
94782 +        * reiser4 space reservation scheme at block_alloc.c */
94783 +       reiser4_block_nr flush_reserved;
94784 +#if REISER4_DEBUG
94785 +       void *committer;
94786 +#endif
94787 +};
94788 +
94789 +#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes1[level])
94790 +#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes1)
94791 +#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes1)
94792 +#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes1)
94793 +#define ATOM_FQ_LIST(fq) (&(fq)->prepped1)
94794 +
94795 +#define NODE_LIST(node) (node)->list1
94796 +#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
94797 +ON_DEBUG(void count_jnode(txn_atom *, jnode *, atom_list old_list, atom_list new_list, int check_lists));
94798 +
94799 +typedef struct protected_jnodes {
94800 +       prot_list_link inatom;
94801 +       capture_list_head nodes;
94802 +} protected_jnodes;
94803 +
94804 +TYPE_SAFE_LIST_DEFINE(prot, protected_jnodes, inatom);
94805 +
94806 +TYPE_SAFE_LIST_DEFINE(atom, txn_atom, atom_link);
94807 +
94808 +/* A transaction handle: the client obtains and commits this handle which is assigned by
94809 +   the system to a txn_atom. */
94810 +struct txn_handle {
94811 +       /* Spinlock protecting ->atom pointer */
94812 +       reiser4_spin_data hlock;
94813 +
94814 +       /* Flags for controlling commit_txnh() behavior */
94815 +       /* from txn_handle_flags_t */
94816 +       txn_handle_flags_t flags;
94817 +
94818 +       /* Whether it is READ_FUSING or WRITE_FUSING. */
94819 +       txn_mode mode;
94820 +
94821 +       /* If assigned, the atom it is part of. */
94822 +       txn_atom *atom;
94823 +
94824 +       /* Transaction list link. */
94825 +       txnh_list_link txnh_link;
94826 +};
94827 +
94828 +TYPE_SAFE_LIST_DECLARE(txn_mgrs);
94829 +
94830 +/* The transaction manager: one is contained in the reiser4_super_info_data */
94831 +struct txn_mgr {
94832 +       /* A spinlock protecting the atom list, id_count, flush_control */
94833 +       reiser4_spin_data tmgr_lock;
94834 +
94835 +       /* List of atoms. */
94836 +       atom_list_head atoms_list;
94837 +
94838 +       /* Number of atoms. */
94839 +       int atom_count;
94840 +
94841 +       /* A counter used to assign atom->atom_id values. */
94842 +       __u32 id_count;
94843 +
94844 +       /* a semaphore object for commit serialization */
94845 +       struct semaphore commit_semaphore;
94846 +
94847 +       /* a list of all txnmgrs served by a particular daemon. */
94848 +       txn_mgrs_list_link linkage;
94849 +
94850 +       /* description of daemon for this txnmgr */
94851 +       ktxnmgrd_context *daemon;
94852 +
94853 +       /* parameters. Adjustable through mount options. */
94854 +       unsigned int atom_max_size;
94855 +       unsigned int atom_max_age;
94856 +       /* max number of concurrent flushers for one atom, 0 - unlimited.  */
94857 +       unsigned int atom_max_flushers;
94858 +};
94859 +
94860 +/* list of all transaction managers in a system */
94861 +TYPE_SAFE_LIST_DEFINE(txn_mgrs, txn_mgr, linkage);
94862 +
94863 +/* FUNCTION DECLARATIONS */
94864 +
94865 +/* These are the externally (within Reiser4) visible transaction functions, therefore they
94866 +   are prefixed with "txn_".  For comments, see txnmgr.c. */
94867 +
94868 +extern int txnmgr_init_static(void);
94869 +extern void txnmgr_init(txn_mgr * mgr);
94870 +
94871 +extern int txnmgr_done_static(void);
94872 +extern int txnmgr_done(txn_mgr * mgr);
94873 +
94874 +extern int txn_reserve(int reserved);
94875 +
94876 +extern void txn_begin(reiser4_context * context);
94877 +extern long txn_end(reiser4_context * context);
94878 +
94879 +extern void txn_restart(reiser4_context * context);
94880 +extern void txn_restart_current(void);
94881 +
94882 +extern int txnmgr_force_commit_current_atom(void);
94883 +extern int txnmgr_force_commit_all(struct super_block *, int);
94884 +extern int current_atom_should_commit(void);
94885 +
94886 +extern jnode * find_first_dirty_jnode (txn_atom *, int);
94887 +
94888 +extern int commit_some_atoms(txn_mgr *);
94889 +extern int flush_current_atom (int, long *, txn_atom **);
94890 +
94891 +extern int flush_some_atom(long *, const struct writeback_control *, int);
94892 +
94893 +extern void atom_set_stage(txn_atom *atom, txn_stage stage);
94894 +
94895 +extern int same_slum_check(jnode * base, jnode * check, int alloc_check, int alloc_value);
94896 +extern void atom_dec_and_unlock(txn_atom * atom);
94897 +
94898 +extern txn_capture build_capture_mode(jnode           * node,
94899 +                                     znode_lock_mode   lock_mode,
94900 +                                     txn_capture       flags);
94901 +
94902 +extern int try_capture(jnode * node, znode_lock_mode mode, txn_capture flags, int can_coc);
94903 +extern int try_capture_page_to_invalidate(struct page *pg);
94904 +
94905 +extern void uncapture_page(struct page *pg);
94906 +extern void uncapture_block(jnode *);
94907 +extern void uncapture_jnode(jnode *);
94908 +
94909 +extern int capture_inode(struct inode *);
94910 +extern int uncapture_inode(struct inode *);
94911 +
94912 +extern txn_atom *txnh_get_atom(txn_handle * txnh);
94913 +extern txn_atom *get_current_atom_locked_nocheck(void);
94914 +
94915 +#define atom_is_protected(atom) (spin_atom_is_locked(atom) || (atom)->stage >= ASTAGE_PRE_COMMIT)
94916 +
94917 +/* Get the current atom and spinlock it. A current atom must be present: a NULL result from get_current_atom_locked_nocheck() trips assertion "zam-761". */
94918 +static inline txn_atom *
94919 +get_current_atom_locked(void)
94920 +{
94921 +       txn_atom *atom;
94922 +
94923 +       atom = get_current_atom_locked_nocheck();
94924 +       assert("zam-761", atom != NULL);
94925 +
94926 +       return atom;
94927 +}
94928 +
94929 +extern txn_atom *jnode_get_atom(jnode *);
94930 +
94931 +extern void atom_wait_event(txn_atom *);
94932 +extern void atom_send_event(txn_atom *);
94933 +
94934 +extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
94935 +extern int capture_super_block(struct super_block *s);
94936 +
94937 +extern int jnodes_of_one_atom(jnode *, jnode *);
94938 +
94939 +/* See the comment on the function blocknrset.c:blocknr_set_add for the
94940 +   calling convention of these three routines. */
94941 +extern void blocknr_set_init(blocknr_set * bset);
94942 +extern void blocknr_set_destroy(blocknr_set * bset);
94943 +extern void blocknr_set_merge(blocknr_set * from, blocknr_set * into);
94944 +extern int blocknr_set_add_extent(txn_atom * atom,
94945 +                                 blocknr_set * bset,
94946 +                                 blocknr_set_entry ** new_bsep,
94947 +                                 const reiser4_block_nr * start, const reiser4_block_nr * len);
94948 +extern int blocknr_set_add_pair(txn_atom * atom,
94949 +                               blocknr_set * bset,
94950 +                               blocknr_set_entry ** new_bsep, const reiser4_block_nr * a, const reiser4_block_nr * b);
94951 +
94952 +typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *, const reiser4_block_nr *, void *);
94953 +
94954 +extern int blocknr_set_iterator(txn_atom * atom, blocknr_set * bset, blocknr_set_actor_f actor, void *data, int delete);
94955 +
94956 +/* flush code takes care of how to fuse flush queues */
94957 +extern void flush_init_atom(txn_atom * atom);
94958 +extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
94959 +
94960 +/* INLINE FUNCTIONS */
94961 +
94962 +#define spin_ordering_pred_atom(atom)                          \
94963 +       ( ( lock_counters() -> spin_locked_txnh == 0 ) &&       \
94964 +         ( lock_counters() -> spin_locked_jnode == 0 ) &&      \
94965 +         ( lock_counters() -> rw_locked_zlock == 0 ) &&        \
94966 +         ( lock_counters() -> rw_locked_dk == 0 ) &&           \
94967 +         ( lock_counters() -> rw_locked_tree == 0 ) )
94968 +
94969 +#define spin_ordering_pred_txnh(txnh)                          \
94970 +       ( ( lock_counters() -> rw_locked_dk == 0 ) &&           \
94971 +         ( lock_counters() -> rw_locked_zlock == 0 ) &&        \
94972 +         ( lock_counters() -> rw_locked_tree == 0 ) )
94973 +
94974 +#define spin_ordering_pred_txnmgr(tmgr)                        \
94975 +       ( ( lock_counters() -> spin_locked_atom == 0 ) &&       \
94976 +         ( lock_counters() -> spin_locked_txnh == 0 ) &&       \
94977 +         ( lock_counters() -> spin_locked_jnode == 0 ) &&      \
94978 +         ( lock_counters() -> rw_locked_zlock == 0 ) &&        \
94979 +         ( lock_counters() -> rw_locked_dk == 0 ) &&           \
94980 +         ( lock_counters() -> rw_locked_tree == 0 ) )
94981 +
94982 +SPIN_LOCK_FUNCTIONS(atom, txn_atom, alock);
94983 +SPIN_LOCK_FUNCTIONS(txnh, txn_handle, hlock);
94984 +SPIN_LOCK_FUNCTIONS(txnmgr, txn_mgr, tmgr_lock);
94985 +
94986 +typedef enum {
94987 +       FQ_IN_USE = 0x1
94988 +} flush_queue_state_t;
94989 +
94990 +typedef struct flush_queue flush_queue_t;
94991 +
94992 +/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
94993 +   is filled by the jnode_flush() routine, and written to disk under memory
94994 +   pressure or at atom commit time. */
94995 +/* LOCKING: fq state and fq->atom are protected by guard spinlock, fq->nr_queued
94996 +   field and fq->prepped list can be modified if atom is spin-locked and fq
94997 +   object is in the "in-use" state.  For read-only traversal of the fq->prepped
94998 +   list and reading of the fq->nr_queued field it is enough to keep fq "in-use"
94999 +   or only have atom spin-locked. */
95000 +struct flush_queue {
95001 +       /* linkage element is the first in this structure to make debugging
95002 +          easier.  See field in atom struct for description of list. */
95003 +       fq_list_link alink;
95004 +       /* A spinlock to protect changes of fq state and fq->atom pointer */
95005 +       reiser4_spin_data guard;
95006 +       /* flush_queue state: [in_use | ready] */
95007 +       flush_queue_state_t state;
95008 +       /* A list which contains queued nodes; queued nodes are removed from any
95009 +        * atom's list and put on this ->prepped one. */
95010 +       capture_list_head prepped1;
95011 +       /* number of submitted i/o requests */
95012 +       atomic_t nr_submitted;
95013 +       /* number of i/o errors */
95014 +       atomic_t nr_errors;
95015 +       /* An atom this flush queue is attached to */
95016 +       txn_atom *atom;
95017 +       /* A semaphore for waiting on i/o completion */
95018 +       struct semaphore io_sem;
95019 +#if REISER4_DEBUG
95020 +       /* A thread which took this fq in exclusive use, NULL if fq is free,
95021 +        * used for debugging. */
95022 +       struct task_struct *owner;
95023 +#endif
95024 +};
95025 +
95026 +extern int fq_by_atom(txn_atom *, flush_queue_t **);
95027 +extern int fq_by_atom_gfp(txn_atom *, flush_queue_t **, int);
95028 +extern int fq_by_jnode(jnode *, flush_queue_t **);
95029 +extern int fq_by_jnode_gfp(jnode *, flush_queue_t **, int);
95030 +extern void fq_put_nolock(flush_queue_t *);
95031 +extern void fq_put(flush_queue_t *);
95032 +extern void fuse_fq(txn_atom * to, txn_atom * from);
95033 +extern void queue_jnode(flush_queue_t *, jnode *);
95034 +extern void mark_jnode_queued(flush_queue_t *, jnode *);
95035 +
95036 +extern int write_fq(flush_queue_t *, long *, int);
95037 +extern int current_atom_finish_all_fq(void);
95038 +extern void init_atom_fq_parts(txn_atom *);
95039 +
95040 +extern unsigned int txnmgr_get_max_atom_size(struct super_block *super);
95041 +extern reiser4_block_nr txnmgr_count_deleted_blocks (void);
95042 +
95043 +extern void znode_make_dirty(znode * node);
95044 +extern void jnode_make_dirty_locked(jnode * node);
95045 +
95046 +extern int sync_atom(txn_atom *atom);
95047 +
95048 +#if REISER4_DEBUG
95049 +extern int atom_fq_parts_are_clean (txn_atom *);
95050 +#endif
95051 +
95052 +extern void add_fq_to_bio(flush_queue_t *, struct bio *);
95053 +extern flush_queue_t *get_fq_for_current_atom(void);
95054 +
95055 +void protected_jnodes_init(protected_jnodes *list);
95056 +void protected_jnodes_done(protected_jnodes *list);
95057 +void invalidate_list(capture_list_head * head);
95058 +
95059 +/* Debugging */
95060 +#if REISER4_DEBUG_OUTPUT
95061 +void print_atom(const char *prefix, txn_atom * atom);
95062 +void info_atom(const char *prefix, const txn_atom * atom);
95063 +#else
95064 +#define       print_atom(p,a) noop
95065 +#define       info_atom(p,a) noop
95066 +#endif
95067 +
95068 +# endif                                /* __REISER4_TXNMGR_H__ */
95069 +
95070 +/* Make Linus happy.
95071 +   Local variables:
95072 +   c-indentation-style: "K&R"
95073 +   mode-name: "LC"
95074 +   c-basic-offset: 8
95075 +   tab-width: 8
95076 +   fill-column: 120
95077 +   End:
95078 +*/
95079 diff -rupN linux-2.6.8-rc3/fs/reiser4/type_safe_hash.h linux-2.6.8-rc3-a/fs/reiser4/type_safe_hash.h
95080 --- linux-2.6.8-rc3/fs/reiser4/type_safe_hash.h 1970-01-01 03:00:00.000000000 +0300
95081 +++ linux-2.6.8-rc3-a/fs/reiser4/type_safe_hash.h       2004-08-05 21:20:53.175641027 +0400
95082 @@ -0,0 +1,332 @@
95083 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
95084 + * reiser4/README */
95085 +
95086 +/* A hash table class that uses hash chains (singly-linked) and is
95087 +   parametrized to provide type safety.  */
95088 +
95089 +#ifndef __REISER4_TYPE_SAFE_HASH_H__
95090 +#define __REISER4_TYPE_SAFE_HASH_H__
95091 +
95092 +#include "debug.h"
95093 +#include "stats.h"
95094 +
95095 +#include <asm/errno.h>
95096 +/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects
95097 +   based on the object type.  You need to declare the item type before
95098 +   this definition, define it after this definition. */
95099 +#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE)                                                     \
95100 +                                                                                              \
95101 +typedef struct PREFIX##_hash_table_  PREFIX##_hash_table;                                     \
95102 +typedef struct PREFIX##_hash_link_   PREFIX##_hash_link;                                      \
95103 +                                                                                              \
95104 +struct PREFIX##_hash_table_                                                                   \
95105 +{                                                                                             \
95106 +  ITEM_TYPE  **_table;                                                                        \
95107 +  __u32        _buckets;                                                                      \
95108 +  tshash_stat *_stats;                                                                        \
95109 +};                                                                                            \
95110 +                                                                                              \
95111 +struct PREFIX##_hash_link_                                                                    \
95112 +{                                                                                             \
95113 +  ITEM_TYPE *_next;                                                                           \
95114 +}
95115 +
95116 +/* Step 2: Define the object type of the hash: give it field of type
95117 +   PREFIX_hash_link. */
95118 +
95119 +/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
95120 +   the type and field name used in step 2.  The arguments are:
95121 +
95122 +   ITEM_TYPE    The item type being hashed
95123 +   KEY_TYPE     The type of key being hashed
95124 +   KEY_NAME     The name of the key field within the item
95125 +   LINK_NAME    The name of the link field within the item (which you must make type PREFIX_hash_link)
95126 +   HASH_FUNC    The name of the hash function (or macro, takes const pointer to key)
95127 +   EQ_FUNC      The name of the equality function (or macro, takes const pointer to two keys)
95128 +
95129 +   It implements these functions:
95130 +
95131 +   prefix_hash_init           Initialize the table given its size.
95132 +   prefix_hash_insert         Insert an item
95133 +   prefix_hash_insert_index   Insert an item w/ precomputed hash_index
95134 +   prefix_hash_find           Find an item by key
95135 +   prefix_hash_find_index     Find an item w/ precomputed hash_index
95136 +   prefix_hash_remove         Remove an item, returns 1 if found, 0 if not found
95137 +   prefix_hash_remove_index   Remove an item w/ precomputed hash_index
95138 +
95139 +   If you'd like something to be done differently, feel free to ask me
95140 +   for modifications.  Additional features that could be added but
95141 +   have not been:
95142 +
95143 +   prefix_hash_remove_key           Find and remove an item by key
95144 +   prefix_hash_remove_key_index     Find and remove an item by key w/ precomputed hash_index
95145 +
95146 +   The hash_function currently receives only the key as an argument,
95147 +   meaning it must somehow know the number of buckets.  If this is a
95148 +   problem let me know.
95149 +
95150 +   This hash table uses a single-linked hash chain.  This means
95151 +   insertion is fast but deletion requires searching the chain.
95152 +
95153 +   There is also the doubly-linked hash chain approach, under which
95154 +   deletion requires no search but the code is longer and it takes two
95155 +   pointers per item.
95156 +
95157 +   The circularly-linked approach has the shortest code but requires
95158 +   two pointers per bucket, doubling the size of the bucket array (in
95159 +   addition to two pointers per item).
95160 +*/
95161 +#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC)  \
95162 +                                                                                       \
95163 +static __inline__ void                                                                 \
95164 +PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG,                            \
95165 +                    __u32                hash UNUSED_ARG)                              \
95166 +{                                                                                      \
95167 +       assert("nikita-2780", hash < table->_buckets);                                  \
95168 +}                                                                                      \
95169 +                                                                                       \
95170 +static __inline__ int                                                                  \
95171 +PREFIX##_hash_init (PREFIX##_hash_table *hash,                                         \
95172 +                   __u32                buckets,                                       \
95173 +                   tshash_stat         *stats)                                         \
95174 +{                                                                                      \
95175 +  hash->_table   = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets);              \
95176 +  hash->_buckets = buckets;                                                            \
95177 +  hash->_stats = stats;                                                                \
95178 +  if (hash->_table == NULL)                                                            \
95179 +    {                                                                                  \
95180 +      return RETERR(-ENOMEM);                                                          \
95181 +    }                                                                                  \
95182 +  xmemset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets);                            \
95183 +  ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets));                      \
95184 +  return 0;                                                                            \
95185 +}                                                                                      \
95186 +                                                                                       \
95187 +static __inline__ void                                                                 \
95188 +PREFIX##_hash_done (PREFIX##_hash_table *hash)                                         \
95189 +{                                                                                      \
95190 +  if (REISER4_DEBUG && hash->_table != NULL) {                                          \
95191 +           __u32 i;                                                                    \
95192 +           for (i = 0 ; i < hash->_buckets ; ++ i)                                     \
95193 +                   assert("nikita-2905", hash->_table[i] == NULL);                     \
95194 +  }                                                                                     \
95195 +  if (hash->_table != NULL)                                                            \
95196 +    KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets);                                \
95197 +  hash->_table = NULL;                                                                 \
95198 +}                                                                                      \
95199 +                                                                                       \
95200 +static __inline__ void                                                                 \
95201 +PREFIX##_hash_prefetch_next (ITEM_TYPE *item)                                          \
95202 +{                                                                                      \
95203 +       prefetch(item->LINK_NAME._next);                                                \
95204 +}                                                                                      \
95205 +                                                                                       \
95206 +static __inline__ void                                                                 \
95207 +PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash,                              \
95208 +                              __u32                index)                              \
95209 +{                                                                                      \
95210 +       prefetch(hash->_table[index]);                                                  \
95211 +}                                                                                      \
95212 +                                                                                       \
95213 +static __inline__ ITEM_TYPE*                                                           \
95214 +PREFIX##_hash_find_index (PREFIX##_hash_table *hash,                                   \
95215 +                         __u32                hash_index,                              \
95216 +                         KEY_TYPE const      *find_key)                                \
95217 +{                                                                                      \
95218 +  ITEM_TYPE *item;                                                                     \
95219 +                                                                                       \
95220 +  PREFIX##_check_hash(hash, hash_index);                                               \
95221 +  TSHASH_LOOKUP(hash->_stats);                                                         \
95222 +                                                                                       \
95223 +  for (item  = hash->_table[hash_index];                                               \
95224 +       item != NULL;                                                                   \
95225 +       item  = item->LINK_NAME._next)                                                  \
95226 +    {                                                                                  \
95227 +      TSHASH_SCANNED(hash->_stats);                                                    \
95228 +      prefetch(item->LINK_NAME._next);                                                 \
95229 +      prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME));                 \
95230 +      if (EQ_FUNC (& item->KEY_NAME, find_key))                                                \
95231 +        {                                                                              \
95232 +          return item;                                                                 \
95233 +        }                                                                              \
95234 +    }                                                                                  \
95235 +                                                                                       \
95236 +  return NULL;                                                                         \
95237 +}                                                                                      \
95238 +                                                                                       \
95239 +static __inline__ ITEM_TYPE*                                                           \
95240 +PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash,                               \
95241 +                             __u32                hash_index,                          \
95242 +                             KEY_TYPE const      *find_key)                            \
95243 +{                                                                                      \
95244 +  ITEM_TYPE ** item = &hash->_table[hash_index];                                        \
95245 +                                                                                       \
95246 +  PREFIX##_check_hash(hash, hash_index);                                               \
95247 +  TSHASH_LOOKUP(hash->_stats);                                                         \
95248 +                                                                                        \
95249 +  while (*item != NULL) {                                                               \
95250 +    TSHASH_SCANNED(hash->_stats);                                                      \
95251 +    prefetch(&(*item)->LINK_NAME._next);                                               \
95252 +    if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) {                                       \
95253 +      ITEM_TYPE *found;                                                                \
95254 +                                                                                       \
95255 +      found = *item;                                                                   \
95256 +      *item = found->LINK_NAME._next;                                                   \
95257 +      found->LINK_NAME._next = hash->_table[hash_index];                               \
95258 +      hash->_table[hash_index] = found;                                                        \
95259 +      return found;                                                                     \
95260 +    }                                                                                   \
95261 +    item = &(*item)->LINK_NAME._next;                                                   \
95262 +  }                                                                                    \
95263 +  return NULL;                                                                         \
95264 +}                                                                                      \
95265 +                                                                                       \
95266 +static __inline__ int                                                                  \
95267 +PREFIX##_hash_remove_index (PREFIX##_hash_table *hash,                                 \
95268 +                           __u32                hash_index,                            \
95269 +                           ITEM_TYPE           *del_item)                              \
95270 +{                                                                                      \
95271 +  ITEM_TYPE ** hash_item_p = &hash->_table[hash_index];                                 \
95272 +                                                                                       \
95273 +  PREFIX##_check_hash(hash, hash_index);                                               \
95274 +  TSHASH_REMOVE(hash->_stats);                                                         \
95275 +                                                                                        \
95276 +  while (*hash_item_p != NULL) {                                                        \
95277 +    TSHASH_SCANNED(hash->_stats);                                                      \
95278 +    prefetch(&(*hash_item_p)->LINK_NAME._next);                                                \
95279 +    if (*hash_item_p == del_item) {                                                     \
95280 +      *hash_item_p = (*hash_item_p)->LINK_NAME._next;                                   \
95281 +      return 1;                                                                         \
95282 +    }                                                                                   \
95283 +    hash_item_p = &(*hash_item_p)->LINK_NAME._next;                                     \
95284 +  }                                                                                    \
95285 +  return 0;                                                                            \
95286 +}                                                                                      \
95287 +                                                                                       \
95288 +static __inline__ void                                                                 \
95289 +PREFIX##_hash_insert_index (PREFIX##_hash_table *hash,                                 \
95290 +                           __u32                hash_index,                            \
95291 +                           ITEM_TYPE           *ins_item)                              \
95292 +{                                                                                      \
95293 +  PREFIX##_check_hash(hash, hash_index);                                               \
95294 +  TSHASH_INSERT(hash->_stats);                                                         \
95295 +                                                                                       \
95296 +  ins_item->LINK_NAME._next = hash->_table[hash_index];                                        \
95297 +  hash->_table[hash_index]  = ins_item;                                                        \
95298 +}                                                                                      \
95299 +                                                                                       \
95300 +static __inline__ void                                                                 \
95301 +PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash,                             \
95302 +                               __u32                hash_index,                        \
95303 +                               ITEM_TYPE           *ins_item)                          \
95304 +{                                                                                      \
95305 +  PREFIX##_check_hash(hash, hash_index);                                               \
95306 +  TSHASH_INSERT(hash->_stats);                                                         \
95307 +                                                                                       \
95308 +  ins_item->LINK_NAME._next = hash->_table[hash_index];                                        \
95309 +  smp_wmb();                                                                           \
95310 +  hash->_table[hash_index]  = ins_item;                                                        \
95311 +}                                                                                      \
95312 +                                                                                       \
95313 +static __inline__ ITEM_TYPE*                                                           \
95314 +PREFIX##_hash_find (PREFIX##_hash_table *hash,                                         \
95315 +                   KEY_TYPE const      *find_key)                                      \
95316 +{                                                                                      \
95317 +  return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key);         \
95318 +}                                                                                      \
95319 +                                                                                       \
95320 +static __inline__ ITEM_TYPE*                                                           \
95321 +PREFIX##_hash_find_lru (PREFIX##_hash_table *hash,                                     \
95322 +                       KEY_TYPE const      *find_key)                                  \
95323 +{                                                                                      \
95324 +  return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key);     \
95325 +}                                                                                      \
95326 +                                                                                       \
95327 +static __inline__ int                                                                  \
95328 +PREFIX##_hash_remove (PREFIX##_hash_table *hash,                                       \
95329 +                     ITEM_TYPE           *del_item)                                    \
95330 +{                                                                                      \
95331 +  return PREFIX##_hash_remove_index (hash,                                             \
95332 +                                     HASH_FUNC(hash, &del_item->KEY_NAME), del_item);  \
95333 +}                                                                                      \
95334 +                                                                                       \
95335 +static __inline__ int                                                                  \
95336 +PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash,                                   \
95337 +                     ITEM_TYPE           *del_item)                                    \
95338 +{                                                                                      \
95339 +  return PREFIX##_hash_remove (hash, del_item);                                                \
95340 +}                                                                                      \
95341 +                                                                                       \
95342 +static __inline__ void                                                                 \
95343 +PREFIX##_hash_insert (PREFIX##_hash_table *hash,                                       \
95344 +                     ITEM_TYPE           *ins_item)                                    \
95345 +{                                                                                      \
95346 +  return PREFIX##_hash_insert_index (hash,                                             \
95347 +                                     HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item);  \
95348 +}                                                                                      \
95349 +                                                                                       \
95350 +static __inline__ void                                                                 \
95351 +PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash,                                   \
95352 +                         ITEM_TYPE           *ins_item)                                \
95353 +{                                                                                      \
95354 +  return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME),           \
95355 +                                         ins_item);                                    \
95356 +}                                                                                      \
95357 +                                                                                       \
95358 +static __inline__ ITEM_TYPE *                                                          \
95359 +PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind)                             \
95360 +{                                                                                      \
95361 +  ITEM_TYPE *first;                                                                    \
95362 +                                                                                       \
95363 +  for (first = NULL; ind < hash->_buckets; ++ ind) {                                   \
95364 +    first = hash->_table[ind];                                                         \
95365 +    if (first != NULL)                                                                 \
95366 +      break;                                                                           \
95367 +  }                                                                                    \
95368 +  return first;                                                                                \
95369 +}                                                                                      \
95370 +                                                                                       \
95371 +static __inline__ ITEM_TYPE *                                                          \
95372 +PREFIX##_hash_next (PREFIX##_hash_table *hash,                                         \
95373 +                   ITEM_TYPE           *item)                                          \
95374 +{                                                                                      \
95375 +  ITEM_TYPE  *next;                                                                    \
95376 +                                                                                       \
95377 +  if (item == NULL)                                                                    \
95378 +    return NULL;                                                                       \
95379 +  next = item->LINK_NAME._next;                                                                \
95380 +  if (next == NULL)                                                                    \
95381 +    next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1);           \
95382 +  return next;                                                                         \
95383 +}                                                                                      \
95384 +                                                                                       \
95385 +typedef struct {} PREFIX##_hash_dummy
95386 +
95387 +#define for_all_ht_buckets(table, head) /* (head) visits every bucket slot */  \
95388 +for ((head) = &(table) -> _table[ 0 ] ; /* begin at slot 0 */                   \
95389 +     (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head)) /* stop one past the last slot */
95390 +
95391 +#define for_all_in_bucket(bucket, item, next, field) /* walk one bucket chain */ \
95392 +for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ;      \
95393 +     (item) != NULL ; /* (next) was read before the loop body runs */          \
95394 +     (item) = (next), (next) = (item) ? (item) -> field._next : NULL ) /* step uses the saved (next), not the old (item) */
95395 +
95396 +#define for_all_in_htable(table, prefix, item, next) /* walk all items */ \
95397 +for ((item) = prefix ## _hash_first ((table), 0),      \
95398 +     (next) = prefix ## _hash_next ((table), (item)) ; /* NULL if (item) is NULL */ \
95399 +     (item) != NULL ;                                  \
95400 +     (item) = (next), /* successor saved before the body ran */ \
95401 +     (next) = prefix ## _hash_next ((table), (item)))
95402 +
95403 +/* __REISER4_TYPE_SAFE_HASH_H__ */
95404 +#endif
95405 +
95406 +/* Make Linus happy.
95407 +   Local variables:
95408 +   c-indentation-style: "K&R"
95409 +   mode-name: "LC"
95410 +   c-basic-offset: 8
95411 +   tab-width: 8
95412 +   fill-column: 120
95413 +   End:
95414 +*/
95415 diff -rupN linux-2.6.8-rc3/fs/reiser4/type_safe_list.h linux-2.6.8-rc3-a/fs/reiser4/type_safe_list.h
95416 --- linux-2.6.8-rc3/fs/reiser4/type_safe_list.h 1970-01-01 03:00:00.000000000 +0300
95417 +++ linux-2.6.8-rc3-a/fs/reiser4/type_safe_list.h       2004-08-05 21:20:53.020673714 +0400
95418 @@ -0,0 +1,436 @@
95419 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
95420 +
95421 +#ifndef __REISER4_TYPE_SAFE_LIST_H__
95422 +#define __REISER4_TYPE_SAFE_LIST_H__
95423 +
95424 +#include "debug.h"
95425 +/* A circular doubly linked list that differs from the previous
95426 +   <linux/list.h> implementation because it is parametrized to provide
95427 +   type safety.  This data structure is also useful as a queue or stack.
95428 +
95429 +   The "list template" consists of a set of types and methods for
95430 +   implementing list operations.  All of the types and methods
95431 +   associated with a single list class are assigned unique names and
95432 +   type signatures, thus allowing the compiler to verify correct
95433 +   usage.
95434 +
95435 +   The first parameter of a list class is the item type being stored
95436 +   in the list.  The list class maintains two pointers within each
95437 +   item structure for its "next" and "prev" pointers.
95438 +
95439 +   There are two structures associated with the list, in addition to
95440 +   the item type itself.  The "list link" contains the two pointers
95441 +   that are embedded within the item itself.  The "list head" also
95442 +   contains two pointers which refer to the first item ("front") and
95443 +   last item ("back") of the list.
95444 +
95445 +   The list maintains a "circular" invariant, in that you can always
95446 +   begin at the front and follow "next" pointers until eventually you
95447 +   reach the same point.  The "list head" is included within the
95448 +   cycle, even though it does not have the correct item type.  The
95449 +   "list head" and "list link" types are different objects from the
95450 +   user's perspective, but the core algorithms that operate on this
95451 +   style of list treat the "list head" and "list link" as identical
95452 +   types.  That is why these algorithms are so simple.
95453 +
95454 +   The <linux/list.h> implementation uses the same algorithms as those
95455 +   in this file but uses only a single type "struct list_head".  There
95456 +   are two problems with this approach.  First, there are no type
95457 +   distinctions made between the two objects despite their distinct
95458 +   types, which greatly increases the possibility for mistakes.  For
95459 +   example, the <linux/list.h> list_add function takes two "struct
95460 +   list_head" arguments: the first is the item being inserted and the
95461 +   second is the "struct list_head" which should precede the new
95462 +   insertion to the list.  You can use this function to insert at any
95463 +   point in the list, but by far the most common list operations are
95464 +   to insert at the front or back of the list.  This common case
95465 +   should accept two different argument types: a "list head" and an
95466 +   "item"; this leaves no room for confusion.
95467 +
95468 +   The second problem with using a single "struct list_head" is that
95469 +   it does not distinguish between list objects of distinct list
95470 +   classes.  If a single item can belong to two separate lists, there
95471 +   is easily the possibility of a mistake being made that causes the
95472 +   item to be added to a "list head" using the wrong "list link".  By
95473 +   using a parametrized list class we can statically detect such
95474 +   mistakes, detecting mistakes as soon as they happen.
95475 +
95476 +   To create a new list class takes several steps which are described
95477 +   below.  Suppose for this example that you would like to link
95478 +   together items of type "rx_event".  You should decide on a
95479 +   prefix-name to be used on all list functions and structures.  For
95480 +   example, the string "rx_event" can be used as a prefix for all the list
95481 +   operations, resulting in a "list head" named rx_event_list_head and
95482 +   a "list link" named rx_event_list_link.  The list operations on
95483 +   this list class would be named "rx_event_list_empty",
95484 +   "rx_event_list_init", "rx_event_list_push_front",
95485 +   "rx_event_list_push_back", and so on.
95486 +*/
95487 +
95488 +#define TYPE_SAFE_LIST_LINK_INIT(name) { &(name), &(name) } /* both pointers refer to the link itself */
95489 +#define TYPE_SAFE_LIST_HEAD_INIT(name) { (void *)&(name), (void *)&(name) } /* same for a head; cast because its members are link pointers */
95490 +#define TYPE_SAFE_LIST_LINK_ZERO { NULL, NULL } /* both pointers NULL */
95491 +#define TYPE_SAFE_LIST_HEAD_ZERO { NULL, NULL } /* both pointers NULL */
95492 +
95493 +#define TS_LINK_TO_ITEM(ITEM_TYPE,LINK_NAME,LINK) /* LINK pointer -> pointer to the ITEM_TYPE holding it as member LINK_NAME */ \
95494 +       ((ITEM_TYPE *)((char *)(LINK)-(unsigned long)(&((ITEM_TYPE *)0)->LINK_NAME))) /* subtract the member's byte offset */
95495 +
95496 +/* Step 1: Use the TYPE_SAFE_LIST_DECLARE() macro to define the "list head"
95497 +   and "list link" objects.  This macro takes one argument, the
95498 +   prefix-name, which is prepended to every structure and function
95499 +   name of the list class.  Following the example, this will create
95500 +   types named rx_event_list_head and rx_event_list_link.  In the
95501 +   example you would write:
95502 +
95503 +   TYPE_SAFE_LIST_DECLARE(rx_event);
95504 +
95505 +*/
95506 +#define TYPE_SAFE_LIST_DECLARE(PREFIX)                                                               \
95507 +/* typedef names come first so both structs can use the link pointer type */                  \
95508 +typedef struct _##PREFIX##_list_head        PREFIX##_list_head;                               \
95509 +typedef struct _##PREFIX##_list_link        PREFIX##_list_link;                               \
95510 +/* link: the pointer pair embedded in each item */                                            \
95511 +struct _##PREFIX##_list_link                                                                  \
95512 +{                                                                                             \
95513 +  PREFIX##_list_link *_next;                                                                  \
95514 +  PREFIX##_list_link *_prev;                                                                  \
95515 +};                                                                                            \
95516 +/* head: same two members as the link; no trailing ';' here, the user of the macro adds it */ \
95517 +struct _##PREFIX##_list_head                                                                  \
95518 +{                                                                                             \
95519 +  PREFIX##_list_link *_next;                                                                  \
95520 +  PREFIX##_list_link *_prev;                                                                  \
95521 +}
95522 +
95523 +/* Step 2: Once you have defined the two list classes, you should
95524 +   define the item type you intend to use.  The list classes must be
95525 +   declared before the item type because the item type must contain an
95526 +   embedded "list link" object.  Following the example, you might define
95527 +   rx_event as follows:
95528 +
95529 +   typedef struct _rx_event  rx_event;
95530 +
95531 +   struct _rx_event
95532 +   {
95533 +     ... other members ...
95534 +
95535 +     rx_event_list_link  _link;
95536 +   };
95537 +
95538 +   In this case we have given the rx_event a field named "_link" of
95539 +   the appropriate type.
95540 +*/
95541 +
95542 +/* Step 3: The final step will define the list-functions for a
95543 +   specific list class using the macro TYPE_SAFE_LIST_DEFINE.  There are
95544 +   three arguments to the TYPE_SAFE_LIST_DEFINE macro: the prefix-name, the
95545 +   item type name, and field name of the "list link" element within
95546 +   the item type.  In the above example you would supply "rx_event" as
95547 +   the type name and "_link" as the field name (without quotes).
95548 +   E.g.,
95549 +
95550 +   TYPE_SAFE_LIST_DEFINE(rx_event,rx_event,_link)
95551 +
95552 +   The list class you define is now complete with the functions:
95553 +
95554 +   rx_event_list_init             Initialize a list_head
95555 +   rx_event_list_clean            Initialize a list_link
95556 +   rx_event_list_is_clean         True if list_link is not in a list
95557 +   rx_event_list_push_front       Insert to the front of the list
95558 +   rx_event_list_push_back        Insert to the back of the list
95559 +   rx_event_list_insert_before    Insert just before given item in the list
95560 +   rx_event_list_insert_after     Insert just after given item in the list
95561 +   rx_event_list_remove           Remove an item from anywhere in the list
95562 +   rx_event_list_remove_clean     Remove an item from anywhere in the list and clean link_item
95563 +   rx_event_list_remove_get_next  Remove an item from anywhere in the list and return the next element
95564 +   rx_event_list_remove_get_prev  Remove an item from anywhere in the list and return the prev element
95565 +   rx_event_list_pop_front        Remove and return the front of the list, cannot be empty
95566 +   rx_event_list_pop_back         Remove and return the back of the list, cannot be empty
95567 +   rx_event_list_front            Get the front of the list
95568 +   rx_event_list_back             Get the back of the list
95569 +   rx_event_list_next             Iterate front-to-back through the list
95570 +   rx_event_list_prev             Iterate back-to-front through the list
95571 +   rx_event_list_end              Test to end an iteration, either direction
95572 +   rx_event_list_splice           Join two lists at the head
95573 +   rx_event_list_empty            True if the list is empty
95574 +   rx_event_list_object_ok        Check that list element satisfies double
95575 +                                  list invariants. For debugging.
95576 +
95577 +   To iterate over such a list use a for-loop such as:
95578 +
95579 +     rx_event_list_head *head = ...;
95580 +     rx_event *item;
95581 +
95582 +     for (item = rx_event_list_front (head);
95583 +               ! rx_event_list_end   (head, item);
95584 +          item = rx_event_list_next  (item))
95585 +       {...}
95586 +*/
95587 +#define TYPE_SAFE_LIST_DEFINE(PREFIX,ITEM_TYPE,LINK_NAME)                                            \
95588 +                                                                                              \
95589 +static __inline__ int                                                                         \
95590 +PREFIX##_list_link_invariant (const PREFIX##_list_link  *_link)                               \
95591 +{                                                                                             \
95592 +  return (_link != NULL) &&                                                                   \
95593 +         (_link->_prev != NULL) && (_link->_next != NULL ) &&                                \
95594 +         (_link->_prev->_next == _link) &&                                                   \
95595 +         (_link->_next->_prev == _link);                                                     \
95596 +}                                                                                             \
95597 +                                                                                              \
95598 +static __inline__ void                                                                        \
95599 +PREFIX##_list_link_ok (const PREFIX##_list_link  *_link UNUSED_ARG)                           \
95600 +{                                                                                             \
95601 +  assert ("nikita-1054", PREFIX##_list_link_invariant (_link));                               \
95602 +}                                                                                             \
95603 +                                                                                              \
95604 +static __inline__ void                                                                        \
95605 +PREFIX##_list_object_ok (const ITEM_TYPE           *item)                                     \
95606 +{                                                                                             \
95607 +  PREFIX##_list_link_ok (&item->LINK_NAME);                                                   \
95608 +}                                                                                             \
95609 +                                                                                              \
95610 +static __inline__ void                                                                        \
95611 +PREFIX##_list_init (PREFIX##_list_head  *head)                                                \
95612 +{                                                                                             \
95613 +  head->_next = (PREFIX##_list_link*) head;                                                   \
95614 +  head->_prev = (PREFIX##_list_link*) head;                                                   \
95615 +}                                                                                             \
95616 +                                                                                              \
95617 +static __inline__ void                                                                        \
95618 +PREFIX##_list_clean (ITEM_TYPE           *item)                                               \
95619 +{                                                                                             \
95620 +  PREFIX##_list_link *_link = &item->LINK_NAME;                                               \
95621 +                                                                                              \
95622 +  _link->_next = _link;                                                                       \
95623 +  _link->_prev = _link;                                                                       \
95624 +}                                                                                             \
95625 +                                                                                              \
95626 +static __inline__ int                                                                         \
95627 +PREFIX##_list_is_clean (const ITEM_TYPE           *item)                                      \
95628 +{                                                                                             \
95629 +  const PREFIX##_list_link *_link = &item->LINK_NAME;                                         \
95630 +                                                                                              \
95631 +  PREFIX##_list_link_ok (_link);                                                              \
95632 +  return (_link == _link->_next) && (_link == _link->_prev);                                  \
95633 +}                                                                                             \
95634 +                                                                                              \
95635 +static __inline__ void                                                                        \
95636 +PREFIX##_list_insert_int (PREFIX##_list_link  *next,                                          \
95637 +                                PREFIX##_list_link  *item)                                   \
95638 +{                                                                                             \
95639 +  PREFIX##_list_link *prev = next->_prev;                                                     \
95640 +  PREFIX##_list_link_ok (next);                                                               \
95641 +  PREFIX##_list_link_ok (prev);                                                               \
95642 +  next->_prev = item;                                                                         \
95643 +  item->_next = next;                                                                         \
95644 +  item->_prev = prev;                                                                         \
95645 +  prev->_next = item;                                                                         \
95646 +  PREFIX##_list_link_ok (next);                                                               \
95647 +  PREFIX##_list_link_ok (prev);                                                               \
95648 +  PREFIX##_list_link_ok (item);                                                               \
95649 +}                                                                                             \
95650 +                                                                                              \
95651 +static __inline__ void                                                                        \
95652 +PREFIX##_list_push_front (PREFIX##_list_head  *head,                                          \
95653 +                         ITEM_TYPE           *item)                                          \
95654 +{                                                                                             \
95655 +  PREFIX##_list_insert_int (head->_next, & item->LINK_NAME);                                  \
95656 +}                                                                                             \
95657 +                                                                                              \
95658 +static __inline__ void                                                                        \
95659 +PREFIX##_list_push_back (PREFIX##_list_head  *head,                                           \
95660 +                        ITEM_TYPE           *item)                                           \
95661 +{                                                                                             \
95662 +  PREFIX##_list_insert_int ((PREFIX##_list_link *) head, & item->LINK_NAME);                  \
95663 +}                                                                                             \
95664 +                                                                                              \
95665 +static __inline__ void                                                                        \
95666 +PREFIX##_list_insert_before (ITEM_TYPE         *reference,                                    \
95667 +                          ITEM_TYPE           *item)                                         \
95668 +{                                                                                             \
95669 +  PREFIX##_list_insert_int (& reference->LINK_NAME, & item->LINK_NAME);                       \
95670 +}                                                                                             \
95671 +                                                                                              \
95672 +static __inline__ void                                                                        \
95673 +PREFIX##_list_insert_after (ITEM_TYPE         *reference,                                     \
95674 +                         ITEM_TYPE           *item)                                          \
95675 +{                                                                                             \
95676 +  PREFIX##_list_insert_int (reference->LINK_NAME._next, & item->LINK_NAME);                   \
95677 +}                                                                                             \
95678 +                                                                                              \
95679 +static __inline__ PREFIX##_list_link*                                                         \
95680 +PREFIX##_list_remove_int (PREFIX##_list_link *list_link)                                      \
95681 +{                                                                                             \
95682 +  PREFIX##_list_link *next = list_link->_next;                                                \
95683 +  PREFIX##_list_link *prev = list_link->_prev;                                                \
95684 +  PREFIX##_list_link_ok (list_link);                                                          \
95685 +  PREFIX##_list_link_ok (next);                                                               \
95686 +  PREFIX##_list_link_ok (prev);                                                               \
95687 +  next->_prev = prev;                                                                         \
95688 +  prev->_next = next;                                                                         \
95689 +  PREFIX##_list_link_ok (next);                                                               \
95690 +  PREFIX##_list_link_ok (prev);                                                               \
95691 +  return list_link;                                                                           \
95692 +}                                                                                             \
95693 +                                                                                              \
95694 +static __inline__ void                                                                        \
95695 +PREFIX##_list_remove (ITEM_TYPE  *item)                                                       \
95696 +{                                                                                             \
95697 +  PREFIX##_list_remove_int (& item->LINK_NAME);                                               \
95698 +}                                                                                             \
95699 +                                                                                              \
95700 +static __inline__ void                                                                        \
95701 +PREFIX##_list_remove_clean (ITEM_TYPE  *item)                                                 \
95702 +{                                                                                             \
95703 +  PREFIX##_list_remove_int (& item->LINK_NAME);                                               \
95704 +  PREFIX##_list_clean (item);                                                                 \
95705 +}                                                                                             \
95706 +                                                                                              \
95707 +static __inline__ ITEM_TYPE*                                                                  \
95708 +PREFIX##_list_remove_get_next (ITEM_TYPE  *item)                                              \
95709 +{                                                                                             \
95710 +  PREFIX##_list_link *next = item->LINK_NAME._next;                                           \
95711 +  PREFIX##_list_remove_int (& item->LINK_NAME);                                               \
95712 +  return TS_LINK_TO_ITEM(ITEM_TYPE,LINK_NAME,next);                                           \
95713 +}                                                                                             \
95714 +                                                                                              \
95715 +static __inline__ ITEM_TYPE*                                                                  \
95716 +PREFIX##_list_remove_get_prev (ITEM_TYPE  *item)                                              \
95717 +{                                                                                             \
95718 +  PREFIX##_list_link *prev = item->LINK_NAME._prev;                                           \
95719 +  PREFIX##_list_remove_int (& item->LINK_NAME);                                               \
95720 +  return TS_LINK_TO_ITEM(ITEM_TYPE,LINK_NAME,prev);                                           \
95721 +}                                                                                             \
95722 +                                                                                              \
95723 +static __inline__ int                                                                         \
95724 +PREFIX##_list_empty (const PREFIX##_list_head  *head)                                         \
95725 +{                                                                                             \
95726 +  return head == (PREFIX##_list_head*) head->_next;                                           \
95727 +}                                                                                             \
95728 +                                                                                              \
95729 +static __inline__ ITEM_TYPE*                                                                  \
95730 +PREFIX##_list_pop_front (PREFIX##_list_head  *head)                                           \
95731 +{                                                                                             \
95732 +  assert ("nikita-1913", ! PREFIX##_list_empty (head));                                       \
95733 +  return TS_LINK_TO_ITEM(ITEM_TYPE,LINK_NAME,PREFIX##_list_remove_int (head->_next));         \
95734 +}                                                                                             \
95735 +                                                                                              \
95736 +static __inline__ ITEM_TYPE*                                                                  \
95737 +PREFIX##_list_pop_back (PREFIX##_list_head  *head)                                            \
95738 +{                                                                                             \
95739 +  assert ("nikita-1914", ! PREFIX##_list_empty (head)); /* WWI started */                     \
95740 +  return TS_LINK_TO_ITEM(ITEM_TYPE,LINK_NAME,PREFIX##_list_remove_int (head->_prev));         \
95741 +}                                                                                             \
95742 +                                                                                              \
95743 +static __inline__ ITEM_TYPE*                                                                  \
95744 +PREFIX##_list_front (const PREFIX##_list_head  *head)                                         \
95745 +{                                                                                             \
95746 +  return TS_LINK_TO_ITEM(ITEM_TYPE,LINK_NAME,head->_next);                                    \
95747 +}                                                                                             \
95748 +                                                                                              \
95749 +static __inline__ ITEM_TYPE*                                                                  \
95750 +PREFIX##_list_back (const PREFIX##_list_head  *head)                                          \
95751 +{                                                                                             \
95752 +  return TS_LINK_TO_ITEM(ITEM_TYPE,LINK_NAME,head->_prev);                                    \
95753 +}                                                                                             \
95754 +                                                                                              \
95755 +static __inline__ ITEM_TYPE*                                                                  \
95756 +PREFIX##_list_next (const ITEM_TYPE *item)                                                    \
95757 +{                                                                                             \
95758 +  return TS_LINK_TO_ITEM(ITEM_TYPE,LINK_NAME,item->LINK_NAME._next);                          \
95759 +}                                                                                             \
95760 +                                                                                              \
95761 +static __inline__ ITEM_TYPE*                                                                  \
95762 +PREFIX##_list_prev (const ITEM_TYPE *item)                                                    \
95763 +{                                                                                             \
95764 +  return TS_LINK_TO_ITEM(ITEM_TYPE,LINK_NAME,item->LINK_NAME._prev);                          \
95765 +}                                                                                             \
95766 +                                                                                              \
95767 +static __inline__ int                                                                         \
95768 +PREFIX##_list_end (const PREFIX##_list_head  *head,                                           \
95769 +                  const ITEM_TYPE           *item)                                           \
95770 +{                                                                                             \
95771 +  return ((PREFIX##_list_link *) head) == (& item->LINK_NAME);                                \
95772 +}                                                                                             \
95773 +                                                                                              \
95774 +static __inline__ void                                                                        \
95775 +PREFIX##_list_splice (PREFIX##_list_head  *head_join,                                         \
95776 +                     PREFIX##_list_head  *head_empty)                                        \
95777 +{                                                                                             \
95778 +  if (PREFIX##_list_empty (head_empty)) {                                                     \
95779 +    return;                                                                                   \
95780 +  }                                                                                           \
95781 +                                                                                              \
95782 +  head_empty->_prev->_next = (PREFIX##_list_link*) head_join;                                 \
95783 +  head_empty->_next->_prev = head_join->_prev;                                                \
95784 +                                                                                              \
95785 +  head_join->_prev->_next  = head_empty->_next;                                               \
95786 +  head_join->_prev         = head_empty->_prev;                                               \
95787 +                                                                                              \
95788 +  PREFIX##_list_link_ok ((PREFIX##_list_link*) head_join);                                    \
95789 +  PREFIX##_list_link_ok (head_join->_prev);                                                   \
95790 +  PREFIX##_list_link_ok (head_join->_next);                                                   \
95791 +                                                                                              \
95792 +  PREFIX##_list_init (head_empty);                                                            \
95793 +}                                                                                             \
95794 +                                                                                              \
95795 +static __inline__ void                                                                        \
95796 +PREFIX##_list_split(PREFIX##_list_head  *head_split,                                          \
95797 +                   PREFIX##_list_head  *head_new,                                            \
95798 +                   ITEM_TYPE  *item)                                                         \
95799 +{                                                                                             \
95800 +  assert("vs-1471", PREFIX##_list_empty(head_new));                                           \
95801 +                                                                                              \
95802 +  /* attach to new list */                                                                    \
95803 +  head_new->_next = (& item->LINK_NAME);                                                      \
95804 +  head_new->_prev = head_split->_prev;                                                        \
95805 +                                                                                              \
95806 +  /* cut from old list */                                                                     \
95807 +  item->LINK_NAME._prev->_next = (PREFIX##_list_link*)head_split;                             \
95808 +  head_split->_prev = item->LINK_NAME._prev;                                                  \
95809 +                                                                                              \
95810 +  /* link new list */                                                                         \
95811 +  head_new->_next->_prev = (PREFIX##_list_link*)head_new;                                     \
95812 +  head_new->_prev->_next = (PREFIX##_list_link*)head_new;                                     \
95813 +}                                                                                             \
95814 +                                                                                              \
95815 +static __inline__ void                                                                        \
95816 +PREFIX##_list_check (const PREFIX##_list_head  *head)                                         \
95817 +{                                                                                             \
95818 +       const PREFIX##_list_link *link;                                                       \
95819 +                                                                                              \
95820 +       for (link = head->_next ; link != ((PREFIX##_list_link *) head) ; link = link->_next) \
95821 +               PREFIX##_list_link_ok (link);                                                 \
95822 +}                                                                                             \
95823 +                                                                                              \
95824 +typedef struct { int foo; } PREFIX##_list_dummy_decl
95825 +
95826 +/* The final typedef is to allow a semicolon at the end of
95827 + * TYPE_SAFE_LIST_DEFINE(); */
95828 +/* Visit every item on @head front to back, prefetching the next one; the step reads @item's link after the body. */
95829 +#define for_all_type_safe_list(prefix, head, item)             \
95830 +       for(item = prefix ## _list_front(head),                 \
95831 +                   prefetch(prefix ## _list_next(item));       \
95832 +           !prefix ## _list_end(head, item) ;                  \
95833 +           item = prefix ## _list_next(item),                  \
95834 +                   prefetch(prefix ## _list_next(item)))
95835 +/* As for_all_type_safe_list(), but the successor is saved in @next before the body runs; no prefetch is issued. */
95836 +#define for_all_type_safe_list_safe(prefix, head, item, next)  \
95837 +       for(item = prefix ## _list_front(head),                 \
95838 +            next = prefix ## _list_next(item);                 \
95839 +           !prefix ## _list_end(head, item) ;                  \
95840 +           item = next,                                        \
95841 +           next = prefix ## _list_next(item))
95842 +
95843 +/* __REISER4_TYPE_SAFE_LIST_H__ */
95844 +#endif
95845 +
95846 +/*
95847 +   Local variables:
95848 +   c-indentation-style: "K&R"
95849 +   mode-name: "LC"
95850 +   c-basic-offset: 8
95851 +   tab-width: 8
95852 +   fill-column: 120
95853 +   End:
95854 +*/
95855 diff -rupN linux-2.6.8-rc3/fs/reiser4/vfs_ops.c linux-2.6.8-rc3-a/fs/reiser4/vfs_ops.c
95856 --- linux-2.6.8-rc3/fs/reiser4/vfs_ops.c        1970-01-01 03:00:00.000000000 +0300
95857 +++ linux-2.6.8-rc3-a/fs/reiser4/vfs_ops.c      2004-08-05 21:20:53.459581138 +0400
95858 @@ -0,0 +1,1709 @@
95859 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
95860 + * reiser4/README */
95861 +
95862 +/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined
95863 +   here. */
95864 +
95865 +#include "forward.h"
95866 +#include "debug.h"
95867 +#include "dformat.h"
95868 +#include "coord.h"
95869 +#include "plugin/item/item.h"
95870 +#include "plugin/file/file.h"
95871 +#include "plugin/security/perm.h"
95872 +#include "plugin/disk_format/disk_format.h"
95873 +#include "plugin/dir/dir.h"
95874 +#include "plugin/plugin.h"
95875 +#include "plugin/plugin_set.h"
95876 +#include "plugin/object.h"
95877 +#include "txnmgr.h"
95878 +#include "jnode.h"
95879 +#include "znode.h"
95880 +#include "block_alloc.h"
95881 +#include "tree.h"
95882 +#include "log.h"
95883 +#include "vfs_ops.h"
95884 +#include "inode.h"
95885 +#include "page_cache.h"
95886 +#include "ktxnmgrd.h"
95887 +#include "super.h"
95888 +#include "reiser4.h"
95889 +#include "kattr.h"
95890 +#include "entd.h"
95891 +#include "emergency_flush.h"
95892 +#include "prof.h"
95893 +#include "repacker.h"
95894 +#include "init_super.h"
95895 +#include "status_flags.h"
95896 +#include "flush.h"
95897 +#include "dscale.h"
95898 +
95899 +#include <linux/profile.h>
95900 +#include <linux/types.h>
95901 +#include <linux/mount.h>
95902 +#include <linux/vfs.h>
95903 +#include <linux/mm.h>
95904 +#include <linux/buffer_head.h>
95905 +#include <linux/dcache.h>
95906 +#include <linux/list.h>
95907 +#include <linux/pagemap.h>
95908 +#include <linux/slab.h>
95909 +#include <linux/seq_file.h>
95910 +#include <linux/init.h>
95911 +#include <linux/module.h>
95912 +#include <linux/writeback.h>
95913 +#include <linux/blkdev.h>
95914 +#include <linux/quotaops.h>
95915 +#include <linux/security.h>
95916 +#include <linux/reboot.h>
95917 +#include <linux/rcupdate.h>
95918 +
95919 +/* super operations */
95920 +
95921 +static struct inode *reiser4_alloc_inode(struct super_block *super);
95922 +static void reiser4_destroy_inode(struct inode *inode);
95923 +static void reiser4_drop_inode(struct inode *);
95924 +static void reiser4_delete_inode(struct inode *);
95925 +static void reiser4_write_super(struct super_block *);
95926 +static int reiser4_statfs(struct super_block *, struct kstatfs *);
95927 +static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt);
95928 +static void reiser4_sync_inodes(struct super_block *s, struct writeback_control * wbc);
95929 +
95930 +extern struct dentry_operations reiser4_dentry_operation;
95931 +
95932 +struct file_system_type reiser4_fs_type;
95933 +
95934 +/* ->statfs() VFS method in reiser4 super_operations: fills @statfs, always returns 0 */
95935 +static int
95936 +reiser4_statfs(struct super_block *super       /* super block of the file
95937 +                                                * system being queried */ ,
95938 +              struct kstatfs *statfs   /* buffer to fill with
95939 +                                        * statistics */ )
95940 +{
95941 +       sector_t total;
95942 +       sector_t reserved;
95943 +       sector_t free;
95944 +       sector_t forroot;
95945 +       sector_t deleted;
95946 +       reiser4_context ctx;
95947 +
95948 +       assert("nikita-408", super != NULL);
95949 +       assert("nikita-409", statfs != NULL);
95950 +
95951 +       init_context(&ctx, super);
95952 +       reiser4_stat_inc(vfs_calls.statfs);
95953 +
95954 +       statfs->f_type = statfs_type(super);
95955 +       statfs->f_bsize = super->s_blocksize;
95956 +
95957 +       /*
95958 +        * 5% of total block space is reserved. This is needed for flush and
95959 +        * for truncates (so that we are able to perform truncate/unlink even
95960 +        * on the otherwise completely full file system). If this reservation
95961 +        * is hidden from statfs(2), users will mistakenly guess that they
95962 +        * have enough free space to complete some operation, which is
95963 +        * frustrating.
95964 +        *
95965 +        * Another possible solution is to subtract ->blocks_reserved from
95966 +        * ->f_bfree, but changing available space seems less intrusive than
95967 +        * letting the user see 5% of disk space as used directly after mkfs.
95968 +        * NOTE(review): the code below subtracts `reserved` from ->f_bfree too.
95969 +        */
95970 +       total    = reiser4_block_count(super);
95971 +       reserved = get_super_private(super)->blocks_reserved;
95972 +       deleted  = txnmgr_count_deleted_blocks();
95973 +       free     = reiser4_free_blocks(super) + deleted; /* deleted blocks count as free */
95974 +       forroot  = reiser4_reserved_blocks(super, 0, 0);
95975 +
95976 +       /* These counters may be in an inconsistent state because we take the
95977 +        * values without holding any global spinlock.  Here we do a sanity
95978 +        * check that the free block counter does not exceed the number of all
95979 +        * blocks.  */
95980 +       if (free > total)
95981 +               free = total;
95982 +       statfs->f_blocks = total - reserved;
95983 +       /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
95984 +       if (free > reserved)
95985 +               free -= reserved;
95986 +       else
95987 +               free = 0;
95988 +       statfs->f_bfree = free;
95989 +
95990 +       if (free > forroot)
95991 +               free -= forroot;
95992 +       else
95993 +               free = 0;
95994 +       statfs->f_bavail = free; /* f_bfree less forroot, floored at 0 */
95995 +
95996 +/* FIXME: It seems that various df implementations are unhappy with such big numbers.
95997 +   So we will leave those as zeroes.
95998 +       statfs->f_files = oids_used(super) + oids_free(super);
95999 +       statfs->f_ffree = oids_free(super);
96000 +*/
96001 +
96002 +       /* maximal acceptable name length depends on directory plugin. */
96003 +       assert("nikita-3351", super->s_root->d_inode != NULL);
96004 +       statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
96005 +       reiser4_exit_context(&ctx);
96006 +       return 0;
96007 +}
96008 +
96009 +/* This is called whenever mark_inode_dirty is to be called. Stat-data are
96010 + * updated in the tree via reiser4_update_sd(), whose result is returned. */
96011 +reiser4_internal int
96012 +reiser4_mark_inode_dirty(struct inode *inode)
96013 +{
96014 +       assert("vs-1207", is_in_reiser4_context());
96015 +       return reiser4_update_sd(inode);
96016 +}
96017 +
96018 +/* update inode stat-data by calling the file plugin's ->write_sd_by_inode() */
96019 +reiser4_internal int
96020 +reiser4_update_sd(struct inode *object)
96021 +{
96022 +        file_plugin *fplug;
96023 +
96024 +       assert("nikita-2338", object != NULL);
96025 +       /* check for read-only file system: nothing is written, 0 is returned. */
96026 +       if (IS_RDONLY(object))
96027 +               return 0;
96028 +
96029 +       fplug = inode_file_plugin(object);
96030 +       assert("nikita-2339", fplug != NULL);
96031 +       return fplug->write_sd_by_inode(object);
96032 +}
96033 +
96034 +/* helper function: increase inode nlink count and call plugin method to save
96035 +   updated stat-data. Returns RETERR(-EMLINK) when ->can_add_link() refuses,
96036 +   otherwise the result of ->add_link() or of the optional stat-data write.
96037 +   Used by link/create and during creation of dot and dotdot in mkdir
96038 +*/
96039 +reiser4_internal int
96040 +reiser4_add_nlink(struct inode *object /* object to which link is added */ ,
96041 +                 struct inode *parent /* parent where new entry will be */ ,
96042 +                 int write_sd_p        /* true if stat-data has to be
96043 +                                        * updated */ )
96044 +{
96045 +       file_plugin *fplug;
96046 +       int result;
96047 +
96048 +       assert("nikita-1351", object != NULL);
96049 +
96050 +       fplug = inode_file_plugin(object);
96051 +       assert("nikita-1445", fplug != NULL);
96052 +
96053 +       /* ask plugin whether it can add yet another link to this
96054 +          object */
96055 +       if (!fplug->can_add_link(object))
96056 +               return RETERR(-EMLINK);
96057 +
96058 +       assert("nikita-2211", fplug->add_link != NULL);
96059 +       /* call plugin to do actual addition of link */
96060 +       result = fplug->add_link(object, parent);
96061 +
96062 +       mark_inode_update(object, write_sd_p); /* runs even if ->add_link() failed */
96063 +
96064 +       /* optionally update stat data */
96065 +       if (result == 0 && write_sd_p)
96066 +               result = fplug->write_sd_by_inode(object);
96067 +       return result;
96068 +}
96069 +
96070 +/* helper function: decrease inode nlink count and call plugin method to save
96071 +   updated stat-data. Returns the result of the plugin's ->rem_link(), or of
96072 +   the stat-data write when ->rem_link() succeeded and @write_sd_p is set.
96073 +   Used by unlink/create
96074 +*/
96075 +reiser4_internal int
96076 +reiser4_del_nlink(struct inode *object /* object from which link is
96077 +                                        * removed */ ,
96078 +                 struct inode *parent /* parent where entry was */ ,
96079 +                 int write_sd_p        /* true if stat-data has to be
96080 +                                        * updated */ )
96081 +{
96082 +       file_plugin *fplug;
96083 +       int result;
96084 +
96085 +       assert("nikita-1349", object != NULL);
96086 +
96087 +       fplug = inode_file_plugin(object);
96088 +       assert("nikita-1350", fplug != NULL);
96089 +       assert("nikita-1446", object->i_nlink > 0);
96090 +       assert("nikita-2210", fplug->rem_link != NULL);
96091 +
96092 +       /* call plugin to do actual deletion of link */
96093 +       result = fplug->rem_link(object, parent);
96094 +       mark_inode_update(object, write_sd_p); /* runs even if ->rem_link() failed */
96095 +       /* optionally update stat data */
96096 +       if (result == 0 && write_sd_p)
96097 +               result = fplug->write_sd_by_inode(object);
96098 +       return result;
96099 +}
96100 +
96101 +/* slab for reiser4_dentry_fsdata */
96102 +static kmem_cache_t *dentry_fsdata_slab;
96103 +
96104 +/*
96105 + * Create dentry_fsdata_slab during boot or module load; 0 or RETERR(-ENOMEM).
96106 + */
96107 +reiser4_internal int init_dentry_fsdata(void)
96108 +{
96109 +       dentry_fsdata_slab = kmem_cache_create("dentry_fsdata",
96110 +                                              sizeof (reiser4_dentry_fsdata),
96111 +                                              0,
96112 +                                              SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
96113 +                                              NULL,
96114 +                                              NULL);
96115 +       return (dentry_fsdata_slab == NULL) ? RETERR(-ENOMEM) : 0;
96116 +}
96117 +
96118 +/*
96119 + * dual to init_dentry_fsdata(): destroys dentry_fsdata_slab. Called on module unload.
96120 + */
96121 +reiser4_internal void done_dentry_fsdata(void)
96122 +{
96123 +       kmem_cache_destroy(dentry_fsdata_slab);
96124 +}
96125 +
96126 +
96127 +/* Return per-dentry data attached to @dentry, allocating and zeroing it on
96128 +   first use. Returns ERR_PTR(RETERR(-ENOMEM)) if that allocation fails. */
96129 +reiser4_internal reiser4_dentry_fsdata *
96130 +reiser4_get_dentry_fsdata(struct dentry *dentry        /* dentry
96131 +                                                * queried */ )
96132 +{
96133 +       assert("nikita-1365", dentry != NULL);
96134 +
96135 +       if (dentry->d_fsdata == NULL) {
96136 +               reiser4_stat_inc(vfs_calls.private_data_alloc);
96137 +               dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_slab,
96138 +                                                   GFP_KERNEL);
96139 +               if (dentry->d_fsdata == NULL)
96140 +                       return ERR_PTR(RETERR(-ENOMEM));
96141 +               xmemset(dentry->d_fsdata, 0, sizeof (reiser4_dentry_fsdata));
96142 +       }
96143 +       return dentry->d_fsdata;
96144 +}
96145 +
96146 +/* opposite to reiser4_get_dentry_fsdata(): returns per-dentry data to the slab
96147 + * allocator and resets ->d_fsdata to NULL; does nothing if none is attached */
96148 +reiser4_internal void
96149 +reiser4_free_dentry_fsdata(struct dentry *dentry /* dentry released */ )
96150 +{
96151 +       if (dentry->d_fsdata != NULL) {
96152 +               kmem_cache_free(dentry_fsdata_slab, dentry->d_fsdata);
96153 +               dentry->d_fsdata = NULL;
96154 +       }
96155 +}
96156 +
96157 +/* Release reiser4 dentry by freeing its per-dentry data. This is the d_op->d_release() method. */
96158 +static void
96159 +reiser4_d_release(struct dentry *dentry /* dentry released */ )
96160 +{
96161 +       reiser4_free_dentry_fsdata(dentry);
96162 +}
96163 +
96164 +/* slab for reiser4_file_fsdata */
96165 +static kmem_cache_t *file_fsdata_slab;
96166 +
96167 +/*
96168 + * Create file_fsdata_slab during boot or module load; 0 or RETERR(-ENOMEM).
96169 + */
96170 +reiser4_internal int init_file_fsdata(void)
96171 +{
96172 +       file_fsdata_slab = kmem_cache_create("file_fsdata",
96173 +                                            sizeof (reiser4_file_fsdata),
96174 +                                            0,
96175 +                                            SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
96176 +                                            NULL,
96177 +                                            NULL);
96178 +       return (file_fsdata_slab == NULL) ? RETERR(-ENOMEM) : 0;
96179 +}
96180 +
96181 +/*
96182 + * dual to init_file_fsdata(): destroys file_fsdata_slab. Called during module unload.
96183 + */
96184 +reiser4_internal void done_file_fsdata(void)
96185 +{
96186 +       kmem_cache_destroy(file_fsdata_slab);
96187 +}
96188 +
96189 +/* Create reiser4 specific per-file data (reiser4_file_fsdata) for @file,
96190 + * allocated from file_fsdata_slab with @gfp. Returns NULL if allocation fails.
96191 + */
96192 +reiser4_internal reiser4_file_fsdata *
96193 +create_fsdata(struct file *file, int gfp)
96194 +{
96195 +       reiser4_file_fsdata *fsdata;
96196 +
96197 +       fsdata = kmem_cache_alloc(file_fsdata_slab, gfp);
96198 +       if (fsdata != NULL) {
96199 +               xmemset(fsdata, 0, sizeof *fsdata);
96200 +               fsdata->ra.max_window_size = VM_MAX_READAHEAD * 1024;
96201 +               fsdata->back = file; /* back pointer to the struct file */
96202 +               readdir_list_clean(fsdata);
96203 +       }
96204 +       return fsdata;
96205 +}
96206 +
96207 +/* Return per-file data attached to @f, allocating it via create_fsdata() on
96208 +   first use. Returns ERR_PTR(RETERR(-ENOMEM)) if that allocation fails. */
96209 +reiser4_internal reiser4_file_fsdata *
96210 +reiser4_get_file_fsdata(struct file *f /* file
96211 +                                        * queried */ )
96212 +{
96213 +       assert("nikita-1603", f != NULL);
96214 +
96215 +       if (f->private_data == NULL) {
96216 +               reiser4_file_fsdata *fsdata;
96217 +               struct inode *inode;
96218 +
96219 +               reiser4_stat_inc(vfs_calls.private_data_alloc);
96220 +               fsdata = create_fsdata(f, GFP_KERNEL);
96221 +               if (fsdata == NULL)
96222 +                       return ERR_PTR(RETERR(-ENOMEM));
96223 +
96224 +               inode = f->f_dentry->d_inode;
96225 +               spin_lock_inode(inode);
96226 +               if (f->private_data == NULL) { /* recheck after spin_lock_inode() */
96227 +                       f->private_data = fsdata;
96228 +                       fsdata = NULL; /* ownership passed to @f */
96229 +               }
96230 +               spin_unlock_inode(inode);
96231 +               if (fsdata != NULL)
96232 +                       /* other thread initialized ->private_data first; drop our copy */
96233 +                       kmem_cache_free(file_fsdata_slab, fsdata);
96234 +       }
96235 +       assert("nikita-2665", f->private_data != NULL);
96236 +       return f->private_data;
96237 +}
96238 +
96239 +/*
96240 + * Dual to create_fsdata(). Free reiser4_file_fsdata; a NULL @fsdata is ignored.
96241 + */
96242 +reiser4_internal void
96243 +reiser4_free_fsdata(reiser4_file_fsdata *fsdata)
96244 +{
96245 +       if (fsdata != NULL)
96246 +               kmem_cache_free(file_fsdata_slab, fsdata);
96247 +}
96248 +
96249 +/* Dual to reiser4_get_file_fsdata(): detaches per-file data from @f. The data
96250 + * is freed here only when its ->cursor is NULL; ->private_data is always reset.
96251 + */
96252 +reiser4_internal void
96253 +reiser4_free_file_fsdata(struct file *f)
96254 +{
96255 +       reiser4_file_fsdata *fsdata;
96256 +       fsdata = f->private_data;
96257 +       if (fsdata != NULL) {
96258 +               readdir_list_remove_clean(fsdata);
96259 +               if (fsdata->cursor == NULL) /* NOTE(review): else presumably released through the cursor - confirm */
96260 +                       reiser4_free_fsdata(fsdata);
96261 +       }
96262 +       f->private_data = NULL;
96263 +}
96264 +
96265 +/* Our ->read_inode() is a no-op: reiser4 inodes are expected to be loaded
96266 +   through fs/reiser4/inode.c:reiser4_iget() instead. */
96267 +static void
96268 +noop_read_inode(struct inode *inode UNUSED_ARG)
96269 +{
96270 +}
96271 +
96272 +/* initialization and shutdown */
96273 +
96274 +/* slab cache that reiser4 inodes (reiser4_inode_object) are allocated from; set by init_inodecache() */
96275 +static kmem_cache_t *inode_cache;
96276 +
96277 +/* Constructor passed to kmem_cache_create() for the inode cache. The one-time
96278 +   set-up below runs only for SLAB_CTOR_CONSTRUCTOR calls without SLAB_CTOR_VERIFY. */
96279 +static void
96280 +init_once(void *obj /* pointer to new inode */ ,
96281 +         kmem_cache_t * cache UNUSED_ARG /* slab cache */ ,
96282 +         unsigned long flags /* cache flags */ )
96283 +{
96284 +       reiser4_inode_object *info;
96285 +
96286 +       info = obj;
96287 +
96288 +       if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) {
96289 +               /* NOTE-NIKITA add here initializations for locks, list heads,
96290 +                  etc. that will be added to our private inode part. */
96291 +               inode_init_once(&info->vfs_inode);
96292 +               readdir_list_init(get_readdir_list(&info->vfs_inode));
96293 +               init_rwsem(&info->p.coc_sem);
96294 +               sema_init(&info->p.loading, 1);
96295 +               ON_DEBUG(info->p.nr_jnodes = 0);
96296 +               INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p), GFP_ATOMIC);
96297 +               ON_DEBUG(info->p.captured_eflushed = 0);
96298 +               ON_DEBUG(info->p.anonymous_eflushed = 0);
96299 +               ON_DEBUG(inode_jnodes_list_init(&info->p.jnodes_list));
96300 +       }
96301 +}
96302 +
96303 +/* Create the "reiser4_inode" slab cache; returns 0 or RETERR(-ENOMEM). */
96304 +reiser4_internal int
96305 +init_inodecache(void)
96306 +{
96307 +       inode_cache = kmem_cache_create("reiser4_inode", sizeof (reiser4_inode_object),
96308 +                                       0, SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT,
96309 +                                       init_once, NULL);
96310 +
96311 +       if (inode_cache == NULL)
96312 +               return RETERR(-ENOMEM);
96313 +       return 0;
96314 +}
96315 +
96316 +/* destroy slab cache where reiser4 inodes lived; warns if kmem_cache_destroy() returns non-zero */
96317 +static void
96318 +destroy_inodecache(void)
96319 +{
96320 +       if (kmem_cache_destroy(inode_cache) != 0)
96321 +               warning("nikita-1695", "not all inodes were freed");
96322 +}
96323 +
96324 +/* ->alloc_inode() super operation: allocate new inode */
96325 +static struct inode *
96326 +reiser4_alloc_inode(struct super_block *super UNUSED_ARG       /* super block new
96327 +                                                                * inode is
96328 +                                                                * allocated for */ )
96329 +{
96330 +       reiser4_inode_object *obj;
96331 +
96332 +       assert("nikita-1696", super != NULL);
96333 +       reiser4_stat_inc_at(super, vfs_calls.alloc_inode);
96334 +       obj = kmem_cache_alloc(inode_cache, SLAB_KERNEL);
96335 +       if (obj != NULL) {
96336 +               reiser4_inode *info;
96337 +
96338 +               info = &obj->p;
96339 +
96340 +               info->hset = info->pset = plugin_set_get_empty();
96341 +               info->extmask = 0;
96342 +               info->locality_id = 0ull;
96343 +               info->plugin_mask = 0;
96344 +#if !REISER4_INO_IS_OID
96345 +               info->oid_hi = 0;
96346 +#endif
96347 +               seal_init(&info->sd_seal, NULL, NULL);
96348 +               coord_init_invalid(&info->sd_coord, NULL);
96349 +               info->cluster_shift = 0;
96350 +               info->crypt = NULL;
96351 +               info->flags = 0;
96352 +               spin_inode_object_init(info);
96353 +
96354 +               /* initizalize inode's jnode */
96355 +               /*jnode_init(&info->inode_jnode, current_tree, JNODE_INODE);
96356 +                 atomic_set(&info->inode_jnode.x_count, 1);*/
96357 +               info->vroot = UBER_TREE_ADDR;
96358 +               return &obj->vfs_inode;
96359 +       } else
96360 +               return NULL;
96361 +}
96362 +
96363 +/* ->destroy_inode() super operation: recycle inode (plugin ->destroy_inode() if any, dispose cursors, put ->pset, free to inode_cache) */
96364 +static void
96365 +reiser4_destroy_inode(struct inode *inode /* inode being destroyed */)
96366 +{
96367 +       reiser4_inode *info;
96368 +
96369 +       reiser4_stat_inc_at(inode->i_sb, vfs_calls.destroy_inode);
96370 +
96371 +       info = reiser4_inode_data(inode);
96372 +
96373 +       assert("vs-1220", jnode_tree_by_reiser4_inode(info)->rnode == NULL);
96374 +       assert("vs-1222", info->captured_eflushed == 0);
96375 +       assert("vs-1428", info->anonymous_eflushed == 0);
96376 +       assert("zam-1050", info->nr_jnodes == 0);
96377 +
96378 +       if (!is_bad_inode(inode) && is_inode_loaded(inode)) {  /* plugin hook only for good, loaded inodes */
96379 +               file_plugin * fplug = inode_file_plugin(inode);
96380 +               if (fplug->destroy_inode != NULL)
96381 +                       fplug->destroy_inode(inode);
96382 +       }
96383 +       dispose_cursors(inode);
96384 +       if (info->pset)  /* NOTE(review): ->hset is set to the same set in reiser4_alloc_inode() but is not put here - confirm */
96385 +               plugin_set_put(info->pset);
96386 +
96387 +       /* FIXME: assert that info's page radix tree is empty */
96388 +       /*assert("nikita-2872", list_empty(&info->moved_pages));*/
96389 +
96390 +       /* cannot add similar assertion about ->i_list as prune_icache return
96391 +        * inode into slab with dangling ->list.{next,prev}. This is safe,
96392 +        * because they are re-initialized in the new_inode(). */
96393 +       assert("nikita-2895", list_empty(&inode->i_dentry));
96394 +       assert("nikita-2896", hlist_unhashed(&inode->i_hash));
96395 +       assert("nikita-2898", readdir_list_empty(get_readdir_list(inode)));
96396 +       kmem_cache_free(inode_cache, container_of(info, reiser4_inode_object, p));  /* free the enclosing object, not just @info */
96397 +}
96398 +
96399 +/* ->drop_inode() super operation; per the original author it is called by
96400 + * iput_final() when the last reference on the inode is released. */
96401 +static void
96402 +reiser4_drop_inode(struct inode *object)
96403 +{
96404 +       file_plugin *plug;
96405 +
96406 +       assert("nikita-2643", object != NULL);
96407 +
96408 +       /* Deliberately no reiser4 context here: the method is called often
96409 +          (original note: the plugin methods involved are one liners). */
96410 +       plug = inode_file_plugin(object);
96411 +       if (plug == NULL) {
96412 +               /* fake inode has no file plugin */
96413 +               generic_forget_inode(object);
96414 +               return;
96415 +       }
96416 +       assert("nikita-3251", plug->drop != NULL);
96417 +       plug->drop(object);
96418 +}
96419 +
96420 +/*
96421 + * Called by reiser4_sync_inodes(), during speculative write-back (through
96422 + * pdflush, or balance_dirty_pages()).
96423 + *
96424 + * Performs early flushing, trying to free some memory. If there is nothing
96425 + * to flush, commits some atoms.
96426 + *
96427 + * @wbc is updated on the way: ->nr_to_write is decreased by the amount that
96428 + * flush_some_atom() reports as submitted, and ->encountered_congestion is
96429 + * set when the loop stops because the backing device is write-congested.
96430 + */
96431 +static void
96432 +writeout(struct super_block *sb, struct writeback_control *wbc)
96433 +{
96434 +       /* reiser4 has its own means of periodical write-out */
96435 +       if (wbc->for_kupdate)
96436 +               return;
96437 +
96438 +       /* Commit all atoms if reiser4_writepages() is called from sys_sync() or
96439 +          sys_fsync(). */
96440 +       if (wbc->sync_mode != WB_SYNC_NONE) {
96441 +               txnmgr_force_commit_all(sb, 1);
96442 +               return;
96443 +       }
96444 +
96445 +       do {
96446 +               long nr_submitted = 0;
96447 +               struct inode *fake;
96448 +
96449 +               fake = get_super_fake(sb);
96450 +               if (fake != NULL) {
96451 +                       struct address_space *mapping;
96452 +
96453 +                       mapping = fake->i_mapping;
96454 +                       /* do not put more requests to overload write queue */
96455 +                       if (wbc->nonblocking &&
96456 +                           bdi_write_congested(mapping->backing_dev_info)) {
96457 +
96458 +                               blk_run_address_space(mapping);
96459 +                               /*blk_run_queues();*/
96460 +                               wbc->encountered_congestion = 1;
96461 +                               break;
96462 +                       }
96463 +               }
96464 +
96465 +               /* nr_submitted is filled in by flush_some_atom() */
96466 +               flush_some_atom(&nr_submitted, wbc, JNODE_FLUSH_WRITE_BLOCKS);
96467 +               if (!nr_submitted)
96468 +                       /* no progress was reported: leave the loop */
96469 +                       break;
96470 +
96471 +               /* charge what was submitted against the caller's budget */
96472 +               wbc->nr_to_write -= nr_submitted;
96473 +
96474 +       } while (wbc->nr_to_write > 0);
96475 +
96476 +}
96477 +
96478 +/* ->sync_inodes() method. This is called by pdflush, and synchronous
96479 + * writeback (throttling by balance_dirty_pages()). inode_lock is dropped and retaken inside. */
96480 +static void
96481 +reiser4_sync_inodes(struct super_block * sb, struct writeback_control * wbc)
96482 +{
96483 +       reiser4_context ctx;
96484 +
96485 +       init_context(&ctx, sb);
96486 +       wbc->older_than_this = NULL;
96487 +
96488 +       /*
96489 +        * What we are trying to do here is to capture all "anonymous" pages.
96490 +        */
96491 +       capture_reiser4_inodes(sb, wbc);
96492 +       spin_unlock(&inode_lock);       /* presumably held by the caller on entry - TODO confirm */
96493 +       writeout(sb, wbc);
96494 +
96495 +       /* avoid recursive calls to ->sync_inodes */
96496 +       context_set_commit_async(&ctx);
96497 +       reiser4_exit_context(&ctx);
96498 +       spin_lock(&inode_lock);         /* return with inode_lock held again */
96499 +}
96500 +
96501 +/* ->delete_inode() super operation: let the file plugin delete the object
96502 + * (loaded inodes only), then zero ->i_blocks and clear_inode() it. */
96503 +static void
96504 +reiser4_delete_inode(struct inode *object)
96505 +{
96506 +       reiser4_context ctx;
96507 +       file_plugin *plug = NULL;
96508 +
96509 +       init_context(&ctx, object->i_sb);
96510 +       reiser4_stat_inc(vfs_calls.delete_inode);
96511 +
96512 +       if (is_inode_loaded(object))
96513 +               plug = inode_file_plugin(object);
96514 +       if (plug != NULL && plug->delete != NULL)
96515 +               plug->delete(object);
96516 +
96517 +       object->i_blocks = 0;
96518 +       clear_inode(object);
96519 +       reiser4_exit_context(&ctx);
96520 +}
96521 +
96522 +/* ->clear_inode() super operation (judging by the name - TODO confirm): only asserts that no eflushed or other jnodes remain */
96523 +static void
96524 +reiser4_clear_inode(struct inode *object)
96525 +{
96526 +       reiser4_inode *r4_inode;
96527 +
96528 +       r4_inode = reiser4_inode_data(object);
96529 +       assert("vs-1688", (r4_inode->anonymous_eflushed == 0 &&
96530 +                          r4_inode->captured_eflushed == 0 &&
96531 +                          r4_inode->nr_jnodes == 0));
96532 +}
96533 +
96534 +const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";    /* magic string, see REISER4_MAGIC_OFFSET */
96535 +const int REISER4_MAGIC_OFFSET = 16 * 4096;    /* offset to magic string from the
96536 +                                                * beginning of device (65536; presumably bytes - TODO confirm) */
96537 +
96538 +/* type of option parseable by parse_option() */
96539 +typedef enum {
96540 +       /* value of option is arbitrary string */
96541 +       OPT_STRING,
96542 +       /* option specifies bit in a bitmask */
96543 +       OPT_BIT,
96544 +       /* value of option is read with sscanf() using the format given in the description */
96545 +       OPT_FORMAT,
96546 +       /* option can take one of predefined values */
96547 +       OPT_ONEOF,
96548 +} opt_type_t;
96549 +
96550 +typedef struct opt_bitmask_bit {       /* element type of the ->u.bitmask.bits array in opt_desc_t */
96551 +       const char *bit_name;           /* name of the bit */
96552 +       int bit_nr;                     /* number of the bit */
96553 +} opt_bitmask_bit;
96554 +
96555 +/* description of option parseable by parse_option() */
96556 +typedef struct opt_desc {
96557 +       /* option name.
96558 +
96559 +          parsed portion of string has a form "name=value".
96560 +       */
96561 +       const char *name;
96562 +       /* type of option; selects which member of ->u is used */
96563 +       opt_type_t type;
96564 +       union {
96565 +               /* where to store value of string option (type == OPT_STRING) */
96566 +               char **string;
96567 +               /* description of bits for bit option (type == OPT_BIT) */
96568 +               struct {
96569 +                       int nr;         /* bit number passed to set_bit() */
96570 +                       void *addr;     /* bitmap the bit is set in */
96571 +               } bit;
96572 +               /* description of format and targets for format option (type
96573 +                  == OPT_FORMAT) */
96574 +               struct {
96575 +                       const char *format;     /* sscanf() format */
96576 +                       int nr_args;            /* number of conversions that must succeed */
96577 +                       void *arg1;
96578 +                       void *arg2;
96579 +                       void *arg3;
96580 +                       void *arg4;
96581 +               } f;
96582 +               struct {        /* type == OPT_ONEOF */
96583 +                       int *result;            /* receives index of the matching ->list entry */
96584 +                       const char *list[10];   /* accepted values; parse_option() scans up to a NULL entry */
96585 +               } oneof;
96586 +               struct {        /* not used by parse_option() as visible in this file section */
96587 +                       void *addr;
96588 +                       int nr_bits;
96589 +                       opt_bitmask_bit *bits;
96590 +               } bitmask;
96591 +       } u;
96592 +} opt_desc_t;
96593 +
96594 +/* Parse one "name" or "name=value" option in @opt_string (modified in place: the
96595 +   '=' is replaced by '\0') as described by @opt. Returns 0 or RETERR(-EINVAL). */
96596 +static int
96597 +parse_option(char *opt_string /* starting point of parsing */ ,
96598 +            opt_desc_t * opt /* option description */ )
96599 +{
96600 +       /* foo=bar,
96601 +          ^   ^  ^
96602 +          |   |  +-- replaced to '\0' (by parse_options())
96603 +          |   +-- val_start
96604 +          +-- opt_string
96605 +       */
96606 +       char *val_start;
96607 +       int result;
96608 +       const char *err_msg;
96609 +
96610 +       /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
96611 +       val_start = strchr(opt_string, '=');
96612 +       if (val_start != NULL) {
96613 +               *val_start = '\0';
96614 +               ++val_start;
96615 +       }
96616 +
96617 +       err_msg = NULL;
96618 +       result = 0;
96619 +       switch (opt->type) {
96620 +       case OPT_STRING:
96621 +               if (val_start == NULL) {
96622 +                       err_msg = "String arg missing";
96623 +                       result = RETERR(-EINVAL);
96624 +               } else
96625 +                       *opt->u.string = val_start;
96626 +               break;
96627 +       case OPT_BIT:
96628 +               if (val_start != NULL)
96629 +                       err_msg = "Value ignored";
96630 +               else
96631 +                       set_bit(opt->u.bit.nr, opt->u.bit.addr);
96632 +               break;
96633 +       case OPT_FORMAT:
96634 +               if (val_start == NULL) {
96635 +                       err_msg = "Formatted arg missing";
96636 +                       result = RETERR(-EINVAL);
96637 +                       break;
96638 +               }
96639 +               if (sscanf(val_start, opt->u.f.format,
96640 +                          opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3, opt->u.f.arg4) != opt->u.f.nr_args) {
96641 +                       err_msg = "Wrong conversion";
96642 +                       result = RETERR(-EINVAL);
96643 +               }
96644 +               break;
96645 +       case OPT_ONEOF:{
96646 +                       int i = 0;
96647 +                       err_msg = "Wrong option value";
96648 +                       result = RETERR(-EINVAL);
96649 +                       while (val_start != NULL && opt->u.oneof.list[i] != NULL) {
96650 +                               if (!strcmp(opt->u.oneof.list[i], val_start)) {
96651 +                                       err_msg = NULL; /* match: no warning, no error */
96652 +                                       result = 0;
96653 +                                       *opt->u.oneof.result = i;
96654 +                                       break;
96655 +                               }
96656 +                               i++;
96657 +                       }
96658 +                       break;
96659 +                      }
96660 +       default:
96661 +               wrong_return_value("nikita-2100", "opt -> type");
96662 +               break;
96663 +       }
96664 +       if (err_msg != NULL) {
96665 +               warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
96666 +                       err_msg, opt->name, val_start ? "=" : "", val_start ? : "");
96667 +       }
96668 +       return result;
96669 +}
96670 +
96671 +/* Parse comma-separated options in @opt_string (modified in place), stopping at the
96672 +   first error. An option selects opts[j] only if its name is exactly opts[j].name. */
96673 +static int
96674 +parse_options(char *opt_string /* starting point */ ,
96675 +             opt_desc_t * opts /* array with option description */ ,
96676 +             int nr_opts /* number of elements in @opts */ )
96677 +{
96678 +       int result = 0;
96679 +
96680 +       while ((result == 0) && opt_string && *opt_string) {
96681 +               int j;
96682 +               char *next = strchr(opt_string, ',');
96683 +
96684 +               if (next != NULL) {
96685 +                       *next = '\0';
96686 +                       ++next;
96687 +               }
96688 +               for (j = 0; j < nr_opts; ++j) {
96689 +                       size_t len = strlen(opts[j].name);      /* name must end at '\0' or '=' */
96690 +                       if (!strncmp(opt_string, opts[j].name, len) &&
96691 +                           (opt_string[len] == '\0' || opt_string[len] == '=')) {
96692 +                               result = parse_option(opt_string, &opts[j]);
96693 +                               break;
96694 +                       }
96695 +               }
96696 +               if (j == nr_opts) {
96697 +                       warning("nikita-2307", "Unrecognized option: \"%s\"", opt_string);
96698 +                       /* traditionally, -EINVAL is returned on wrong mount option */
96699 +                       result = RETERR(-EINVAL);
96700 +               }
96701 +               opt_string = next;
96702 +       }
96703 +       return result;
96704 +}
96705 +
96706 +#define NUM_OPT( label, fmt, addr )    /* OPT_FORMAT entry, one sscanf() target */     \
96707 +               {                                               \
96708 +                       .name = ( label ),                      \
96709 +                       .type = OPT_FORMAT,                     \
96710 +                       .u = {                                  \
96711 +                               .f = {                          \
96712 +                                       .format  = ( fmt ),     \
96713 +                                       .nr_args = 1,           \
96714 +                                       .arg1 = ( addr ),       \
96715 +                                       .arg2 = NULL,           \
96716 +                                       .arg3 = NULL,           \
96717 +                                       .arg4 = NULL            \
96718 +                               }                               \
96719 +                       }                                       \
96720 +               }
96721 +/* NUM_OPT whose name is the stringified @field of a local `sbinfo' that must be in scope where it is used */
96722 +#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field )
96723 +/* OPT_BIT entry setting bit @bitnr in sbinfo->fs_flags; also needs a local `sbinfo' in scope */
96724 +#define BIT_OPT(label, bitnr)                                  \
96725 +       {                                                       \
96726 +               .name = label,                                  \
96727 +               .type = OPT_BIT,                                \
96728 +               .u = {                                          \
96729 +                       .bit = {                                \
96730 +                               .nr = bitnr,                    \
96731 +                               .addr = &sbinfo->fs_flags       \
96732 +                       }                                       \
96733 +               }                                               \
96734 +       }
96735 +
96736 +/* Parse mount options in @opt_string into the private super block data of @s: defaults are set first, then parse_options() overrides them. Returns 0 or an error code. */
96737 +reiser4_internal int
96738 +reiser4_parse_options(struct super_block *s, char *opt_string)
96739 +{
96740 +       int result;
96741 +       reiser4_super_info_data *sbinfo = get_super_private(s);
96742 +       char *log_file_name;
96743 +
96744 +       opt_desc_t opts[] = {
96745 +               /* trace_flags=N
96746 +
96747 +                  set trace flags to be N for this mount. N can be C numeric
96748 +                  literal recognized by %i scanf specifier.  It is treated as
96749 +                  bitfield filled by values of debug.h:reiser4_trace_flags
96750 +                  enum
96751 +               */
96752 +               SB_FIELD_OPT(trace_flags, "%i"),
96753 +               /* log_flags=N
96754 +
96755 +                  set log flags to be N for this mount. N can be C numeric
96756 +                  literal recognized by %i scanf specifier.  It is treated as
96757 +                  bitfield filled by values of debug.h:reiser4_log_flags
96758 +                  enum
96759 +               */
96760 +               SB_FIELD_OPT(log_flags, "%i"),
96761 +               /* debug_flags=N
96762 +
96763 +                  set debug flags to be N for this mount. N can be C numeric
96764 +                  literal recognized by %i scanf specifier.  It is treated as
96765 +                  bitfield filled by values of debug.h:reiser4_debug_flags
96766 +                  enum
96767 +               */
96768 +               SB_FIELD_OPT(debug_flags, "%i"),
96769 +               /* tmgr.atom_max_size=N
96770 +
96771 +                  Atoms containing more than N blocks will be forced to
96772 +                  commit. N is decimal.
96773 +               */
96774 +               SB_FIELD_OPT(tmgr.atom_max_size, "%u"),
96775 +               /* tmgr.atom_max_age=N
96776 +
96777 +                  Atoms older than N seconds will be forced to commit. N is
96778 +                  decimal.
96779 +               */
96780 +               SB_FIELD_OPT(tmgr.atom_max_age, "%u"),
96781 +               /* tmgr.atom_max_flushers=N
96782 +
96783 +                  limit of concurrent flushers for one atom. 0 means no limit.
96784 +                */
96785 +               SB_FIELD_OPT(tmgr.atom_max_flushers, "%u"),
96786 +               /* tree.cbk_cache.nr_slots=N
96787 +
96788 +                  Number of slots in the cbk cache.
96789 +               */
96790 +               SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u"),
96791 +
96792 +               /* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent
96793 +                  dirty leaf-level blocks it will force them to be
96794 +                  relocated. */
96795 +               SB_FIELD_OPT(flush.relocate_threshold, "%u"),
96796 +               /* If flush can find a block allocation closer than at
96797 +                  most FLUSH_RELOCATE_DISTANCE from the preceder it will
96798 +                  relocate to that position. */
96799 +               SB_FIELD_OPT(flush.relocate_distance, "%u"),
96800 +               /* If we have written this much or more blocks before
96801 +                  encountering busy jnode in flush list - abort flushing
96802 +                  hoping that next time we get called this jnode will be
96803 +                  clean already, and we will save some seeks. */
96804 +               SB_FIELD_OPT(flush.written_threshold, "%u"),
96805 +               /* The maximum number of nodes to scan left on a level during
96806 +                  flush. */
96807 +               SB_FIELD_OPT(flush.scan_maxnodes, "%u"),
96808 +
96809 +               /* preferred IO size */
96810 +               SB_FIELD_OPT(optimal_io_size, "%u"),
96811 +
96812 +               /* carry flags used for insertion of new nodes */
96813 +               SB_FIELD_OPT(tree.carry.new_node_flags, "%u"),
96814 +               /* carry flags used for insertion of new extents */
96815 +               SB_FIELD_OPT(tree.carry.new_extent_flags, "%u"),
96816 +               /* carry flags used for paste operations */
96817 +               SB_FIELD_OPT(tree.carry.paste_flags, "%u"),
96818 +               /* carry flags used for insert operations */
96819 +               SB_FIELD_OPT(tree.carry.insert_flags, "%u"),
96820 +
96821 +#ifdef CONFIG_REISER4_BADBLOCKS
96822 +               /* Alternative master superblock location in case its original
96823 +                  location is not writeable/accessible. This is offset in BYTES. */
96824 +               SB_FIELD_OPT(altsuper, "%lu"),
96825 +#endif
96826 +
96827 +               /* turn on BSD-style gid assignment */
96828 +               BIT_OPT("bsdgroups", REISER4_BSD_GID),
96829 +               /* turn on 32 bit times */
96830 +               BIT_OPT("32bittimes", REISER4_32_BIT_TIMES),
96831 +               /* sets REISER4_MTFLUSH; note the same bit is forced on at the end of this function */
96832 +               BIT_OPT("mtflush", REISER4_MTFLUSH),
96833 +               /* disable pseudo files support */
96834 +               BIT_OPT("nopseudo", REISER4_NO_PSEUDO),
96835 +               /* Don't load all bitmap blocks at mount time, it is useful
96836 +                  for machines with tiny RAM and large disks. */
96837 +               BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP),
96838 +
96839 +               {
96840 +                       /* tree traversal readahead parameters:
96841 +                          -o readahead=MAXNUM:FLAGS
96842 +                          MAXNUM - max number of nodes to request readahead for: -1UL will set it to max_sane_readahead()
96843 +                          FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS, CONTINUE_ON_PRESENT
96844 +                       */
96845 +                       .name = "readahead",
96846 +                       .type = OPT_FORMAT,
96847 +                       .u = {
96848 +                               .f = {
96849 +                                       .format  = "%u:%u",
96850 +                                       .nr_args = 2,
96851 +                                       .arg1 = &sbinfo->ra_params.max,
96852 +                                       .arg2 = &sbinfo->ra_params.flags,
96853 +                                       .arg3 = NULL,
96854 +                                       .arg4 = NULL
96855 +                               }
96856 +                       }
96857 +               },
96858 +               /* What to do in case of fs error; the index of the chosen value is stored in sbinfo->onerror */
96859 +               {
96860 +                        .name = "onerror",
96861 +                        .type = OPT_ONEOF,
96862 +                        .u = {
96863 +                                .oneof = {
96864 +                                        .result = &sbinfo->onerror,
96865 +                                        .list = {"panic", "remount-ro", "reboot", NULL},
96866 +                                }
96867 +                        }
96868 +                },
96869 +
96870 +#if REISER4_LOG
96871 +               {
96872 +                       .name = "log_file",
96873 +                       .type = OPT_STRING,
96874 +                       .u = {
96875 +                               .string = &log_file_name
96876 +                       }
96877 +               },
96878 +#endif
96879 +       };
96880 +
96881 +       sbinfo->tmgr.atom_max_size = txnmgr_get_max_atom_size(s);
96882 +       sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;  /* divided by HZ here, multiplied back after parsing */
96883 +
96884 +       sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
96885 +
96886 +       sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
96887 +       sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
96888 +       sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
96889 +       sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
96890 +
96891 +
96892 +       sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
96893 +
96894 +       sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
96895 +       sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
96896 +       sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
96897 +       sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
96898 +
96899 +       log_file_name = NULL;
96900 +
96901 +       /*
96902 +         init default readahead params
96903 +       */
96904 +       sbinfo->ra_params.max = num_physpages / 4;
96905 +       sbinfo->ra_params.flags = 0;
96906 +
96907 +       result = parse_options(opt_string, opts, sizeof_array(opts));
96908 +       if (result != 0)
96909 +               return result;
96910 +
96911 +       sbinfo->tmgr.atom_max_age *= HZ;
96912 +       if (sbinfo->tmgr.atom_max_age <= 0)
96913 +               /* overflow */
96914 +               sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
96915 +
96916 +       /* round optimal io size down to a multiple of (1 << VFS_BLKSIZE_BITS) bytes */
96917 +       sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
96918 +       sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
96919 +       if (sbinfo->optimal_io_size == 0) {
96920 +               warning("nikita-2497", "optimal_io_size is too small");
96921 +               return RETERR(-EINVAL);
96922 +       }
96923 +#if REISER4_LOG
96924 +       if (log_file_name != NULL)
96925 +               result = open_log_file(s, log_file_name, REISER4_TRACE_BUF_SIZE, &sbinfo->log_file);
96926 +       else
96927 +               sbinfo->log_file.type = log_to_bucket;
96928 +#endif
96929 +
96930 +       /* disable single-threaded flush as it leads to deadlock */
96931 +       sbinfo->fs_flags |= (1 << REISER4_MTFLUSH);
96932 +       return result;
96933 +}
96934 +
96935 +/* show mount options in /proc/mounts, spelled the way reiser4_parse_options() accepts them; always returns 0 */
96936 +static int
96937 +reiser4_show_options(struct seq_file *m, struct vfsmount *mnt)
96938 +{
96939 +       struct super_block *super;
96940 +       reiser4_super_info_data *sbinfo;
96941 +
96942 +       super = mnt->mnt_sb;
96943 +       sbinfo = get_super_private(super);
96944 +
96945 +       seq_printf(m, ",trace_flags=0x%x", sbinfo->trace_flags);
96946 +       seq_printf(m, ",log_flags=0x%x", sbinfo->log_flags);
96947 +       seq_printf(m, ",debug_flags=0x%x", sbinfo->debug_flags);
96948 +       seq_printf(m, ",tmgr.atom_max_size=%u", sbinfo->tmgr.atom_max_size);  /* parsed with "%u", so decimal */
96949 +
96950 +       return 0;
96951 +}
96952 +
96953 +/*
96954 + * Lock profiling code.
96955 + *
96956 + * see spin_macros.h and spinprof.[ch]
96957 + *
96958 + */
96959 +
96960 +/* define profiling regions for spin lock types */
96961 +DEFINE_SPIN_PROFREGIONS(epoch);
96962 +DEFINE_SPIN_PROFREGIONS(jnode);
96963 +DEFINE_SPIN_PROFREGIONS(jload);
96964 +DEFINE_SPIN_PROFREGIONS(stack);
96965 +DEFINE_SPIN_PROFREGIONS(super);
96966 +DEFINE_SPIN_PROFREGIONS(atom);
96967 +DEFINE_SPIN_PROFREGIONS(txnh);
96968 +DEFINE_SPIN_PROFREGIONS(txnmgr);
96969 +DEFINE_SPIN_PROFREGIONS(ktxnmgrd);
96970 +DEFINE_SPIN_PROFREGIONS(inode_object);
96971 +DEFINE_SPIN_PROFREGIONS(fq);
96972 +DEFINE_SPIN_PROFREGIONS(super_eflush);
96973 +
96974 +/* define profiling regions for read-write locks */
96975 +DEFINE_RW_PROFREGIONS(zlock);
96976 +DEFINE_RW_PROFREGIONS(dk);
96977 +DEFINE_RW_PROFREGIONS(tree);
96978 +DEFINE_RW_PROFREGIONS(cbk_cache);
96979 +
96980 +/* register profiling regions defined above; always returns 0 */
96981 +static int register_profregions(void)
96982 +{
96983 +       register_super_eflush_profregion();
96984 +       register_epoch_profregion();
96985 +       register_jnode_profregion();
96986 +       register_jload_profregion();
96987 +       register_stack_profregion();
96988 +       register_super_profregion();
96989 +       register_atom_profregion();
96990 +       register_txnh_profregion();
96991 +       register_txnmgr_profregion();
96992 +       register_ktxnmgrd_profregion();
96993 +       register_inode_object_profregion();
96994 +       register_fq_profregion();
96995 +
96996 +       register_zlock_profregion();
96997 +       register_cbk_cache_profregion();
96998 +       register_dk_profregion();
96999 +       register_tree_profregion();
97000 +
97001 +       return 0;
97002 +}
97003 +
97004 +/* unregister profiling regions defined above */
97005 +static void unregister_profregions(void)
97006 +{
97007 +       unregister_super_eflush_profregion();
97008 +       unregister_epoch_profregion();
97009 +       unregister_jload_profregion();
97010 +       unregister_jnode_profregion();
97011 +       unregister_stack_profregion();
97012 +       unregister_super_profregion();
97013 +       unregister_atom_profregion();
97014 +       unregister_txnh_profregion();
97015 +       unregister_txnmgr_profregion();
97016 +       unregister_ktxnmgrd_profregion();
97017 +       unregister_inode_object_profregion();
97018 +       unregister_fq_profregion();
97019 +
97020 +       unregister_zlock_profregion();
97021 +       unregister_cbk_cache_profregion();
97022 +       unregister_dk_profregion();
97023 +       unregister_tree_profregion();
97024 +}
97025 +
97026 +/* ->write_super() method. Called by sync(2). */
97027 +static void
97028 +reiser4_write_super(struct super_block *s)
97029 +{
97030 +       int ret;
97031 +       reiser4_context ctx;
97032 +
97033 +       assert("vs-1700", !rofs_super(s));
97034 +
97035 +       init_context(&ctx, s);
97036 +       reiser4_stat_inc(vfs_calls.write_super);
97037 +
97038 +       ret = capture_super_block(s);
97039 +       if (ret != 0)
97040 +               warning("vs-1701",
97041 +                       "capture_super_block failed in write_super: %d", ret);
97042 +       ret = txnmgr_force_commit_all(s, 1);
97043 +       if (ret != 0)
97044 +               warning("jmacd-77113",
97045 +                       "txn_force failed in write_super: %d", ret);
97046 +
97047 +       s->s_dirt = 0;
97048 +
97049 +       reiser4_exit_context(&ctx);
97050 +}
97051 +
97052 +static void
97053 +reiser4_put_super(struct super_block *s)
97054 +{
97055 +       reiser4_super_info_data *sbinfo;
97056 +       reiser4_context context;
97057 +
97058 +       sbinfo = get_super_private(s);
97059 +       assert("vs-1699", sbinfo);
97060 +
97061 +       init_context(&context, s);
97062 +       done_reiser4_repacker(s);
97063 +       stop_ktxnmgrd(&sbinfo->tmgr);
97064 +       reiser4_sysfs_done(s);
97065 +
97066 +       /* let disk format plugin free its resources, if it has ->release() */
97067 +       if (get_super_private(s)->df_plug->release)
97068 +               get_super_private(s)->df_plug->release(s);
97069 +
97070 +       done_ktxnmgrd_context(&sbinfo->tmgr);
97071 +       done_entd_context(s);
97072 +
97073 +       check_block_counters(s);
97074 +
97075 +       rcu_barrier();
97076 +       /* NOTE(review): original comment said done_formatted_fake "just has
97077 +        * finished with last jnodes (bitmap ones)", but it is called below */
97078 +       done_tree(&sbinfo->tree);
97079 +       /* call rcu_barrier(), because some znodes were "released" in
97080 +        * done_tree(). */
97081 +       rcu_barrier();
97082 +       done_formatted_fake(s);
97083 +
97084 +       /* no assertions below this line */
97085 +       reiser4_exit_context(&context);
97086 +
97087 +       reiser4_stat_done(&sbinfo->stats);
97088 +
97089 +       kfree(sbinfo);
97090 +       s->s_fs_info = NULL; /* do not leave pointer to freed sbinfo behind */
97091 +}
97092 +
97093 +/* ->get_sb() method of file_system operations. */
97094 +static struct super_block *
97095 +reiser4_get_sb(struct file_system_type *fs_type        /* file
97096 +                                                * system
97097 +                                                * type */ ,
97098 +              int flags /* flags */ ,
97099 +              const char *dev_name /* device name */ ,
97100 +              void *data /* mount options */ )
97101 +{
97102 +       return get_sb_bdev(fs_type, flags, dev_name, data, reiser4_fill_super);
97103 +}
97104 +
97105 +int d_cursor_init(void);
97106 +void d_cursor_done(void);
97107 +
97108 +/*
97109 + * Reiser4 initialization/shutdown.
97110 + *
97111 + * Code below performs global reiser4 initialization that is done either as
97112 + * part of kernel initialization (when reiser4 is statically built-in), or
97113 + * during reiser4 module load (when compiled as module).
97114 + */
97115 +
97116 +/*
97117 + * Initialization stages for reiser4.
97118 + *
97119 + * These enumerate various things that have to be done during reiser4
97120 + * startup. Initialization code (init_reiser4()) keeps track of what stage was
97121 + * reached, so that proper undo can be done if error occurs during
97122 + * initialization.
97123 + */
97124 +typedef enum {
97125 +       INIT_NONE,               /* nothing is initialized yet */
97126 +       INIT_INODECACHE,         /* inode cache created */
97127 +       INIT_CONTEXT_MGR,        /* list of active contexts created */
97128 +       INIT_ZNODES,             /* znode slab created */
97129 +       INIT_PLUGINS,            /* plugins initialized */
97130 +       INIT_PLUGIN_SET,         /* psets initialized */
97131 +       INIT_TXN,                /* transaction manager initialized */
97132 +       INIT_FAKES,              /* fake inode initialized */
97133 +       INIT_JNODES,             /* jnode slab initialized */
97134 +       INIT_EFLUSH,             /* emergency flush initialized */
97135 +       INIT_SPINPROF,           /* spin lock profiling initialized */
97136 +       INIT_SYSFS,              /* sysfs exports initialized */
97137 +       INIT_LNODES,             /* lnodes initialized */
97138 +       INIT_FQS,                /* flush queues initialized */
97139 +       INIT_DENTRY_FSDATA,      /* dentry_fsdata slab initialized */
97140 +       INIT_FILE_FSDATA,        /* file_fsdata slab initialized */
97141 +       INIT_D_CURSOR,           /* d_cursor support initialized */
97142 +       INIT_FS_REGISTERED,      /* reiser4 file system type registered */
97143 +} reiser4_init_stage;
97144 +
97145 +static reiser4_init_stage init_stage;
97146 +
97147 +/* finish with reiser4: called at shutdown or module unload, and by init_reiser4() when a stage fails. */
97148 +static void
97149 +shutdown_reiser4(void)
97150 +{
97151 +#define DONE_IF( stage, exp )                  \
97152 +       if( init_stage == ( stage ) ) {         \
97153 +               exp;                            \
97154 +               -- init_stage;                  \
97155 +       }
97156 +
97157 +       /* undo initializations already done by init_reiser4(), in reverse
97158 +        * order: a DONE_IF() that matches init_stage decrements it, so the
97159 +        * DONE_IF() for the previous stage matches next. */
97160 +
97161 +       DONE_IF(INIT_FS_REGISTERED, unregister_filesystem(&reiser4_fs_type));
97162 +       DONE_IF(INIT_D_CURSOR, d_cursor_done());
97163 +       DONE_IF(INIT_FILE_FSDATA, done_file_fsdata());
97164 +       DONE_IF(INIT_DENTRY_FSDATA, done_dentry_fsdata());
97165 +       DONE_IF(INIT_FQS, done_fqs());
97166 +       DONE_IF(INIT_LNODES, lnodes_done());
97167 +       DONE_IF(INIT_SYSFS, reiser4_sysfs_done_once());
97168 +       DONE_IF(INIT_SPINPROF, unregister_profregions());
97169 +       DONE_IF(INIT_EFLUSH, eflush_done());
97170 +       DONE_IF(INIT_JNODES, jnode_done_static());
97171 +       DONE_IF(INIT_FAKES,;); /* nothing to undo, only step the stage back */
97172 +       DONE_IF(INIT_TXN, txnmgr_done_static());
97173 +       DONE_IF(INIT_PLUGIN_SET,plugin_set_done());
97174 +       DONE_IF(INIT_PLUGINS,;); /* nothing to undo, only step the stage back */
97175 +       DONE_IF(INIT_ZNODES, znodes_done());
97176 +       DONE_IF(INIT_CONTEXT_MGR,;); /* nothing to undo, only step the stage back */
97177 +       DONE_IF(INIT_INODECACHE, destroy_inodecache());
97178 +       assert("nikita-2516", init_stage == INIT_NONE);
97179 +
97180 +#undef DONE_IF
97181 +}
97182 +
97183 +/* initialize reiser4: this is called either at bootup or at module load. */
97184 +static int __init
97185 +init_reiser4(void)
97186 +{
97187 +#define CHECK_INIT_RESULT( exp )               \
97188 +({                                             \
97189 +       result = exp;                           \
97190 +       if( result == 0 )                       \
97191 +               ++ init_stage;                  \
97192 +       else {                                  \
97193 +               shutdown_reiser4();             \
97194 +               return result;                  \
97195 +       }                                       \
97196 +})
97197 +
97198 +       int result;
97199 +       /*
97200 +       printk(KERN_INFO
97201 +              "Loading Reiser4. "
97202 +              "See www.namesys.com for a description of Reiser4.\n");
97203 +       */
97204 +       init_stage = INIT_NONE;
97205 +
97206 +       CHECK_INIT_RESULT(init_inodecache());
97207 +       CHECK_INIT_RESULT(init_context_mgr());
97208 +       CHECK_INIT_RESULT(znodes_init());
97209 +       CHECK_INIT_RESULT(init_plugins());
97210 +       CHECK_INIT_RESULT(plugin_set_init());
97211 +       CHECK_INIT_RESULT(txnmgr_init_static());
97212 +       CHECK_INIT_RESULT(init_fakes());
97213 +       CHECK_INIT_RESULT(jnode_init_static());
97214 +       CHECK_INIT_RESULT(eflush_init());
97215 +       CHECK_INIT_RESULT(register_profregions());
97216 +       CHECK_INIT_RESULT(reiser4_sysfs_init_once());
97217 +       CHECK_INIT_RESULT(lnodes_init());
97218 +       CHECK_INIT_RESULT(init_fqs());
97219 +       CHECK_INIT_RESULT(init_dentry_fsdata());
97220 +       CHECK_INIT_RESULT(init_file_fsdata());
97221 +       CHECK_INIT_RESULT(d_cursor_init());
97222 +       CHECK_INIT_RESULT(register_filesystem(&reiser4_fs_type));
97223 +
97224 +       calibrate_prof();
97225 +
97226 +       assert("nikita-2515", init_stage == INIT_FS_REGISTERED);
97227 +       return 0;
97228 +#undef CHECK_INIT_RESULT
97229 +}
97230 +
97231 +static void __exit
97232 +done_reiser4(void)
97233 +{
97234 +       shutdown_reiser4();
97235 +}
97236 +
97237 +reiser4_internal void reiser4_handle_error(void)
97238 +{
97239 +       struct super_block *sb = reiser4_get_current_sb();
97240 +
97241 +       if ( !sb ) /* no current super block: nothing to act on */
97242 +               return;
97243 +       reiser4_status_write(REISER4_STATUS_DAMAGED, 0, "Filesystem error occured");
97244 +       switch ( get_super_private(sb)->onerror ) { /* NOTE(review): no default, other values do nothing more */
97245 +       case 0: /* panic */
97246 +               reiser4_panic("foobar-42", "Filesystem error occured\n"); /* NOTE(review): no break; falls into case 1 if this returns */
97247 +       case 1: /* set MS_RDONLY on the super block, unless it is already set */
97248 +               if ( sb->s_flags & MS_RDONLY )
97249 +                       return;
97250 +               sb->s_flags |= MS_RDONLY;
97251 +               break;
97252 +       case 2: /* restart the machine */
97253 +               machine_restart(NULL);
97254 +       }
97255 +}
97256 +
97257 +module_init(init_reiser4);
97258 +module_exit(done_reiser4);
97259 +
97260 +MODULE_DESCRIPTION("Reiser4 filesystem");
97261 +MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
97262 +
97263 +MODULE_LICENSE("GPL");
97264 +
97265 +/* description of the reiser4 file system type in the VFS eyes. */
97266 +struct file_system_type reiser4_fs_type = {
97267 +       .owner = THIS_MODULE,
97268 +       .name = "reiser4",
97269 +       .fs_flags = FS_REQUIRES_DEV,
97270 +       .get_sb = reiser4_get_sb,
97271 +       .kill_sb = kill_block_super,/*reiser4_kill_super,*/
97272 +       .next = NULL
97273 +};
97274 +
97275 +struct super_operations reiser4_super_operations = {
97276 +       .alloc_inode = reiser4_alloc_inode,
97277 +       .destroy_inode = reiser4_destroy_inode,
97278 +       .read_inode = noop_read_inode,
97279 +       .dirty_inode = NULL,
97280 +       .write_inode = NULL,
97281 +       .put_inode = NULL,
97282 +       .drop_inode = reiser4_drop_inode,
97283 +       .delete_inode = reiser4_delete_inode,
97284 +       .put_super = reiser4_put_super,
97285 +       .write_super = reiser4_write_super,
97286 +       .sync_fs = NULL,
97287 +       .write_super_lockfs = NULL,
97288 +       .unlockfs           = NULL,
97289 +       .statfs = reiser4_statfs,
97290 +       .remount_fs         = NULL,
97291 +       .clear_inode  = reiser4_clear_inode,
97292 +       .umount_begin       = NULL,
97293 +       .sync_inodes = reiser4_sync_inodes,
97294 +       .show_options = reiser4_show_options
97295 +};
97296 +
97297 +/*
97298 + * Object serialization support.
97299 + *
97300 + * To support knfsd file system provides export_operations that are used to
97301 + * construct and interpret NFS file handles. As a generalization of this,
97302 + * reiser4 object plugins have serialization support: it provides methods to
97303 + * create on-wire representation of identity of reiser4 object, and
97304 + * re-create/locate object given its on-wire identity.
97305 + *
97306 + */
97307 +
97308 +/*
97309 + * return number of bytes that on-wire representation of @inode's identity
97310 + * consumes.
97311 + */
97312 +static int
97313 +encode_inode_size(struct inode *inode)
97314 +{
97315 +       assert("nikita-3514", inode != NULL);
97316 +       assert("nikita-3515", inode_file_plugin(inode) != NULL);
97317 +       assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
97318 +
97319 +       return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
97320 +}
97321 +
97322 +/*
97323 + * store on-wire representation of @inode's identity at the area beginning at
97324 + * @start.
97325 + */
97326 +static char *
97327 +encode_inode(struct inode *inode, char *start)
97328 +{
97329 +       assert("nikita-3517", inode != NULL);
97330 +       assert("nikita-3518", inode_file_plugin(inode) != NULL);
97331 +       assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
97332 +
97333 +       /*
97334 +        * first, store two-byte identifier of object plugin, then
97335 +        */
97336 +       save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
97337 +                      (d16 *)start);
97338 +       start += sizeof(d16);
97339 +       /*
97340 +        * call plugin to serialize object's identity
97341 +        */
97342 +       return inode_file_plugin(inode)->wire.write(inode, start);
97343 +}
97344 +
97345 +/*
97346 + * Supported file-handle types
97347 + */
97348 +typedef enum {
97349 +       FH_WITH_PARENT    = 0x10,  /* file handle with parent */
97350 +       FH_WITHOUT_PARENT = 0x11   /* file handle without parent */
97351 +} reiser4_fhtype;
97352 +
97353 +#define NFSERROR (255)
97354 +
97355 +/* returns type of file handle built (reiser4_fhtype) and stores in @lenp number
97356 + * of 32 bit words it occupies. 255 is returned if file handle can not be stored */
97357 +static int
97358 +reiser4_encode_fh(struct dentry *dentry, __u32 *data, int *lenp, int need_parent)
97359 +{
97360 +       struct inode *inode;
97361 +       struct inode *parent;
97362 +       char *addr;
97363 +       int need;
97364 +       int delta;
97365 +       int result;
97366 +       reiser4_context context;
97367 +
97368 +       /*
97369 +        * knfsd asks us to serialize object in @dentry, and, optionally its
97370 +        * parent (if need_parent != 0).
97371 +        *
97372 +        * encode_inode() and encode_inode_size() are used to build
97373 +        * representation of object and its parent. All hard work is done by
97374 +        * object plugins.
97375 +        */
97376 +
97377 +       inode = dentry->d_inode;
97378 +       parent = dentry->d_parent->d_inode;
97379 +
97380 +       addr = (char *)data;
97381 +
97382 +       need = encode_inode_size(inode); /* bytes needed for @inode */
97383 +       if (need < 0)
97384 +               return NFSERROR;
97385 +       if (need_parent) {
97386 +               delta = encode_inode_size(parent); /* bytes needed for parent */
97387 +               if (delta < 0)
97388 +                       return NFSERROR;
97389 +               need += delta;
97390 +       }
97391 +
97392 +       init_context(&context, dentry->d_inode->i_sb);
97393 +
97394 +       if (need <= sizeof(__u32) * (*lenp)) {
97395 +               addr = encode_inode(inode, addr);
97396 +               if (need_parent)
97397 +                       addr = encode_inode(parent, addr);
97398 +
97399 +               /* store in lenp number of 32bit words required for file
97400 +                * handle. */
97401 +               *lenp = (need + sizeof(__u32) - 1) >> 2;
97402 +               result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
97403 +       } else
97404 +               /* not enough space in file handle */
97405 +               result = NFSERROR;
97406 +       reiser4_exit_context(&context);
97407 +       return result;
97408 +}
97409 +
97410 +/*
97411 + * read serialized object identity from @addr and store information about
97412 + * object in @obj. This is dual to encode_inode().
97413 + */
97414 +static char *
97415 +decode_inode(struct super_block *s, char *addr, reiser4_object_on_wire *obj)
97416 +{
97417 +       file_plugin *fplug;
97418 +
97419 +       /* identifier of object plugin is stored in the first two bytes,
97420 +        * followed by... */
97421 +       fplug = file_plugin_by_disk_id(get_tree(s), (d16 *)addr);
97422 +       if (fplug != NULL) {
97423 +               addr += sizeof(d16);
97424 +               obj->plugin = fplug;
97425 +               assert("nikita-3520", fplug->wire.read != NULL);
97426 +               /* plugin specific encoding of object identity. */
97427 +               addr = fplug->wire.read(addr, obj);
97428 +       } else
97429 +               addr = ERR_PTR(RETERR(-EINVAL));
97430 +       return addr;
97431 +}
97432 +
97433 +/* initialize place-holder for object */
97434 +static void
97435 +object_on_wire_init(reiser4_object_on_wire *o)
97436 +{
97437 +       o->plugin = NULL;
97438 +}
97439 +
97440 +/* finish with @o */
97441 +static void
97442 +object_on_wire_done(reiser4_object_on_wire *o)
97443 +{
97444 +       if (o->plugin != NULL)
97445 +               o->plugin->wire.done(o);
97446 +}
97447 +
97448 +/* decode knfsd file handle. This is dual to reiser4_encode_fh(). Errors are returned as ERR_PTR(). */
97449 +static struct dentry *
97450 +reiser4_decode_fh(struct super_block *s, __u32 *data, int len, int fhtype,
97451 +                 int (*acceptable)(void *context, struct dentry *de), void *context)
97452 +{
97453 +       reiser4_context ctx;
97454 +       reiser4_object_on_wire object, parent;
97455 +       char *addr;
97456 +       int   with_parent;
97457 +
97458 +       init_context(&ctx, s);
97459 +
97460 +       assert("vs-1482", fhtype == FH_WITH_PARENT || fhtype == FH_WITHOUT_PARENT);
97461 +
97462 +       with_parent = (fhtype == FH_WITH_PARENT);
97463 +       addr = (char *)data;
97464 +
97465 +       object_on_wire_init(&object);
97466 +       object_on_wire_init(&parent);
97467 +
97468 +       addr = decode_inode(s, addr, &object);
97469 +       if (!IS_ERR(addr) && with_parent)
97470 +               addr = decode_inode(s, addr, &parent);
97471 +       if (!IS_ERR(addr)) {
97472 +               struct dentry *d;
97473 +               reiser4_dentry_fsdata *fsdata;
97474 +               typeof(s->s_export_op->find_exported_dentry) fn;
97475 +
97476 +               fn = s->s_export_op->find_exported_dentry;
97477 +               assert("nikita-3521", fn != NULL);
97478 +               d = fn(s, &object, with_parent ? &parent : NULL, acceptable, context);
97479 +               if (d != NULL && !IS_ERR(d)) {
97480 +                       /* NOTE(review): assumes allocation failure is reported as
97481 +                        * ERR_PTR(); then drop @d and return that error instead */
97482 +                       fsdata = reiser4_get_dentry_fsdata(d);
97483 +                       if (IS_ERR(fsdata)) {
97484 +                               dput(d);
97485 +                               d = (struct dentry *)fsdata;
97486 +                       } else
97487 +                               fsdata->stateless = 1;
97488 +               }
97489 +               addr = (char *)d;
97490 +       }
97491 +
97492 +       object_on_wire_done(&object);
97493 +       object_on_wire_done(&parent);
97494 +
97495 +       reiser4_exit_context(&ctx);
97496 +       return (void *)addr;
97497 +}
97498 +
97499 +static struct dentry *
97500 +reiser4_get_dentry(struct super_block *sb, void *data)
97501 +{
97502 +       reiser4_object_on_wire *o;
97503 +
97504 +       assert("nikita-3522", sb != NULL);
97505 +       assert("nikita-3523", data != NULL);
97506 +       /*
97507 +        * this is only supposed to be called by
97508 +        *
97509 +        *     reiser4_decode_fh->find_exported_dentry
97510 +        *
97511 +        * so, reiser4_context should be here already.
97512 +        *
97513 +        */
97514 +       assert("nikita-3526", is_in_reiser4_context());
97515 +
97516 +       o = (reiser4_object_on_wire *)data;
97517 +       assert("nikita-3524", o->plugin != NULL);
97518 +       assert("nikita-3525", o->plugin->wire.get != NULL);
97519 +
97520 +       return o->plugin->wire.get(sb, o);
97521 +}
97522 +
97523 +static struct dentry *
97524 +reiser4_get_dentry_parent(struct dentry *child)
97525 +{
97526 +       struct inode *dir;
97527 +       dir_plugin *dplug;
97528 +
97529 +       assert("nikita-3527", child != NULL);
97530 +       /* see comment in reiser4_get_dentry() about following assertion */
97531 +       assert("nikita-3528", is_in_reiser4_context());
97532 +
97533 +       dir = child->d_inode;
97534 +       assert("nikita-3529", dir != NULL);
97535 +       dplug = inode_dir_plugin(dir);
97536 +       assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
97537 +       if (dplug != NULL)
97538 +               return dplug->get_parent(dir);
97539 +       else
97540 +               return ERR_PTR(RETERR(-ENOTDIR));
97541 +}
97542 +
97543 +struct export_operations reiser4_export_operations = {
97544 +       .encode_fh = reiser4_encode_fh,
97545 +       .decode_fh = reiser4_decode_fh,
97546 +       .get_parent = reiser4_get_dentry_parent,
97547 +       .get_dentry = reiser4_get_dentry
97548 +};
97549 +
97550 +struct dentry_operations reiser4_dentry_operations = {
97551 +       .d_revalidate = NULL,
97552 +       .d_hash = NULL,
97553 +       .d_compare = NULL,
97554 +       .d_delete = NULL,
97555 +       .d_release = reiser4_d_release,
97556 +       .d_iput = NULL,
97557 +};
97558 +
97559 +/* Make Linus happy.
97560 +   Local variables:
97561 +   c-indentation-style: "K&R"
97562 +   mode-name: "LC"
97563 +   c-basic-offset: 8
97564 +   tab-width: 8
97565 +   fill-column: 120
97566 +   End:
97567 +*/
97568 diff -rupN linux-2.6.8-rc3/fs/reiser4/vfs_ops.h linux-2.6.8-rc3-a/fs/reiser4/vfs_ops.h
97569 --- linux-2.6.8-rc3/fs/reiser4/vfs_ops.h        1970-01-01 03:00:00.000000000 +0300
97570 +++ linux-2.6.8-rc3-a/fs/reiser4/vfs_ops.h      2004-08-05 21:20:53.110654735 +0400
97571 @@ -0,0 +1,131 @@
97572 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
97573 + * reiser4/README */
97574 +
97575 +/* vfs_ops.c's exported symbols */
97576 +
97577 +#if !defined( __FS_REISER4_VFS_OPS_H__ )
97578 +#define __FS_REISER4_VFS_OPS_H__
97579 +
97580 +#include "forward.h"
97581 +#include "coord.h"
97582 +#include "seal.h"
97583 +#include "type_safe_list.h"
97584 +#include "plugin/dir/dir.h"
97585 +#include "plugin/file/file.h"
97586 +#include "super.h"
97587 +#include "readahead.h"
97588 +
97589 +#include <linux/types.h>       /* for loff_t */
97590 +#include <linux/fs.h>          /* for struct address_space */
97591 +#include <linux/dcache.h>      /* for struct dentry */
97592 +#include <linux/mm.h>
97593 +#include <linux/backing-dev.h>
97594 +
97595 +extern int reiser4_mark_inode_dirty(struct inode *object);
97596 +extern int reiser4_update_sd(struct inode *object);
97597 +extern int reiser4_add_nlink(struct inode *, struct inode *, int);
97598 +extern int reiser4_del_nlink(struct inode *, struct inode *, int);
97599 +
97600 +extern struct file_operations reiser4_file_operations;
97601 +extern struct inode_operations reiser4_inode_operations;
97602 +extern struct inode_operations reiser4_symlink_inode_operations;
97603 +extern struct inode_operations reiser4_special_inode_operations;
97604 +extern struct super_operations reiser4_super_operations;
97605 +extern struct export_operations reiser4_export_operations;
97606 +extern struct address_space_operations reiser4_as_operations;
97607 +extern struct dentry_operations reiser4_dentry_operations;
97608 +extern int reiser4_invalidatepage(struct page *page, unsigned long offset);
97609 +extern int reiser4_releasepage(struct page *page, int gfp);
97610 +extern int reiser4_writepages(struct address_space *, struct writeback_control *wbc);
97611 +extern int reiser4_start_up_io(struct page *page);
97612 +extern void move_inode_out_from_sync_inodes_loop(struct address_space * mapping);
97613 +extern void reiser4_clear_page_dirty(struct page *);
97614 +
97615 +/*
97616 + * this is used to speed up lookups for directory entry: on initial call to
97617 + * ->lookup() seal and coord of directory entry (if found, that is) are stored
97618 + * in struct dentry and reused later to avoid tree traversals.
97619 + */
97620 +typedef struct de_location {
97621 +       /* seal covering directory entry */
97622 +       seal_t entry_seal;
97623 +       /* coord of directory entry */
97624 +       coord_t entry_coord;
97625 +       /* ordinal number of directory entry among all entries with the same
97626 +          key. (Starting from 0.) */
97627 +       int pos;
97628 +} de_location;
97629 +
97630 +/* &reiser4_dentry_fsdata - reiser4-specific data attached to dentries.
97631 +
97632 +   This is allocated dynamically and released in d_op->d_release()
97633 +
97634 +   Currently it contains cached location (hint) of directory entry and the
97635 +   "stateless" flag; it is expected that other information will be added.
97636 +*/
97637 +typedef struct reiser4_dentry_fsdata {
97638 +       /* here will go fields filled by ->lookup() to speedup next
97639 +          create/unlink, like blocknr of znode with stat-data, or key
97640 +          of stat-data.
97641 +       */
97642 +       de_location dec;
97643 +       int stateless; /* created through reiser4_decode_fh, needs special
97644 +                       * treatment in readdir. */
97645 +} reiser4_dentry_fsdata;
97646 +
97647 +/* declare data types and manipulation functions for readdir list. */
97648 +TYPE_SAFE_LIST_DECLARE(readdir);
97649 +
97650 +struct dir_cursor;
97651 +
97652 +/* &reiser4_file_fsdata - reiser4-specific data attached to files.
97653 +
97654 +   This is allocated dynamically and released in reiser4_release() */
97655 +struct reiser4_file_fsdata {
97656 +       /* pointer back to the struct file which this reiser4_file_fsdata is
97657 +        * part of */
97658 +       struct file *back;
97659 +       /* detached cursor for stateless readdir. */
97660 +       struct dir_cursor *cursor;
97661 +       /* We need both directory and regular file parts here, because there
97662 +          are file system objects that are files and directories. */
97663 +       struct {
97664 +               readdir_pos readdir;
97665 +               readdir_list_link linkage;
97666 +       } dir;
97667 +       /* hints to speed up operations with regular files: read and write. */
97668 +       struct {
97669 +               hint_t hint;
97670 +               /* this is set by read_extent before calling
97671 +                * page_cache_readahead */
97672 +               void *coord;
97673 +       } reg;
97674 +       struct reiser4_file_ra_state ra;
97675 +};
97676 +
97677 +TYPE_SAFE_LIST_DEFINE(readdir, reiser4_file_fsdata, dir.linkage);
97678 +
97679 +extern reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry);
97680 +extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *f);
97681 +extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
97682 +extern void reiser4_free_file_fsdata(struct file *f);
97683 +extern void reiser4_free_fsdata(reiser4_file_fsdata *fsdata);
97684 +
97685 +extern reiser4_file_fsdata *create_fsdata(struct file *file, int gfp);
97686 +
97687 +extern void reiser4_handle_error(void);
97688 +extern int reiser4_parse_options (struct super_block *, char *);
97689 +
97690 +/* __FS_REISER4_VFS_OPS_H__ */
97691 +#endif
97692 +
97693 +/* Make Linus happy.
97694 +   Local variables:
97695 +   c-indentation-style: "K&R"
97696 +   mode-name: "LC"
97697 +   c-basic-offset: 8
97698 +   tab-width: 8
97699 +   fill-column: 120
97700 +   scroll-step: 1
97701 +   End:
97702 +*/
97703 diff -rupN linux-2.6.8-rc3/fs/reiser4/wander.c linux-2.6.8-rc3-a/fs/reiser4/wander.c
97704 --- linux-2.6.8-rc3/fs/reiser4/wander.c 1970-01-01 03:00:00.000000000 +0300
97705 +++ linux-2.6.8-rc3-a/fs/reiser4/wander.c       2004-08-05 21:20:53.371599695 +0400
97706 @@ -0,0 +1,2239 @@
97707 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
97708 + * reiser4/README */
97709 +
97710 +/* Reiser4 Wandering Log */
97711 +
97712 +/* You should read http://www.namesys.com/txn-doc.html
97713 +
97714 +   That describes how filesystem operations are performed as atomic
97715 +   transactions, and how we try to arrange it so that we can write most of the
97716 +   data only once while performing the operation atomically.
97717 +
97718 +   For the purposes of this code, it is enough for it to understand that it
97719 +   has been told a given block should be written either once, or twice (if
97720 +   twice then once to the wandered location and once to the real location).
97721 +
97722 +   This code guarantees that those blocks that are defined to be part of an
97723 +   atom either all take effect or none of them take effect.
97724 +
97725 +   Relocate set nodes are submitted to write by the jnode_flush() routine, and
97726 +   the overwrite set is submitted by reiser4_write_log().  This is because with
97727 +   the overwrite set we seek to optimize writes, and with the relocate set we
97728 +   seek to cause disk order to correlate with the parent first pre-order.
97729 +
97730 +   reiser4_write_log() allocates and writes wandered blocks and maintains
97731 +   additional on-disk structures of the atom as wander records (each wander
97732 +   record occupies one block) for storing of the "wandered map" (a table which
97733 +   contains a relation between wandered and real block numbers) and other
97734 +   information which might be needed at transaction recovery time.
97735 +
97736 +   The wander records are unidirectionally linked into a circle: each wander
97737 +   record contains a block number of the next wander record, the last wander
97738 +   record points to the first one.
97739 +
97740 +   One wander record (named "tx head" in this file) has a format which is
97741 +   different from the other wander records. The "tx head" has a reference to the
97742 +   "tx head" block of the previously committed atom.  Also, "tx head" contains
97743 +   fs information (the free blocks counter, and the oid allocator state) which
97744 +   is logged in a special way.
97745 +
97746 +   There are two journal control blocks, named journal header and journal
97747 +   footer which have fixed on-disk locations.  The journal header has a
97748 +   reference to the "tx head" block of the last committed atom.  The journal
97749 +   footer points to the "tx head" of the last flushed atom.  The atom is
97750 +   "played" when all blocks from its overwrite set are written to disk the
97751 +   second time (i.e. written to their real locations).
97752 +
97753 +   NOTE: People who know reiserfs internals and its journal structure might be
97754 +   confused by the terms journal footer and journal header. Here is a table of
97755 +   terms with similar semantics in reiserfs (reiser3) and reiser4:
97756 +
97757 +   REISER3 TERM        |  REISER4 TERM         | DESCRIPTION
97758 +   --------------------+-----------------------+----------------------------
97759 +   commit record       |  journal header       | atomic write of this record
97760 +                       |                       | ends transaction commit
97761 +   --------------------+-----------------------+----------------------------
97762 +   journal header      |  journal footer       | atomic write of this record
97763 +                       |                       | ends post-commit writes.
97764 +                       |                       | After this record is
97765 +                       |                       | successfully written,
97766 +                       |                       | journal blocks (in reiser3)
97767 +                       |                       | or wandered blocks/records
97768 +                       |                       | are free for re-use.
97769 +   --------------------+-----------------------+----------------------------
97770 +
97771 +   The atom commit process is the following:
97772 +
97773 +   1. The overwrite set is taken from atom's clean list, and its size is
97774 +      counted.
97775 +
97776 +   2. The number of necessary wander records (including tx head) is calculated,
97777 +      and the wander record blocks are allocated.
97778 +
97779 +   3. Allocate wandered blocks and populate wander records by wandered map.
97780 +
97781 +   4. Submit write requests for wander records and wandered blocks.
97782 +
97783 +   5. Wait until submitted write requests complete.
97784 +
97785 +   6. Update journal header: change the pointer to the block number of just
97786 +   written tx head, submit an i/o for modified journal header block and wait
97787 +   for i/o completion.
97788 +
97789 +   NOTE: The special logging for bitmap blocks and some reiser4 super block
97790 +   fields makes processes of atom commit, flush and recovering a bit more
97791 +   complex (see comments in the source code for details).
97792 +
97793 +   The atom playing process is the following:
97794 +
97795 +   1. Write atom's overwrite set in-place.
97796 +
97797 +   2. Wait on i/o.
97798 +
97799 +   3. Update journal footer: change the pointer to block number of tx head
97800 +   block of the atom we are currently flushing, submit an i/o, wait on i/o
97801 +   completion.
97802 +
97803 +   4. Free disk space which was used for wandered blocks and wander records.
97804 +
97805 +   After the freeing of wandered blocks and wander records the journal
97806 +   footer points to an on-disk structure which might be overwritten soon.
97807 +   Neither the log writer nor the journal recovery procedure uses that pointer
97808 +   for accessing the data.  When the journal recovery procedure looks for the
97809 +   oldest transaction it compares the journal footer pointer value with the
97810 +   "prev_tx" pointer value in tx head; if the values are equal, the oldest not
97811 +   flushed transaction is found.
97812 +
97813 +   NOTE on disk space leakage: the information about which blocks and how many
97814 +   blocks are allocated for wandered blocks and wander records is not written to
97815 +   the disk because of special logging for bitmaps and some super block
97816 +   counters.  After a system crash reiser4 does not remember the allocation of
97817 +   those objects, thus there is no disk space leakage of this kind.
97818 +*/
97819 +
97820 +/* Special logging of reiser4 super block fields. */
97821 +
97822 +/* There are some reiser4 super block fields (free block count and OID allocator
97823 +   state (number of files and next free OID)) which are logged separately from
97824 +   super block to avoid unnecessary atom fusion.
97825 +
97826 +   So, the reiser4 super block need not be captured by a transaction which
97827 +   allocates/deallocates disk blocks or creates/deletes file objects.  Moreover,
97828 +   the reiser4 on-disk super block is not touched when such a transaction is
97829 +   committed and flushed.  Those "counters logged specially" are logged in "tx
97830 +   head" blocks and in the journal footer block.
97831 +
97832 +   A step-by-step description of special logging:
97833 +
97834 +   0. The per-atom information about deleted or created files and allocated or
97835 +   freed blocks is collected during the transaction.  The atom's
97836 +   ->nr_objects_created and ->nr_objects_deleted are for object
97837 +   deletion/creation tracking, the numbers of allocated and freed blocks are
97838 +   calculated using atom's delete set and atom's capture list -- all new and
97839 +   relocated nodes should be on atom's clean list and should have JNODE_RELOC
97840 +   bit set.
97841 +
97842 +   1. The "logged specially" reiser4 super block fields have their "committed"
97843 +   versions in the reiser4 in-memory super block.  They get modified only at
97844 +   atom commit time.  The atom's commit thread has an exclusive access to those
97845 +   "committed" fields because the log writer implementation supports only one
97846 +   atom commit at a time (there is a per-fs "commit" semaphore).  At
97847 +   that time "committed" counters are modified using per-atom information
97848 +   collected during the transaction. These counters are stored on disk as a
97849 +   part of tx head block when atom is committed.
97850 +
97851 +   2. When the atom is flushed the value of the free block counter and the OID
97852 +   allocator state get written to the journal footer block.  A special journal
97853 +   procedure (journal_recover_sb_data()) takes those values from the journal
97854 +   footer and updates the reiser4 in-memory super block.
97855 +
97856 +   NOTE: That means free block count and OID allocator state are logged
97857 +   separately from the reiser4 super block regardless of the fact that the
97858 +   reiser4 super block has fields to store both the free block counter and the
97859 +   OID allocator.
97860 +
97861 +   Writing the whole super block at commit time requires knowing true values of
97862 +   all its fields without changes made by not yet committed transactions. It is
97863 +   possible by having a "committed" version of the super block, just like the
97864 +   reiser4 bitmap blocks have "committed" and "working" versions.  However,
97865 +   another scheme was implemented which stores special logged values in the
97866 +   unused free space inside transaction head block.  In my opinion it has an
97867 +   advantage of not writing whole super block when only part of it was
97868 +   modified. */
97869 +
97870 +#include "debug.h"
97871 +#include "dformat.h"
97872 +#include "txnmgr.h"
97873 +#include "jnode.h"
97874 +#include "znode.h"
97875 +#include "block_alloc.h"
97876 +#include "page_cache.h"
97877 +#include "wander.h"
97878 +#include "reiser4.h"
97879 +#include "super.h"
97880 +#include "vfs_ops.h"
97881 +#include "writeout.h"
97882 +#include "inode.h"
97883 +#include "entd.h"
97884 +
97885 +#include <linux/types.h>
97886 +#include <linux/fs.h>          /* for struct super_block  */
97887 +#include <linux/mm.h>          /* for struct page */
97888 +#include <linux/pagemap.h>
97889 +#include <linux/bio.h>         /* for struct bio */
97890 +#include <linux/blkdev.h>
97891 +/* forward declaration; arguments: head, first, nr, block_p, fq, flags */
97892 +static int write_jnodes_to_disk_extent(
97893 +       capture_list_head * head, jnode *, int, const reiser4_block_nr *, flush_queue_t *, int );
97894 +
97895 +/* The commit_handle collects the objects needed while an atom is committed  */
97896 +struct commit_handle {
97897 +       /* atom's overwrite (OVRWR) list, set up by get_overwrite_set() */
97898 +       capture_list_head * overwrite_set;
97899 +       /* number of nodes counted in the atom's overwrite set */
97900 +       int overwrite_set_size;
97901 +       /* jnodes of wander record blocks, the front one is the tx head */
97902 +       capture_list_head tx_list;
97903 +       /* number of wander records, tx head included (see get_tx_size()) */
97904 +       int tx_size;
97905 +       /* 'committed' sb counters are saved here until atom is completely
97906 +          flushed; they are stored in the tx head and the journal footer */
97907 +       __u64 free_blocks;
97908 +       __u64 nr_files;
97909 +       __u64 next_oid;
97910 +       /* the atom which is being committed */
97911 +       txn_atom *atom;
97912 +       /* the current super block (from reiser4_get_current_sb()) */
97913 +       struct super_block *super;
97914 +       /* The counter of modified bitmaps (JNODE_BITMAP overwrite set nodes) */
97915 +       reiser4_block_nr nr_bitmap;
97916 +};
97917 +
97918 +static void
97919 +init_commit_handle(struct commit_handle *ch, txn_atom * atom)
97920 +{
97921 +       /* everything not set explicitly below starts out as zero */
97922 +       xmemset(ch, 0, sizeof *ch);
97923 +       ch->super = reiser4_get_current_sb();
97924 +       ch->atom = atom;
97925 +       capture_list_init(&ch->tx_list);
97926 +}
97927 +
97928 +static void
97929 +done_commit_handle(struct commit_handle *ch UNUSED_ARG)
97930 +{       /* just a check: no wander record jnodes may be left on ->tx_list */
97931 +       assert("zam-690", capture_list_empty(&ch->tx_list));
97932 +}
97933 +
97934 +/* Store the block number of the committing atom's tx head (the front of
97935 +   ch->tx_list) in the journal header block as ->last_committed_tx.
97936 +   NOTE(review): the result of jload() is not checked here. */
97937 +static void
97938 +format_journal_header(struct commit_handle *ch)
97939 +{
97940 +       struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
97941 +       struct journal_header *header;
97942 +       jnode *header_node;
97943 +       jnode *first_record;
97944 +
97945 +       assert("zam-479", sbinfo != NULL);
97946 +       assert("zam-480", sbinfo->journal_header != NULL);
97947 +       header_node = sbinfo->journal_header;
97948 +       first_record = capture_list_front(&ch->tx_list);
97949 +
97950 +       /* header block data is only touched between jload() and jrelse() */
97951 +       jload(header_node);
97952 +       header = (struct journal_header *) jdata(header_node);
97953 +       assert("zam-484", header != NULL);
97954 +       cputod64(*jnode_get_block(first_record), &header->last_committed_tx);
97955 +       jrelse(header_node);
97956 +}
97957 +
97958 +/* Fill journal footer block data: the block number of the tx head of the
97959 +   atom being flushed (->last_flushed_tx) and the "committed" super block
97960 +   counters (free blocks, number of files, next oid) saved in @ch. */
97961 +static void
97962 +format_journal_footer(struct commit_handle *ch)
97963 +{
97964 +       struct reiser4_super_info_data *sbinfo;
97965 +       struct journal_footer *footer;
97966 +       jnode *tx_head;
97967 +
97968 +       sbinfo = get_super_private(ch->super);
97969 +       tx_head = capture_list_front(&ch->tx_list);
97970 +
97971 +       assert("zam-493", sbinfo != NULL);
97972 +       /* it is the footer, not the header, which is loaded and filled here */
97973 +       assert("zam-494", sbinfo->journal_footer != NULL);
97974 +
97975 +       check_me("zam-691", jload(sbinfo->journal_footer) == 0);
97976 +
97977 +       footer = (struct journal_footer *) jdata(sbinfo->journal_footer);
97978 +       assert("zam-495", footer != NULL);
97979 +
97980 +       cputod64(*jnode_get_block(tx_head), &footer->last_flushed_tx);
97981 +       cputod64(ch->free_blocks, &footer->free_blocks);
97982 +       cputod64(ch->nr_files, &footer->nr_files);
97983 +       cputod64(ch->next_oid, &footer->next_oid);
97984 +
97985 +       jrelse(sbinfo->journal_footer);
97986 +}
97987 +
97988 +/* number of wander entries which fit into one block after the record header */
97989 +static int
97990 +wander_record_capacity(const struct super_block *super)
97991 +{
97992 +       return (super->s_blocksize - sizeof (struct wander_record_header)) / sizeof (struct wander_entry);
97993 +}
97994 +
97995 +/* Format the tx head, which is the first wander record on ch->tx_list: zero
97996 +   the block, put the magic in and fill the header from the data in @ch. */
97997 +static void
97998 +format_tx_head(struct commit_handle *ch)
97999 +{
98000 +       struct tx_header *header;
98001 +       jnode *tx_head;
98002 +       jnode *successor;
98003 +
98004 +       tx_head = capture_list_front(&ch->tx_list);
98005 +       assert("zam-692", !capture_list_end(&ch->tx_list, tx_head));
98006 +       header = (struct tx_header *) jdata(tx_head);
98007 +       assert("zam-460", header != NULL);
98008 +       assert("zam-462", ch->super->s_blocksize >= sizeof (struct tx_header));
98009 +
98010 +       /* a tx head which is alone on the list refers to itself as next */
98011 +       successor = capture_list_next(tx_head);
98012 +       if (capture_list_end(&ch->tx_list, successor))
98013 +               successor = tx_head;
98014 +
98015 +       /* start from a zeroed block which carries the tx header magic */
98016 +       xmemset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
98017 +       xmemcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
98018 +       cputod32((__u32) ch->tx_size, &header->total);
98019 +       cputod64(*jnode_get_block(successor), &header->next_block);
98020 +       cputod64(get_super_private(ch->super)->last_committed_tx, &header->prev_tx);
98021 +       /* the "committed" super block counters saved in the handle */
98022 +       cputod64(ch->free_blocks, &header->free_blocks);
98023 +       cputod64(ch->nr_files, &header->nr_files);
98024 +       cputod64(ch->next_oid, &header->next_oid);
98025 +}
98026 +
98027 +/* Prepare an ordinary wander record block: zero it, put the magic in and fill
98028 +   the service fields (total number of records, serial number, next block). */
98029 +static void
98030 +format_wander_record(struct commit_handle *ch, jnode * node, int serial)
98031 +{
98032 +       struct wander_record_header *LRH;
98033 +       jnode *successor;
98034 +
98035 +       assert("zam-464", node != NULL);
98036 +       LRH = (struct wander_record_header *) jdata(node);
98037 +       assert("zam-465", LRH != NULL);
98038 +       assert("zam-463", ch->super->s_blocksize > sizeof (struct wander_record_header));
98039 +
98040 +       /* the last record on the list refers back to the front of the list */
98041 +       successor = capture_list_next(node);
98042 +       if (capture_list_end(&ch->tx_list, successor))
98043 +               successor = capture_list_front(&ch->tx_list);
98044 +
98045 +       xmemset(jdata(node), 0, (size_t) ch->super->s_blocksize);
98046 +       xmemcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
98047 +
98048 +       /* a transaction id is not stored here (that code was commented out) */
98049 +       cputod32((__u32) serial, &LRH->serial);
98050 +       cputod32((__u32) ch->tx_size, &LRH->total);
98051 +       cputod64((__u64) * jnode_get_block(successor), &LRH->next_block);
98052 +}
98053 +
98054 +/* Store the pair (*a, *b) as ->original and ->wandered of the entry number
98055 +   @index in the formatted wander record @node. */
98056 +static void
98057 +store_entry(jnode * node, int index, const reiser4_block_nr * a, const reiser4_block_nr * b)
98058 +{
98059 +       struct wander_entry *slot;
98060 +       char *data = jdata(node);
98061 +
98062 +       assert("zam-451", data != NULL);
98063 +       /* the array of entries starts right after the record header */
98064 +       slot = (struct wander_entry *) (data + sizeof (struct wander_record_header));
98065 +       slot += index;
98066 +       cputod64(*a, &slot->original);
98067 +       cputod64(*b, &slot->wandered);
98068 +}
98069 +
98070 +/* Compute ch->tx_size.  Currently wander records contain only the wandered
98071 +   map, so their number depends on the overwrite set size alone. */
98072 +static void
98073 +get_tx_size(struct commit_handle *ch)
98074 +{
98075 +       int per_record = wander_record_capacity(ch->super);
98076 +
98077 +       assert("zam-440", ch->overwrite_set_size != 0);
98078 +       assert("zam-695", ch->tx_size == 0);
98079 +
98080 +       /* ordinary records: (size - 1) / capacity + 1, plus one for tx head */
98081 +       ch->tx_size = (ch->overwrite_set_size - 1) / per_record + 2;
98082 +}
98083 +
98084 +/* The state which store_wmap_actor() keeps between its calls; a pointer to
98085 +   this structure is passed to the actor as its @data argument */
98086 +struct store_wmap_params {
98087 +       jnode *cur;             /* jnode of current wander record to fill */
98088 +       int idx;                /* free element index in wander record  */
98089 +       int capacity;           /* entries per record; idx is reset on reaching it */
98090 +
98091 +#if REISER4_DEBUG
98092 +       capture_list_head *tx_list;     /* list of records, for an assertion */
98093 +#endif
98094 +};
98095 +
98096 +/* An actor for blocknr_set_iterator(): puts the pair (*a, *b) into the next
98097 +   free slot of the pre-formatted wander records.  @data is a pointer to
98098 +   struct store_wmap_params.  Always returns 0. */
98099 +static int
98100 +store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a, const reiser4_block_nr * b, void *data)
98101 +{
98102 +       struct store_wmap_params *params = data;
98103 +       int record_is_full = (params->idx >= params->capacity);
98104 +
98105 +       if (record_is_full) {
98106 +               /* go on with the first slot of the next record on tx_list */
98107 +               params->idx = 0;
98108 +               params->cur = capture_list_next(params->cur);
98109 +               assert("zam-454", !capture_list_end(params->tx_list, params->cur));
98110 +       }
98111 +
98112 +       store_entry(params->cur, params->idx++, a, b);
98113 +
98114 +       return 0;
98115 +}
98116 +
98117 +/* This function is called after the Relocate set is written to disk, the
98118 +   Overwrite set is written to wandered locations and all wander records are
98119 +   written too.  The updated journal header block contains a pointer (block
98120 +   number) to the first wander record of the just written transaction.
98121 +   Returns 0 on success or the error of the write submission or of the wait. */
98122 +static int
98123 +update_journal_header(struct commit_handle *ch)
98124 +{
98125 +       struct reiser4_super_info_data *sbinfo;
98126 +       jnode *header_node;
98127 +       jnode *tx_head;
98128 +       int result;
98129 +
98130 +       sbinfo = get_super_private(ch->super);
98131 +       header_node = sbinfo->journal_header;
98132 +       tx_head = capture_list_front(&ch->tx_list);
98133 +
98134 +       format_journal_header(ch);
98135 +
98136 +       /* write the single journal header block to its own location */
98137 +       result = write_jnodes_to_disk_extent(&ch->tx_list, header_node, 1,
98138 +                                            jnode_get_block(header_node), NULL, 0);
98139 +       if (result == 0) {
98140 +               blk_run_address_space(sbinfo->fake->i_mapping);
98141 +               result = jwait_io(header_node, WRITE);
98142 +       }
98143 +
98144 +       /* remember the new last committed tx only after a successful wait */
98145 +       if (result == 0)
98146 +               sbinfo->last_committed_tx = *jnode_get_block(tx_head);
98147 +
98148 +       return result;
98149 +}
98150 +
98151 +/* This function is called after write-back is finished.  It formats the
98152 +   journal footer block, submits it for writing and waits for the i/o.  The
98153 +   blocks occupied by wandered blocks and wander records are not freed here.
98154 +   Returns 0 on success or the error of the write submission or of the wait. */
98155 +static int
98156 +update_journal_footer(struct commit_handle *ch)
98157 +{
98158 +       reiser4_super_info_data *sbinfo;
98159 +       jnode *footer_node;
98160 +       int result;
98161 +
98162 +       sbinfo = get_super_private(ch->super);
98163 +       footer_node = sbinfo->journal_footer;
98164 +
98165 +       format_journal_footer(ch);
98166 +
98167 +       /* write the single journal footer block to its own location */
98168 +       result = write_jnodes_to_disk_extent(&ch->tx_list, footer_node, 1,
98169 +                                            jnode_get_block(footer_node), NULL, 0);
98170 +       if (result != 0)
98171 +               return result;
98172 +
98173 +       blk_run_address_space(sbinfo->fake->i_mapping);
98174 +
98175 +       /* the result of the wait is the result of the whole update */
98176 +       return jwait_io(footer_node, WRITE);
98177 +}
98178 +
98179 +/* free blocks and jnodes of wander records of a transaction written in place */
98180 +static void
98181 +dealloc_tx_list(struct commit_handle *ch)
98182 +{
98183 +       jnode *record;
98184 +
98185 +       while (!capture_list_empty(&ch->tx_list)) {
98186 +               record = capture_list_pop_front(&ch->tx_list);
98187 +               ON_DEBUG(capture_list_clean(record));
98188 +               reiser4_dealloc_block(jnode_get_block(record), BLOCK_NOT_COUNTED, BA_FORMATTED);
98189 +               unpin_jnode_data(record);
98190 +               drop_io_head(record);
98191 +       }
98192 +}
98193 +
98194 +/* An actor for use in blocknr_set_iterator() routine which frees the wandered
98195 +   block @b of one wandered map entry.  Always returns 0. */
98196 +static int
98197 +dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
98198 +                  const reiser4_block_nr * a UNUSED_ARG, const reiser4_block_nr * b, void *data UNUSED_ARG)
98199 +{
98200 +       /* only @b is used; it must be a real (not fake), non-zero block number */
98201 +       assert("zam-499", b != NULL);
98202 +       assert("zam-500", *b != 0);
98203 +       assert("zam-501", !blocknr_is_fake(b));
98204 +
98205 +       reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
98206 +       return 0;
98207 +}
98208 +
98209 +/* free wandered block locations of an already written in place transaction */
98210 +static void
98211 +dealloc_wmap(struct commit_handle *ch)
98212 +{
98213 +       assert("zam-696", ch->atom != NULL);
98214 +       /* NOTE(review): last argument (1) presumably deletes set entries -- confirm */
98215 +       blocknr_set_iterator(ch->atom, &ch->atom->wandered_map, dealloc_wmap_actor, NULL, 1);
98216 +}
98217 +
98218 +/* Helper for the allocation of wandered blocks: asks the block allocator for
98219 +   @count blocks, stores the first block number in *@start and the length
98220 +   the allocator reports back in *@len.  Returns the allocator's result. */
98221 +static int
98222 +get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
98223 +{
98224 +       reiser4_block_nr nr_blocks;
98225 +       reiser4_blocknr_hint hint;
98226 +       int result;
98227 +
98228 +       /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
98229 +          ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
98230 +          reserved allocation area so as to get the best qualities of fixed
98231 +          journals? */
98232 +       blocknr_hint_init(&hint);
98233 +       hint.block_stage = BLOCK_GRABBED;
98234 +
98235 +       /* the allocator works with a wide counter, *len is a plain int */
98236 +       nr_blocks = count;
98237 +       result = reiser4_alloc_blocks(&hint, start, &nr_blocks,
98238 +                                     BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
98239 +       *len = (int) nr_blocks;
98240 +       return result;
98241 +}
98242 +
98243 +/*
98244 + * roll back changes made before issuing BIO in the case of IO error.
98245 + */
98246 +static void
98247 +undo_bio(struct bio *bio)
98248 +{
98249 +       int i;
98250 +
98251 +       for (i = 0; i < bio->bi_vcnt; ++i) {
98252 +               struct page *pg;
98253 +               jnode *node;
98254 +
98255 +               pg = bio->bi_io_vec[i].bv_page;
98256 +               ClearPageWriteback(pg);
98257 +               node = jprivate(pg);
98258 +               LOCK_JNODE(node);
98259 +               JF_CLR(node, JNODE_WRITEBACK);
98260 +               JF_SET(node, JNODE_DIRTY);
98261 +               UNLOCK_JNODE(node);
98262 +       }
98263 +       bio_put(bio);
98264 +}
98265 +
98266 +#if REISER4_COPY_ON_CAPTURE
98267 +
98268 +extern spinlock_t scan_lock;
98269 +
98270 +/* undo the jload of get_overwrite_set(): jrelse_tail() each overwrite set node */
98271 +static void put_overwrite_set(struct commit_handle * ch)
98272 +{
98273 +       jnode * cur;
98274 +       /* JNODE_SCANNED is set on @cur while scan_lock is dropped for the release */
98275 +       spin_lock(&scan_lock);
98276 +       cur = capture_list_front(ch->overwrite_set);
98277 +       while (!capture_list_end(ch->overwrite_set, cur)) {
98278 +               assert("vs-1443", NODE_LIST(cur) == OVRWR_LIST);
98279 +               JF_SET(cur, JNODE_SCANNED);
98280 +               spin_unlock(&scan_lock);
98281 +               JF_CLR(cur, JNODE_JLOADED_BY_GET_OVERWRITE_SET);
98282 +               jrelse_tail(cur);
98283 +               spin_lock(&scan_lock);
98284 +               JF_CLR(cur, JNODE_SCANNED);
98285 +               cur = capture_list_next(cur);
98286 +       }
98287 +       spin_unlock(&scan_lock);
98288 +}
98289 +
98290 +/* Count overwrite set size and grab disk space for wandered blocks allocation.
98291 +   The atom's overwrite list is scanned: nodes are jload()-ed and counted,
98292 +   bitmap and other not leaf nodes are those which wandered blocks allocation
98293 +   we have to grab space for.  Returns the set size or an error code. */
98294 +static int
98295 +get_overwrite_set(struct commit_handle *ch)
98296 +{
98297 +       int ret;
98298 +       jnode *cur;
98299 +       __u64 nr_not_leaves = 0;
98300 +#if REISER4_DEBUG
98301 +       __u64 nr_formatted_leaves = 0;
98302 +       __u64 nr_unformatted_leaves = 0;
98303 +#endif
98304 +
98305 +
98306 +       assert("zam-697", ch->overwrite_set_size == 0);
98307 +
98308 +       ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
98309 +       /* JNODE_SCANNED marks cur and next while scan_lock is dropped below */
98310 +       spin_lock(&scan_lock);
98311 +       cur = capture_list_front(ch->overwrite_set);
98312 +
98313 +       while (!capture_list_end(ch->overwrite_set, cur)) {
98314 +               jnode *next;
98315 +
98316 +               /* FIXME: for all but first this bit is set already */
98317 +               assert("vs-1444", NODE_LIST(cur) == OVRWR_LIST);
98318 +               JF_SET(cur, JNODE_SCANNED);
98319 +               next = capture_list_next(cur);
98320 +               if (!capture_list_end(ch->overwrite_set, next))
98321 +                       JF_SET(next, JNODE_SCANNED);
98322 +               spin_unlock(&scan_lock);
98323 +
98324 +               if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
98325 +                       ON_TRACE(TRACE_LOG, "fake znode found , WANDER=(%d)\n", JF_ISSET(cur, JNODE_OVRWR));
98326 +               }
98327 +
98328 +               /* Count bitmap nodes for getting correct statistics what number
98329 +                * of blocks were cleared by the transaction commit. */
98330 +               if (jnode_get_type(cur) == JNODE_BITMAP)
98331 +                       ch->nr_bitmap ++;
98332 +
98333 +               assert("zam-939", JF_ISSET(cur, JNODE_OVRWR) || jnode_get_type(cur) == JNODE_BITMAP);
98334 +
98335 +               if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
98336 +                       /* we replace fake znode by another (real)
98337 +                          znode which is suggested by disk_layout
98338 +                          plugin */
98339 +
98340 +                       /* FIXME: it looks like fake znode should be
98341 +                          replaced by jnode supplied by
98342 +                          disk_layout. */
98343 +
98344 +                       struct super_block *s = reiser4_get_current_sb();
98345 +                       reiser4_super_info_data *sbinfo = get_current_super_private();
98346 +
98347 +                       if (sbinfo->df_plug->log_super) {
98348 +                               jnode *sj = sbinfo->df_plug->log_super(s);
98349 +
98350 +                               assert("zam-593", sj != NULL);
98351 +                               /* NOTE(review): JNODE_SCANNED stays set on this return */
98352 +                               if (IS_ERR(sj))
98353 +                                       return PTR_ERR(sj);
98354 +
98355 +                               LOCK_ATOM(ch->atom);
98356 +                               LOCK_JNODE(sj);
98357 +                               JF_SET(sj, JNODE_OVRWR);
98358 +                               insert_into_atom_ovrwr_list(ch->atom, sj);
98359 +                               UNLOCK_JNODE(sj);
98360 +                               UNLOCK_ATOM(ch->atom);
98361 +
98362 +                               /* jload it as the rest of overwrite set; result is unchecked */
98363 +                               jload_gfp(sj, GFP_KERNEL, 0);
98364 +
98365 +                               ch->overwrite_set_size++;
98366 +                       }
98367 +                       LOCK_ATOM(ch->atom);
98368 +                       LOCK_JNODE(cur);
98369 +                       uncapture_block(cur);
98370 +                       UNLOCK_ATOM(ch->atom);
98371 +                       jput(cur);
98372 +
98373 +                       spin_lock(&scan_lock);
98374 +                       JF_CLR(cur, JNODE_SCANNED);
98375 +                       cur = next;
98376 +                       nr_not_leaves ++;
98377 +               } else {
98378 +                       int ret;
98379 +                       ch->overwrite_set_size++;
98380 +                       ret = jload_gfp(cur, GFP_KERNEL, 0);
98381 +                       if (ret)
98382 +                               reiser4_panic("zam-783", "cannot load e-flushed jnode back (ret = %d)\n", ret);
98383 +
98384 +                       /* Count not leaves here because we have to grab disk
98385 +                        * space for wandered blocks. They were not counted as
98386 +                        * "flush reserved". This should be done after doing
98387 +                        * jload() to avoid races with emergency
98388 +                        * flush. Counting should be done _after_ nodes are
98389 +                        * pinned into memory by jload(). */
98390 +                       if (!jnode_is_leaf(cur))
98391 +                               nr_not_leaves ++;
98392 +                       /* this is to check atom's flush reserved space for
98393 +                        * overwritten leaves */
98394 +                       else {
98395 +#if REISER4_DEBUG
98396 +                               /* at this point @cur either has
98397 +                                * JNODE_FLUSH_RESERVED or is
98398 +                                * eflushed. Locking is not strong enough to
98399 +                                * write an assertion checking for this. */
98400 +                               if (jnode_is_znode(cur))
98401 +                                       nr_formatted_leaves ++;
98402 +                               else
98403 +                                       nr_unformatted_leaves ++;
98404 +#endif
98405 +                               JF_CLR(cur, JNODE_FLUSH_RESERVED);
98406 +                       }
98407 +                       spin_lock(&scan_lock);
98408 +                       JF_SET(cur, JNODE_JLOADED_BY_GET_OVERWRITE_SET);
98409 +                       assert("", cur->pg);
98410 +                       JF_CLR(cur, JNODE_SCANNED);
98411 +                       cur = next;
98412 +               }
98413 +
98414 +       }
98415 +       spin_unlock(&scan_lock);
98416 +
98417 +       /* Grab space for writing (wandered blocks) of not leaves found in
98418 +        * overwrite set. */
98419 +       ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
98420 +       if (ret)
98421 +               return ret;
98422 +
98423 +       /* Disk space for allocation of wandered blocks of leaf nodes already
98424 +        * reserved as "flush reserved", move it to grabbed space counter. */
98425 +       spin_lock_atom(ch->atom);
98426 +       assert("zam-940", nr_formatted_leaves + nr_unformatted_leaves <= ch->atom->flush_reserved);
98427 +       flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
98428 +       spin_unlock_atom(ch->atom);
98429 +
98430 +       return ch->overwrite_set_size;
98431 +}
98432 +
98433 +/* Submit a write request for @nr jnodes beginning from the @first, other
98434 +   jnodes are after the @first on the double-linked "capture" list.  All
98435 +   jnodes will be written to the disk region of @nr blocks starting with
98436 +   @block_p block number.  If @fq is not NULL it means that waiting for i/o
98437 +   completion will be done more efficiently by using flush_queue_t objects
98438 +
98439 +ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
98440 +aggregated in this function instead of being left to the layers below
98441 +
98442 +FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
98443 +Why that layer needed? Why BIOs cannot be constructed here?
98444 +*/
98445 +static int
98446 +write_jnodes_to_disk_extent(capture_list_head * head, jnode * first, int nr,
98447 +                           const reiser4_block_nr * block_p, flush_queue_t * fq, int flags)
98448 +{
98449 +       struct super_block *super = reiser4_get_current_sb();
98450 +       int for_reclaim = flags & WRITEOUT_FOR_PAGE_RECLAIM;
98451 +       int max_blocks;
98452 +       jnode *cur = first;
98453 +       reiser4_block_nr block;
98454 +
98455 +       assert("zam-571", first != NULL);
98456 +       assert("zam-572", block_p != NULL);
98457 +       assert("zam-570", nr > 0);
98458 +
98459 +       block = *block_p;
98460 +
98461 +       ON_TRACE (TRACE_IO_W, "write of %d blocks starting from %llu\n",
98462 +                 nr, (unsigned long long)block);
98463 +
98464 +       max_blocks = bdev_get_queue(super->s_bdev)->max_sectors >> (super->s_blocksize_bits - 9);
98465 +
98466 +       while (nr > 0) {
98467 +               struct bio *bio;
98468 +               int nr_blocks = min(nr, max_blocks);
98469 +               int i;
98470 +               int nr_used;
98471 +
98472 +               bio = bio_alloc(GFP_NOIO, nr_blocks);
98473 +               if (!bio)
98474 +                       return RETERR(-ENOMEM);
98475 +
98476 +               bio->bi_bdev = super->s_bdev;
98477 +               bio->bi_sector = block * (super->s_blocksize >> 9);
98478 +               for (nr_used = 0, i = 0; i < nr_blocks; i++) {
98479 +                       struct page *pg;
98480 +                       ON_DEBUG(int jnode_is_releasable(jnode *));
98481 +
98482 +                       assert("vs-1423", ergo(jnode_is_znode(cur) || jnode_is_unformatted(cur),  JF_ISSET(cur, JNODE_SCANNED)));
98483 +                       pg = jnode_page(cur);
98484 +                       assert("zam-573", pg != NULL);
98485 +
98486 +                       page_cache_get(pg);
98487 +
98488 +                       lock_and_wait_page_writeback(pg);
98489 +
98490 +                       LOCK_JNODE(cur);
98491 +                       assert("nikita-3553", jnode_page(cur) == pg);
98492 +                       assert("nikita-3554", jprivate(pg) == cur);
98493 +
98494 +                       assert("nikita-3166",
98495 +                              ergo(!JF_ISSET(cur, JNODE_CC), pg->mapping == jnode_get_mapping(cur)));
98496 +                       if (!JF_ISSET(cur, JNODE_WRITEBACK)) {
98497 +                               assert("nikita-3165", !jnode_is_releasable(cur));
98498 +                               UNLOCK_JNODE(cur);
98499 +                               if (!bio_add_page(bio,
98500 +                                                 pg, super->s_blocksize, 0)) {
98501 +                                       /*
98502 +                                        * underlying device is satiated. Stop
98503 +                                        * adding pages to the bio.
98504 +                                        */
98505 +                                       unlock_page(pg);
98506 +                                       page_cache_release(pg);
98507 +                                       break;
98508 +                               }
98509 +
98510 +                               LOCK_JNODE(cur);
98511 +                               JF_SET(cur, JNODE_WRITEBACK);
98512 +                               JF_CLR(cur, JNODE_DIRTY);
98513 +                               UNLOCK_JNODE(cur);
98514 +
98515 +                               SetPageWriteback(pg);
98516 +                               if (for_reclaim)
98517 +                                       ent_writes_page(super, pg);
98518 +                               spin_lock(&pg->mapping->page_lock);
98519 +
98520 +                               if (REISER4_STATS && !PageDirty(pg))
98521 +                                       reiser4_stat_inc(pages_clean);
98522 +
98523 +                               /* don't check return value: submit page even if
98524 +                                  it wasn't dirty. */
98525 +                               test_clear_page_dirty(pg);
98526 +
98527 +                               list_del(&pg->list);
98528 +                               list_add(&pg->list, &pg->mapping->locked_pages);
98529 +
98530 +                               spin_unlock(&pg->mapping->page_lock);
98531 +
98532 +                               nr_used ++;
98533 +                       } else {
98534 +                               /* jnode being WRITEBACK might be replaced on
98535 +                                  ovrwr_nodes list with jnode CC. We just
98536 +                                  encountered this CC jnode. Do not submit i/o
98537 +                                  for it */
98538 +                               assert("zam-912", JF_ISSET(cur, JNODE_CC));
98539 +                               UNLOCK_JNODE(cur);
98540 +                       }
98541 +                       unlock_page(pg);
98542 +
98543 +                       nr --;
98544 +                       cur = capture_list_next(cur);
98545 +               }
98546 +               if (nr_used > 0) {
98547 +                       assert("nikita-3455",
98548 +                              bio->bi_size == super->s_blocksize * nr_used);
98549 +                       assert("nikita-3456", bio->bi_vcnt == nr_used);
98550 +
98551 +                       /* Check if we are allowed to write at all */
98552 +                       if (super->s_flags & MS_RDONLY)
98553 +                               undo_bio(bio);
98554 +                       else {
98555 +                               add_fq_to_bio(fq, bio);
98556 +                               reiser4_submit_bio(WRITE, bio);
98557 +                       }
98558 +
98559 +                       block += nr_used - 1;
98560 +                       update_blocknr_hint_default (super, &block);
98561 +                       block += 1;
98562 +               } else {
98563 +                       reiser4_stat_inc(txnmgr.empty_bio);
98564 +                       bio_put(bio);
98565 +               }
98566 +       }
98567 +       return 0;
98568 +}
98569 +
98570 +/* Clear JNODE_SCANNED on @nr consecutive capture-list jnodes starting at @j.
98571 +   Takes no lock itself; unscan_sequence() is the variant taking scan_lock. */
98572 +static void
98573 +unscan_sequence_nolock(jnode *j, int nr)
98574 +{
98575 +       int i;
98576 +
98577 +       for (i = 0; i < nr; i ++) {
98578 +               assert("vs-1631", JF_ISSET(j, JNODE_SCANNED)); /* every jnode of the run must be marked */
98579 +               JF_CLR(j, JNODE_SCANNED);
98580 +               j = capture_list_next(j);
98581 +       }
98582 +}
98583 +
98584 +static void
98585 +unscan_sequence(jnode *j, int nr)
98586 +{ /* locked wrapper: clears JNODE_SCANNED on @nr jnodes from @j under scan_lock */
98587 +       spin_lock(&scan_lock);
98588 +       unscan_sequence_nolock(j, nr);
98589 +       spin_unlock(&scan_lock);
98590 +}
98591 +
98592 +/* Walk the capture list @head, find runs of jnodes whose disk block numbers
98593 +   are contiguous, and submit one write request per run.  If @nr_submitted is
98594 +   not NULL the number of jnodes written is added to it.  Returns 0 or error. */
98595 +reiser4_internal int
98596 +write_jnode_list(capture_list_head *head, flush_queue_t *fq, long *nr_submitted, int flags)
98597 +{
98598 +       int ret;
98599 +       jnode *beg, *end;
98600 +
98601 +       spin_lock(&scan_lock);
98602 +       beg = capture_list_front(head);
98603 +       while (!capture_list_end(head, beg)) {
98604 +               int nr = 1; /* length of the current run, @beg included */
98605 +               jnode *cur;
98606 +
98607 +               JF_SET(beg, JNODE_SCANNED);
98608 +               end = beg;
98609 +               cur = capture_list_next(beg);
98610 +
98611 +               while (!capture_list_end(head, cur)) {
98612 +                       if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
98613 +                               /* jnode from which the next run of contiguous blocks starts */
98614 +                               break;
98615 +
98616 +                       JF_SET(cur, JNODE_SCANNED);
98617 +                       ++ nr;
98618 +                       end = cur;
98619 +                       cur = capture_list_next(cur);
98620 +               }
98621 +               spin_unlock(&scan_lock); /* scan_lock is not held across the write */
98622 +
98623 +               ret = write_jnodes_to_disk_extent(head, beg, nr, jnode_get_block(beg), fq, flags);
98624 +               if (ret) {
98625 +                       unscan_sequence(beg, nr); /* unmark the run before bailing out */
98626 +                       return ret;
98627 +               }
98628 +
98629 +               if (nr_submitted)
98630 +                       *nr_submitted += nr;
98631 +
98632 +               spin_lock(&scan_lock); /* retaken to unmark the run and step to the next one */
98633 +               unscan_sequence_nolock(beg, nr);
98634 +               beg = capture_list_next(end);
98635 +       }
98636 +
98637 +       spin_unlock(&scan_lock);
98638 +       return 0;
98639 +}
98640 +
98641 +/* Add @len (jnode block -> wandered block) pairs to the current atom's wandered
98642 +   map; jnodes from @cur on must be JNODE_SCANNED, blocks start at *@block_p.  */
98643 +static int
98644 +add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
98645 +{
98646 +       int ret;
98647 +       blocknr_set_entry *new_bsep = NULL;
98648 +       reiser4_block_nr block;
98649 +       int first;
98650 +       txn_atom *atom;
98651 +
98652 +       assert("zam-568", block_p != NULL);
98653 +       block = *block_p;
98654 +       assert("zam-569", len > 0);
98655 +
98656 +       while ((len--) > 0) {
98657 +               assert("vs-1422", JF_ISSET(cur, JNODE_SCANNED));
98658 +
98659 +               do { /* retried for as long as blocknr_set_add_pair() asks with -E_REPEAT */
98660 +                       atom = get_current_atom_locked();
98661 +                       assert("zam-536", !blocknr_is_fake(jnode_get_block(cur)));
98662 +                       ret = blocknr_set_add_pair(atom, &atom->wandered_map, &new_bsep, jnode_get_block(cur), &block);
98663 +               } while (ret == -E_REPEAT);
98664 +
98665 +               if (ret) {
98666 +                       /* deallocate blocks which were not added to wandered map:
98667 +                          @len was already decremented for @block, so count it too */
98668 +                       reiser4_block_nr wide_len = len + 1;
98669 +
98670 +                       reiser4_dealloc_blocks(&block, &wide_len, BLOCK_NOT_COUNTED,
98671 +                               BA_FORMATTED/* formatted, without defer */);
98672 +
98673 +                       return ret;
98674 +               }
98675 +
98676 +               UNLOCK_ATOM(atom);
98677 +
98678 +               cur = capture_list_next(cur);
98679 +               ++block;
98680 +               first = 0;
98681 +       }
98682 +
98683 +       return 0;
98684 +}
98685 +
98686 +/* Allocate wandered blocks for the current atom's OVERWRITE SET and immediately
98687 +   submit IO for the allocated blocks.  We assume the atom is in a stage where
98688 +   atom fusion is impossible, so using it unlocked is safe.  Returns 0 or error. */
98689 +static int
98690 +alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t * fq)
98691 +{
98692 +       reiser4_block_nr block;
98693 +
98694 +       int rest;
98695 +       int len, prev_len = 0, i;
98696 +       int ret;
98697 +       jnode *cur, *beg, *end;
98698 +
98699 +       assert("zam-534", ch->overwrite_set_size > 0);
98700 +
98701 +       cur = beg = end = NULL;
98702 +
98703 +       for (rest = ch->overwrite_set_size; rest > 0; rest -= len) {
98704 +               ret = get_more_wandered_blocks(rest, &block, &len);
98705 +               if (ret) {
98706 +                       if (beg != NULL) /* previous run is still marked; scan_lock is not held here */
98707 +                               unscan_sequence(beg, prev_len);
98708 +                       return ret;
98709 +               }
98710 +
98711 +               spin_lock(&scan_lock);
98712 +               if (beg == NULL)
98713 +                       cur = capture_list_front(ch->overwrite_set);
98714 +               else {
98715 +                       unscan_sequence_nolock(beg, prev_len); /* unmark the run written last time */
98716 +                       cur = capture_list_next(end);
98717 +               }
98718 +               beg = cur;
98719 +
98720 +               /* mark @len jnodes starting from @cur as scanned */
98721 +               for (i = 0; i < len; i ++) {
98722 +                       assert("vs-1633", !capture_list_end(ch->overwrite_set, cur));
98723 +                       assert("vs-1632", !JF_ISSET(cur, JNODE_SCANNED));
98724 +                       JF_SET(cur, JNODE_SCANNED);
98725 +                       end = cur;
98726 +                       cur = capture_list_next(cur);
98727 +               }
98728 +               prev_len = len;
98729 +               spin_unlock(&scan_lock);
98730 +
98731 +               ret = add_region_to_wmap(beg, len, &block);
98732 +               if (ret) {
98733 +                       unscan_sequence(beg, len);
98734 +                       return ret;
98735 +               }
98736 +               ret = write_jnodes_to_disk_extent(ch->overwrite_set, beg, len, &block, fq, 0);
98737 +               if (ret) {
98738 +                       unscan_sequence(beg, len);
98739 +                       return ret;
98740 +               }
98741 +               assert("vs-1638", rest >= len);
98742 +       }
98743 +
98744 +       assert("vs-1634", rest == 0);
98745 +       assert("vs-1635", beg != NULL && end != NULL);
98746 +       assert("vs-1639", cur == capture_list_next(end));
98747 +       assert("vs-1636", capture_list_end(ch->overwrite_set, cur));
98748 +       unscan_sequence(beg, len); /* the last run is unmarked here, no further iteration does it */
98749 +
98750 +       return 0;
98751 +}
98752 +
98753 +#else /* !REISER4_COPY_ON_CAPTURE */
98754 +
98755 +/* Call jrelse_tail() on every jnode of ch->overwrite_set. NOTE(review): presumably pairs with the jload_gfp() calls in get_overwrite_set() - confirm */
98756 +static void put_overwrite_set(struct commit_handle * ch)
98757 +{
98758 +       jnode * cur;
98759 +
98760 +       for_all_type_safe_list(capture, ch->overwrite_set, cur)
98761 +               jrelse_tail(cur);
98762 +}
98763 +
98764 +/* Walk the atom's overwrite list: jload every node, count the set size and the
98765 +   bitmap nodes, replace a fake (above-root) znode by the jnode returned by the
98766 +   disk format log_super hook, and grab space for non-leaf wandered blocks.
98767 +   Returns ch->overwrite_set_size, or an error from log_super or space grabbing. */
98768 +static int
98769 +get_overwrite_set(struct commit_handle *ch)
98770 +{
98771 +       int ret;
98772 +       jnode *cur;
98773 +       __u64 nr_not_leaves = 0;
98774 +#if REISER4_DEBUG
98775 +       __u64 nr_formatted_leaves = 0;
98776 +       __u64 nr_unformatted_leaves = 0;
98777 +#endif
98778 +
98779 +
98780 +       assert("zam-697", ch->overwrite_set_size == 0);
98781 +
98782 +       ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
98783 +       cur = capture_list_front(ch->overwrite_set);
98784 +
98785 +       while (!capture_list_end(ch->overwrite_set, cur)) {
98786 +               jnode *next = capture_list_next(cur); /* fetched up front: @cur may be uncaptured below */
98787 +
98788 +               if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
98789 +                       ON_TRACE(TRACE_LOG, "fake znode found , WANDER=(%d)\n", JF_ISSET(cur, JNODE_OVRWR));
98790 +               }
98791 +
98792 +               /* Count bitmap jnodes: reiser4_write_logs() subtracts ch->nr_bitmap
98793 +                * from the overwrite set size when reporting submitted pages. */
98794 +               if (jnode_get_type(cur) == JNODE_BITMAP)
98795 +                       ch->nr_bitmap ++;
98796 +
98797 +               assert("zam-939", JF_ISSET(cur, JNODE_OVRWR) || jnode_get_type(cur) == JNODE_BITMAP);
98798 +
98799 +               if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
98800 +                       /* we replace fake znode by another (real)
98801 +                          znode which is suggested by disk_layout
98802 +                          plugin */
98803 +
98804 +                       /* FIXME: it looks like fake znode should be
98805 +                          replaced by jnode supplied by
98806 +                          disk_layout. */
98807 +
98808 +                       struct super_block *s = reiser4_get_current_sb();
98809 +                       reiser4_super_info_data *sbinfo = get_current_super_private();
98810 +
98811 +                       if (sbinfo->df_plug->log_super) {
98812 +                               jnode *sj = sbinfo->df_plug->log_super(s);
98813 +
98814 +                               assert("zam-593", sj != NULL);
98815 +
98816 +                               if (IS_ERR(sj))
98817 +                                       return PTR_ERR(sj);
98818 +
98819 +                               LOCK_JNODE(sj);
98820 +                               JF_SET(sj, JNODE_OVRWR);
98821 +                               insert_into_atom_ovrwr_list(ch->atom, sj);
98822 +                               UNLOCK_JNODE(sj);
98823 +
98824 +                               /* jload it as the rest of overwrite set */
98825 +                               jload_gfp(sj, GFP_KERNEL, 0); /* NOTE(review): return value is not checked here */
98826 +
98827 +                               ch->overwrite_set_size++;
98828 +                       }
98829 +                       LOCK_JNODE(cur);
98830 +                       uncapture_block(cur); /* NOTE(review): no UNLOCK_JNODE follows; presumably done inside - confirm */
98831 +                       jput(cur);
98832 +
98833 +               } else {
98834 +                       int ret; /* shadows the function-level ret */
98835 +                       ch->overwrite_set_size++;
98836 +                       ret = jload_gfp(cur, GFP_KERNEL, 0);
98837 +                       if (ret)
98838 +                               reiser4_panic("zam-783", "cannot load e-flushed jnode back (ret = %d)\n", ret);
98839 +               }
98840 +
98841 +               /* Count non-leaf nodes here because we have to grab disk space
98842 +                * for their wandered blocks. They were not counted as "flush
98843 +                * reserved". Counting should be done _after_ nodes are pinned
98844 +                * into memory by jload(). */
98845 +               if (!jnode_is_leaf(cur))
98846 +                       nr_not_leaves ++;
98847 +               else {
98848 +#if REISER4_DEBUG
98849 +                       /* at this point @cur either has JNODE_FLUSH_RESERVED
98850 +                        * or is eflushed. Locking is not strong enough to
98851 +                        * write an assertion checking for this. */
98852 +                       if (jnode_is_znode(cur))
98853 +                               nr_formatted_leaves ++;
98854 +                       else
98855 +                               nr_unformatted_leaves ++;
98856 +#endif
98857 +                       JF_CLR(cur, JNODE_FLUSH_RESERVED);
98858 +               }
98859 +
98860 +               cur = next;
98861 +       }
98862 +
98863 +       /* Grab space for writing (wandered blocks) of the non-leaf nodes found in
98864 +        * the overwrite set. */
98865 +       ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
98866 +       if (ret)
98867 +               return ret;
98868 +
98869 +       /* Disk space for allocation of wandered blocks of leaf nodes is already
98870 +        * reserved as "flush reserved", move it to the grabbed space counter. */
98871 +       spin_lock_atom(ch->atom);
98872 +       assert("zam-940", nr_formatted_leaves + nr_unformatted_leaves <= ch->atom->flush_reserved);
98873 +       flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
98874 +       spin_unlock_atom(ch->atom);
98875 +
98876 +       return ch->overwrite_set_size;
98877 +}
98878 +
98879 +/* Submit write requests for @nr jnodes starting at @first; the remaining jnodes
98880 +   follow @first on the doubly-linked "capture" list.  The jnodes are written to
98881 +   the disk region of @nr blocks starting at block number *@block_p.  If @fq is
98882 +   not NULL, waiting for i/o completion is done more efficiently through
98883 +   flush_queue_t objects.  Returns 0, or RETERR(-ENOMEM) if bio_alloc() fails.
98884 +
98885 +   This function is the one which writes a list of jnodes in batch mode. It does
98886 +   all low-level work such as bio construction and page state manipulation.
98887 +*/
98888 +static int
98889 +write_jnodes_to_disk_extent(capture_list_head * head, jnode * first, int nr,
98890 +                           const reiser4_block_nr * block_p, flush_queue_t * fq, int flags)
98891 +{
98892 +       struct super_block *super = reiser4_get_current_sb();
98893 +       int for_reclaim = flags & WRITEOUT_FOR_PAGE_RECLAIM;
98894 +       int max_blocks;
98895 +       jnode *cur = first;
98896 +       reiser4_block_nr block;
98897 +
98898 +       assert("zam-571", first != NULL);
98899 +       assert("zam-572", block_p != NULL);
98900 +       assert("zam-570", nr > 0);
98901 +
98902 +       block = *block_p;
98903 +
98904 +       ON_TRACE (TRACE_IO_W, "write of %d blocks starting from %llu\n",
98905 +                 nr, (unsigned long long)block);
98906 +
98907 +       max_blocks = bdev_get_queue(super->s_bdev)->max_sectors >> (super->s_blocksize_bits - 9); /* queue limit in sectors converted to blocks */
98908 +
98909 +       while (nr > 0) {
98910 +               struct bio *bio;
98911 +               int nr_blocks = min(nr, max_blocks);
98912 +               int i;
98913 +               int nr_used; /* pages actually added to this bio */
98914 +
98915 +               bio = bio_alloc(GFP_NOIO, nr_blocks);
98916 +               if (!bio)
98917 +                       return RETERR(-ENOMEM);
98918 +
98919 +               bio->bi_bdev = super->s_bdev;
98920 +               bio->bi_sector = block * (super->s_blocksize >> 9); /* block number converted to 512-byte sectors */
98921 +               for (nr_used = 0, i = 0; i < nr_blocks; i++) {
98922 +                       struct page *pg;
98923 +                       ON_DEBUG(int jnode_is_releasable(jnode *)); /* debug-only local prototype */
98924 +
98925 +                       pg = jnode_page(cur);
98926 +                       assert("zam-573", pg != NULL);
98927 +
98928 +                       page_cache_get(pg);
98929 +
98930 +                       lock_and_wait_page_writeback(pg);
98931 +
98932 +                       if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
98933 +                               /*
98934 +                                * underlying device is satiated. Stop adding
98935 +                                * pages to the bio; undo the lock and reference.
98936 +                                */
98937 +                               unlock_page(pg);
98938 +                               page_cache_release(pg);
98939 +                               break;
98940 +                       }
98941 +
98942 +                       LOCK_JNODE(cur);
98943 +                       ON_DEBUG_MODIFY(znode_set_checksum(cur, 1));
98944 +                       assert("nikita-3166",
98945 +                              pg->mapping == jnode_get_mapping(cur));
98946 +                       assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
98947 +                       assert("nikita-3165", !jnode_is_releasable(cur));
98948 +                       JF_SET(cur, JNODE_WRITEBACK);
98949 +                       JF_CLR(cur, JNODE_DIRTY);
98950 +                       UNLOCK_JNODE(cur);
98951 +
98952 +                       if (REISER4_STATS && !PageDirty(pg))
98953 +                               reiser4_stat_inc(pages_clean);
98954 +
98955 +                       set_page_writeback(pg);
98956 +                        if (for_reclaim)
98957 +                               ent_writes_page(super, pg);
98958 +                       /* clear DIRTY or REISER4_MOVED tag if it is set */
98959 +                       reiser4_clear_page_dirty(pg);
98960 +
98961 +                       unlock_page(pg);
98962 +
98963 +                       cur = capture_list_next(cur);
98964 +                       nr_used ++;
98965 +               }
98966 +               if (nr_used > 0) {
98967 +                       assert("nikita-3453",
98968 +                              bio->bi_size == super->s_blocksize * nr_used);
98969 +                       assert("nikita-3454", bio->bi_vcnt == nr_used);
98970 +
98971 +                       /* Check if we are allowed to write at all */
98972 +                       if (super->s_flags & MS_RDONLY)
98973 +                               undo_bio(bio);
98974 +                       else {
98975 +                               add_fq_to_bio(fq, bio);
98976 +                               reiser4_submit_bio(WRITE, bio);
98977 +                       }
98978 +
98979 +                       block += nr_used - 1; /* hint is set to the last block of this bio */
98980 +                       update_blocknr_hint_default (super, &block);
98981 +                       block += 1; /* first block of the next bio */
98982 +               } else {
98983 +                       reiser4_stat_inc(txnmgr.empty_bio);
98984 +                       bio_put(bio);
98985 +               }
98986 +               nr -= nr_used; /* jnodes not accepted by this bio are retried with a new one */
98987 +       }
98988 +
98989 +       return 0;
98990 +}
98991 +
98992 +/* Walk the capture list @head, find runs of jnodes whose disk block numbers
98993 +   are contiguous, and submit one write request per run.  If @nr_submitted is
98994 +   not NULL the number of jnodes written is added to it.  Returns 0 or error. */
98995 +reiser4_internal int
98996 +write_jnode_list (capture_list_head * head, flush_queue_t * fq, long *nr_submitted, int flags)
98997 +{
98998 +       int ret;
98999 +       jnode *beg = capture_list_front(head);
99000 +
99001 +       while (!capture_list_end(head, beg)) {
99002 +               int nr = 1; /* length of the current run, @beg included */
99003 +               jnode *cur = capture_list_next(beg);
99004 +
99005 +               while (!capture_list_end(head, cur)) {
99006 +                       if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
99007 +                               break; /* @cur starts the next run */
99008 +                       ++nr;
99009 +                       cur = capture_list_next(cur);
99010 +               }
99011 +
99012 +               ret = write_jnodes_to_disk_extent(head, beg, nr, jnode_get_block(beg), fq, flags);
99013 +               if (ret)
99014 +                       return ret;
99015 +
99016 +               if (nr_submitted)
99017 +                       *nr_submitted += nr;
99018 +
99019 +               beg = cur;
99020 +       }
99021 +
99022 +       return 0;
99023 +}
99024 +
99025 +/* add given wandered mapping to atom's wandered map */
99026 +static int
99027 +add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
99028 +{
99029 +       int ret;
99030 +       blocknr_set_entry *new_bsep = NULL;
99031 +       reiser4_block_nr block;
99032 +
99033 +       txn_atom *atom;
99034 +
99035 +       assert("zam-568", block_p != NULL);
99036 +       block = *block_p;
99037 +       assert("zam-569", len > 0);
99038 +
99039 +       while ((len--) > 0) {
99040 +               do {
99041 +                       atom = get_current_atom_locked();
99042 +                       assert("zam-536", !blocknr_is_fake(jnode_get_block(cur)));
99043 +                       ret = blocknr_set_add_pair(atom, &atom->wandered_map, &new_bsep, jnode_get_block(cur), &block);
99044 +               } while (ret == -E_REPEAT);
99045 +
99046 +               if (ret) {
99047 +                       /* deallocate blocks which were not added to wandered
99048 +                          map */
99049 +                       reiser4_block_nr wide_len = len;
99050 +
99051 +                       reiser4_dealloc_blocks(&block, &wide_len, BLOCK_NOT_COUNTED,
99052 +                               BA_FORMATTED/* formatted, without defer */);
99053 +
99054 +                       return ret;
99055 +               }
99056 +
99057 +               UNLOCK_ATOM(atom);
99058 +
99059 +               cur = capture_list_next(cur);
99060 +               ++block;
99061 +       }
99062 +
99063 +       return 0;
99064 +}
99065 +
99066 +/* Allocate wandered blocks for the current atom's OVERWRITE SET and immediately
99067 +   submit IO for the allocated blocks.  We assume the atom is in a stage where
99068 +   atom fusion is impossible, so using it unlocked is safe.  Returns 0 or error. */
99069 +reiser4_internal int
99070 +alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t * fq)
99071 +{
99072 +       reiser4_block_nr block;
99073 +
99074 +       int rest; /* jnodes of the overwrite set that still need a wandered block */
99075 +       int len;
99076 +       int ret;
99077 +
99078 +       jnode *cur;
99079 +
99080 +       assert("zam-534", ch->overwrite_set_size > 0);
99081 +
99082 +       rest = ch->overwrite_set_size;
99083 +
99084 +       cur = capture_list_front(ch->overwrite_set);
99085 +       while (!capture_list_end(ch->overwrite_set, cur)) {
99086 +               assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
99087 +
99088 +               ret = get_more_wandered_blocks(rest, &block, &len); /* sets @block and @len for the next extent */
99089 +               if (ret)
99090 +                       return ret;
99091 +
99092 +               rest -= len;
99093 +
99094 +               ret = add_region_to_wmap(cur, len, &block);
99095 +               if (ret)
99096 +                       return ret;
99097 +
99098 +               ret = write_jnodes_to_disk_extent(ch->overwrite_set, cur, len, &block, fq, 0);
99099 +               if (ret)
99100 +                       return ret;
99101 +
99102 +               while ((len--) > 0) { /* step @cur past the jnodes just written */
99103 +                       assert("zam-604", !capture_list_end(ch->overwrite_set, cur));
99104 +                       cur = capture_list_next(cur);
99105 +               }
99106 +       }
99107 +
99108 +       return 0;
99109 +}
99110 +
99111 +#endif /* ! REISER4_COPY_ON_CAPTURE */
99112 +
99113 +/* Allocate ch->tx_size blocks for wander records, link their jnodes into
99114 +   ch->tx_list, format and fill them, and submit them. Returns 0 or an error. */
99115 +static int
99116 +alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
99117 +{
99118 +       reiser4_blocknr_hint hint;
99119 +
99120 +       reiser4_block_nr allocated = 0;
99121 +       reiser4_block_nr first, len;
99122 +
99123 +       jnode *cur;
99124 +       jnode *txhead;
99125 +       int ret;
99126 +
99127 +       assert("zam-698", ch->tx_size > 0);
99128 +       assert("zam-699", capture_list_empty(&ch->tx_list));
99129 +
99130 +       while (allocated < (unsigned) ch->tx_size) {
99131 +               len = (ch->tx_size - allocated);
99132 +
99133 +               blocknr_hint_init(&hint);
99134 +
99135 +               hint.block_stage = BLOCK_GRABBED;
99136 +
99137 +               /* FIXME: there should be some block allocation policy for
99138 +                  nodes which contain wander records */
99139 +
99140 +               /* We assume that disk space for wandered record blocks can be
99141 +                * taken from reserved area. */
99142 +               ret = reiser4_alloc_blocks (&hint, &first, &len,
99143 +                       BA_FORMATTED | BA_RESERVED | BA_USE_DEFAULT_SEARCH_START);
99144 +
99145 +               blocknr_hint_done(&hint);
99146 +
99147 +               if (ret)
99148 +                       return ret;
99149 +
99150 +               allocated += len;
99151 +
99152 +               /* create jnodes for all wander records; @len counts blocks not yet assigned */
99153 +               for (; len > 0; len--) { /* decrement only after success so the error path frees @first too */
99154 +                       cur = alloc_io_head(&first);
99155 +
99156 +                       if (cur == NULL) {
99157 +                               ret = RETERR(-ENOMEM);
99158 +                               goto free_not_assigned;
99159 +                       }
99160 +
99161 +                       ret = jinit_new(cur, GFP_KERNEL);
99162 +
99163 +                       if (ret != 0) {
99164 +                               jfree(cur);
99165 +                               goto free_not_assigned;
99166 +                       }
99167 +
99168 +                       pin_jnode_data(cur);
99169 +
99170 +                       capture_list_push_back(&ch->tx_list, cur);
99171 +
99172 +                       first++;
99173 +               }
99174 +       }
99175 +
99176 +       {                       /* format an on-disk linked list of wander records */
99177 +               int serial = 1;
99178 +
99179 +               txhead = capture_list_front(&ch->tx_list);
99180 +               format_tx_head(ch);
99181 +
99182 +               cur = capture_list_next(txhead);
99183 +               while (!capture_list_end(&ch->tx_list, cur)) {
99184 +                       format_wander_record(ch, cur, serial++);
99185 +                       cur = capture_list_next(cur);
99186 +               }
99187 +
99188 +       }
99189 +
99190 +       {                       /* Fill wander records with Wandered Set */
99191 +               struct store_wmap_params params;
99192 +               txn_atom *atom;
99193 +
99194 +               params.cur = capture_list_next(txhead);
99195 +
99196 +               params.idx = 0;
99197 +               params.capacity = wander_record_capacity(reiser4_get_current_sb());
99198 +
99199 +               atom = get_current_atom_locked();
99200 +               blocknr_set_iterator(atom, &atom->wandered_map, &store_wmap_actor, &params, 0);
99201 +               UNLOCK_ATOM(atom);
99202 +       }
99203 +
99204 +       {                       /* relse all jnodes from tx_list */
99205 +               cur = capture_list_front(&ch->tx_list);
99206 +               while (!capture_list_end(&ch->tx_list, cur)) {
99207 +                       jrelse(cur);
99208 +                       cur = capture_list_next(cur);
99209 +               }
99210 +       }
99211 +
99212 +       ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
99213 +
99214 +       return ret;
99215 +
99216 +free_not_assigned:
99217 +       /* We deallocate the @len blocks starting at @first which are not yet assigned
99218 +          to jnodes on tx_list. The caller takes care of invalidating the tx list  */
99219 +       reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
99220 +
99221 +       return ret;
99222 +}
99223 +
99224 +/* We assume that at this moment all captured blocks are marked as RELOC or
99225 +   WANDER (belong to Relocate or Overwrite set), all nodes from Relocate set
99226 +   are submitted to write.
99227 +*/
99228 +
99229 +reiser4_internal int reiser4_write_logs(long * nr_submitted)
99230 +{
99231 +       txn_atom *atom;
99232 +
99233 +       struct super_block *super = reiser4_get_current_sb();
99234 +       reiser4_super_info_data *sbinfo = get_super_private(super);
99235 +
99236 +       struct commit_handle ch;
99237 +
99238 +       int ret;
99239 +
99240 +#if REISER4_STATS
99241 +       unsigned long commit_start_time = jiffies;
99242 +#endif
99243 +       writeout_mode_enable();
99244 +
99245 +       /* block allocator may add j-nodes to the clean_list */
99246 +       ret = pre_commit_hook();
99247 +       if (ret)
99248 +               return ret;
99249 +
99250 +       /* No locks are required if we take atom which stage >=
99251 +        * ASTAGE_PRE_COMMIT */
99252 +       atom = get_current_context()->trans->atom;
99253 +       assert("zam-965", atom != NULL);
99254 +
99255 +       /* relocate set is on the atom->clean_nodes list after
99256 +        * current_atom_complete_writes() finishes. It can be safely
99257 +        * uncaptured after commit_semaphore is taken, because any atom that
99258 +        * captures these nodes is guaranteed to commit after current one.
99259 +        *
99260 +        * This can only be done after pre_commit_hook(), because it is where
99261 +        * early flushed jnodes with CREATED bit are transferred to the
99262 +        * overwrite list. */
99263 +       invalidate_list(ATOM_CLEAN_LIST(atom));
99264 +       LOCK_ATOM(atom);
99265 +       /* There might be waiters for the relocate nodes which we have
99266 +        * released, wake them up. */
99267 +       atom_send_event(atom);
99268 +       UNLOCK_ATOM(atom);
99269 +
99270 +       /* trace_mark(wander); */
99271 +       write_current_logf(WRITE_IO_LOG, "mark=wander\n");
99272 +
99273 +       if (REISER4_DEBUG) {
99274 +                int level;
99275 +
99276 +                for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++ level)
99277 +                         assert("nikita-3352",
99278 +                                capture_list_empty(ATOM_DIRTY_LIST(atom, level)));
99279 +       }
99280 +
99281 +       sbinfo->nr_files_committed += (unsigned) atom->nr_objects_created;
99282 +       sbinfo->nr_files_committed -= (unsigned) atom->nr_objects_deleted;
99283 +
99284 +       init_commit_handle(&ch, atom);
99285 +
99286 +       ch.free_blocks = sbinfo->blocks_free_committed;
99287 +       ch.nr_files = sbinfo->nr_files_committed;
99288 +       /* ZAM-FIXME-HANS: email me what the contention level is for the super
99289 +        * lock. */
99290 +       ch.next_oid = oid_next(super);
99291 +
99292 +       /* count overwrite set and place it in a separate list */
99293 +       ret = get_overwrite_set(&ch);
99294 +
99295 +       if (ret <= 0) {
99296 +               /* It is possible that overwrite set is empty here, it means
99297 +                  all captured nodes are clean */
99298 +               goto up_and_ret;
99299 +       }
99300 +
99301 +       /* Inform the caller about what number of dirty pages will be
99302 +        * submitted to disk. */
99303 +       *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
99304 +
99305 +       ON_TRACE(TRACE_LOG, "commit atom (id = %u, count = %u)\n", atom->atom_id, atom->capture_count);
99306 +
99307 +       /* count all records needed for storing of the wandered set */
99308 +       get_tx_size(&ch);
99309 +
99310 +       /* Grab more space for wandered records. */
99311 +       ret = reiser4_grab_space_force((__u64)(ch.tx_size), BA_RESERVED);
99312 +       if (ret)
99313 +               goto up_and_ret;
99314 +
99315 +       {
99316 +               flush_queue_t *fq;
99317 +
99318 +               fq = get_fq_for_current_atom();
99319 +
99320 +               if (IS_ERR(fq)) {
99321 +                       ret = PTR_ERR(fq);
99322 +                       goto up_and_ret;
99323 +               }
99324 +
99325 +               UNLOCK_ATOM(fq->atom);
99326 +
99327 +               do {
99328 +                       ret = alloc_wandered_blocks(&ch, fq);
99329 +                       if (ret)
99330 +                               break;
99331 +
99332 +                       ret = alloc_tx(&ch, fq);
99333 +                       if (ret)
99334 +                               break;
99335 +               } while (0);
99336 +
99337 +
99338 +               /* Release all grabbed space if it was not fully used for
99339 +                * wandered blocks/records allocation. */
99340 +               all_grabbed2free();
99341 +
99342 +               fq_put(fq);
99343 +               if (ret)
99344 +                       goto up_and_ret;
99345 +       }
99346 +
99347 +       ret = current_atom_finish_all_fq();
99348 +       if (ret)
99349 +               goto up_and_ret;
99350 +
99351 +       ON_TRACE(TRACE_LOG, "overwrite set (%u blocks) written to wandered locations\n", ch.overwrite_set_size);
99352 +
99353 +       if ((ret = update_journal_header(&ch)))
99354 +               goto up_and_ret;
99355 +
99356 +       ON_TRACE(TRACE_LOG,
99357 +                "journal header updated (tx head at block %s)\n",
99358 +                sprint_address(jnode_get_block(capture_list_front(&ch.tx_list))));
99359 +
99360 +       reiser4_stat_inc(txnmgr.commits);
99361 +
99362 +       UNDER_SPIN_VOID(atom, atom, atom_set_stage(atom, ASTAGE_POST_COMMIT));
99363 +
99364 +       /* trace_mark(ovrwr); */
99365 +       write_current_logf(WRITE_IO_LOG, "mark=ovrwr\n");
99366 +
99367 +       post_commit_hook();
99368 +
99369 +       {
99370 +               /* force j-nodes write back */
99371 +
99372 +               flush_queue_t *fq;
99373 +
99374 +               fq = get_fq_for_current_atom();
99375 +
99376 +               if (IS_ERR(fq)) {
99377 +                       ret = PTR_ERR(fq);
99378 +                       goto up_and_ret;
99379 +               }
99380 +
99381 +               UNLOCK_ATOM(fq->atom);
99382 +
99383 +               ret = write_jnode_list(ch.overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
99384 +
99385 +               fq_put(fq);
99386 +
99387 +               if (ret)
99388 +                       goto up_and_ret;
99389 +       }
99390 +
99391 +       ret = current_atom_finish_all_fq();
99392 +
99393 +       if (ret)
99394 +               goto up_and_ret;
99395 +
99396 +       ON_TRACE(TRACE_LOG, "overwrite set written in place\n");
99397 +
99398 +       if ((ret = update_journal_footer(&ch)))
99399 +               goto up_and_ret;
99400 +
99401 +       ON_TRACE(TRACE_LOG,
99402 +                "journal footer updated (tx head at block %s)\n",
99403 +                sprint_address(jnode_get_block(capture_list_front(&ch.tx_list))));
99404 +
99405 +       post_write_back_hook();
99406 +
99407 +       reiser4_stat_inc(txnmgr.post_commit_writes);
99408 +       reiser4_stat_add(txnmgr.time_spent_in_commits, jiffies - commit_start_time);
99409 +
99410 +up_and_ret:
99411 +       if (ret) {
99412 +               /* there could be fq attached to current atom; the only way to
99413 +                  remove them is: */
99414 +               current_atom_finish_all_fq();
99415 +       }
99416 +
99417 +       /* free blocks of flushed transaction */
99418 +       dealloc_tx_list(&ch);
99419 +       dealloc_wmap(&ch);
99420 +
99421 +       put_overwrite_set(&ch);
99422 +
99423 +       done_commit_handle(&ch);
99424 +
99425 +       writeout_mode_disable();
99426 +
99427 +       return ret;
99428 +}
99429 +
99430 +/* consistency checks for journal data/control blocks: header, footer, log
99431 +   records, transactions head blocks. All functions return zero on success. */
99432 +
99433 +static int
99434 +check_journal_header(const jnode * node UNUSED_ARG)
99435 +{
99436 +       /* FIXME: journal header has no magic field yet, so the check always passes. */
99437 +       return 0;
99438 +}
99439 +
99440 +/* wait for write completion for all jnodes from given list */
99441 +static int
99442 +wait_on_jnode_list(capture_list_head * head)
99443 +{
99444 +       jnode *scan;
99445 +       int ret = 0;
99446 +
99447 +       for_all_type_safe_list(capture, head, scan) {
99448 +               struct page *pg = jnode_page(scan);
99449 +
99450 +               if (pg) {
99451 +                       if (PageWriteback(pg))
99452 +                               wait_on_page_writeback(pg);
99453 +
99454 +                       if (PageError(pg))
99455 +                               ret++;
99456 +               }
99457 +       }
99458 +
99459 +       return ret;
99460 +}
99461 +
99462 +static int
99463 +check_journal_footer(const jnode * node UNUSED_ARG)
99464 +{
99465 +       /* FIXME: journal footer has no magic field yet, so the check always passes. */
99466 +       return 0;
99467 +}
99468 +
99469 +static int
99470 +check_tx_head(const jnode * node)
99471 +{
99472 +       struct tx_header *txh = (struct tx_header *) jdata(node);
99473 +
99474 +       /* a valid transaction head starts with the TX_HEADER_MAGIC string */
99475 +       if (memcmp(txh->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) == 0)
99476 +               return 0;
99477 +       warning("zam-627", "tx head at block %s corrupted\n",
99478 +               sprint_address(jnode_get_block(node)));
99479 +       return RETERR(-EIO);
99480 +}
99481 +
99482 +static int
99483 +check_wander_record(const jnode * node)
99484 +{
99485 +       struct wander_record_header *rec = (struct wander_record_header *) jdata(node);
99486 +
99487 +       /* a valid wander record starts with the WANDER_RECORD_MAGIC string */
99488 +       if (memcmp(rec->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) == 0)
99489 +               return 0;
99490 +       warning("zam-628", "wander record at block %s corrupted\n",
99491 +               sprint_address(jnode_get_block(node)));
99492 +       return RETERR(-EIO);
99493 +}
99494 +
99495 +/* Fill @ch with what update_journal_footer() needs, read from @tx_head. */
99496 +static int
99497 +restore_commit_handle(struct commit_handle *ch, jnode * tx_head)
99498 +{
99499 +       struct tx_header *txh;
99500 +       int err = jload(tx_head);
99501 +
99502 +       /* without the tx head data there is nothing to restore from */
99503 +       if (err != 0)
99504 +               return err;
99505 +
99506 +       /* counters stored in the transaction head */
99507 +       txh = (struct tx_header *) jdata(tx_head);
99508 +       ch->free_blocks = d64tocpu(&txh->free_blocks);
99509 +       ch->nr_files = d64tocpu(&txh->nr_files);
99510 +       ch->next_oid = d64tocpu(&txh->next_oid);
99511 +
99512 +       jrelse(tx_head);
99513 +
99514 +       /* only on success is @tx_head linked at the front of ch->tx_list */
99515 +       capture_list_push_front(&ch->tx_list, tx_head);
99516 +
99517 +       return 0;
99518 +}
99519 +
99520 +/* Replay one transaction: rebuild its overwrite set from the wander records
99521 +   chained from *log_rec_block_p up to *end_block (@nr_wander_records of them
99522 +   are expected), write the set in place and update the journal footer.
99523 +   Returns 0 or an error code; all nodes collected here are released first. */
99524 +static int
99525 +replay_transaction(const struct super_block *s,
99526 +                  jnode * tx_head,
99527 +                  const reiser4_block_nr * log_rec_block_p,
99528 +                  const reiser4_block_nr * end_block, unsigned int nr_wander_records)
99529 +{
99530 +       reiser4_block_nr log_rec_block = *log_rec_block_p;
99531 +       struct commit_handle ch;
99532 +       capture_list_head overwrite_set;
99533 +       jnode *log;
99534 +       int ret;
99535 +
99536 +       init_commit_handle(&ch, NULL);
99537 +       capture_list_init(&overwrite_set);
99538 +       ch.overwrite_set = &overwrite_set;
99539 +
99540 +       /* on failure @tx_head was not put on ch.tx_list: nothing to unlink */
99541 +       ret = restore_commit_handle(&ch, tx_head);
99542 +       if (ret)
99543 +               goto free_handle;
99544 +
99545 +       while (log_rec_block != *end_block) {
99546 +               struct wander_record_header *header;
99547 +               struct wander_entry *entry;
99548 +               int i;
99549 +
99550 +               if (nr_wander_records == 0) {
99551 +                       warning("zam-631",
99552 +                               "number of wander records in the linked list" " greater than number stored in tx head.\n");
99553 +                       ret = RETERR(-EIO);
99554 +                       goto free_ow_set;
99555 +               }
99556 +
99557 +               log = alloc_io_head(&log_rec_block);
99558 +               if (log == NULL) {
99559 +                       ret = RETERR(-ENOMEM);
99560 +                       goto free_ow_set;
99561 +               }
99562 +
99563 +               ret = jload(log);
99564 +               if (ret < 0) {
99565 +                       drop_io_head(log);
99566 +                       goto free_ow_set;
99567 +               }
99568 +
99569 +               ret = check_wander_record(log);
99570 +               if (ret) {
99571 +                       jrelse(log);
99572 +                       drop_io_head(log);
99573 +                       goto free_ow_set;
99574 +               }
99575 +
99576 +               header = (struct wander_record_header *) jdata(log);
99577 +               log_rec_block = d64tocpu(&header->next_block);
99578 +
99579 +               entry = (struct wander_entry *) (header + 1);
99580 +
99581 +               /* restore overwrite set from wander record content */
99582 +               for (i = 0; i < wander_record_capacity(s); i++) {
99583 +                       reiser4_block_nr block;
99584 +                       jnode *node;
99585 +
99586 +                       block = d64tocpu(&entry->wandered);
99587 +
99588 +                       /* a zero wandered location ends the used entries */
99589 +                       if (block == 0)
99590 +                               break;
99591 +
99592 +                       node = alloc_io_head(&block);
99593 +                       if (node == NULL) {
99594 +                               ret = RETERR(-ENOMEM);
99595 +                               /* FIXME-VS:??? */
99596 +                               jrelse(log);
99597 +                               drop_io_head(log);
99598 +                               goto free_ow_set;
99599 +                       }
99600 +
99601 +                       ret = jload(node);
99602 +                       if (ret < 0) {
99603 +                               drop_io_head(node);
99604 +                               /* FIXME-VS:??? */
99605 +                               jrelse(log);
99606 +                               drop_io_head(log);
99607 +                               goto free_ow_set;
99608 +                       }
99609 +
99610 +                       block = d64tocpu(&entry->original);
99611 +
99612 +                       assert("zam-603", block != 0);
99613 +
99614 +                       jnode_set_block(node, &block);
99615 +
99616 +                       capture_list_push_back(ch.overwrite_set, node);
99617 +
99618 +                       ++entry;
99619 +               }
99620 +
99621 +               jrelse(log);
99622 +               drop_io_head(log);
99623 +
99624 +               --nr_wander_records;
99625 +       }
99626 +
99627 +       if (nr_wander_records != 0) {
99628 +               warning("zam-632", "number of wander records in the linked list"
99629 +                       " less than number stored in tx head.\n");
99630 +               ret = RETERR(-EIO);
99631 +               goto free_ow_set;
99632 +       }
99633 +
99634 +       /* write wandered set in place; pages submitted before a failure still
99635 +          have to finish writeback before the set may be released */
99636 +       ret = write_jnode_list(ch.overwrite_set, 0, NULL, 0);
99637 +       if (wait_on_jnode_list(ch.overwrite_set) && ret == 0)
99638 +               ret = RETERR(-EIO);
99639 +       if (ret)
99640 +               goto free_ow_set;
99641 +
99642 +       ret = update_journal_footer(&ch);
99643 +
99644 +free_ow_set:
99645 +       while (!capture_list_empty(ch.overwrite_set)) {
99646 +               jnode *cur = capture_list_front(ch.overwrite_set);
99647 +               capture_list_remove_clean (cur);
99648 +               jrelse(cur);
99649 +               drop_io_head(cur);
99650 +       }
99651 +
99652 +       capture_list_remove_clean (tx_head);
99653 +free_handle:
99654 +       done_commit_handle(&ch);
99655 +
99656 +       return ret;
99657 +}
99658 +
99659 +/* Find the oldest committed but not yet flushed transaction and replay it:
99660 + * its commit updated the journal header block, but writing the overwrite set
99661 + * in place and updating the journal footer block were not completed. The
99662 + * overwrite set is recovered from its wandered locations and written in place
99663 + * via replay_transaction(). Returns 0 when nothing is left to replay,
99664 + * -E_REPEAT after one transaction was replayed, or an error code. */
99665 +static int
99666 +replay_oldest_transaction(struct super_block *s)
99667 +{
99668 +       reiser4_super_info_data *sbinfo = get_super_private(s);
99669 +       jnode *jf = sbinfo->journal_footer;
99670 +       unsigned int total;
99671 +       struct journal_footer *F;
99672 +       struct tx_header *T;
99673 +
99674 +       reiser4_block_nr prev_tx;
99675 +       reiser4_block_nr last_flushed_tx;
99676 +       reiser4_block_nr log_rec_block = 0;
99677 +
99678 +       jnode *tx_head;
99679 +
99680 +       int ret;
99681 +
99682 +       if ((ret = jload(jf)) < 0)
99683 +               return ret;
99684 +
99685 +       F = (struct journal_footer *) jdata(jf);
99686 +
99687 +       last_flushed_tx = d64tocpu(&F->last_flushed_tx);
99688 +
99689 +       jrelse(jf);
99690 +
99691 +       if (sbinfo->last_committed_tx == last_flushed_tx) {
99692 +               /* footer already names the last committed tx: nothing to replay */
99693 +               return 0;
99694 +       }
99695 +
99696 +       ON_TRACE(TRACE_REPLAY, "not flushed transactions found.");
99697 +
99698 +       prev_tx = sbinfo->last_committed_tx;
99699 +
99700 +       /* walk back through prev_tx links to the oldest not flushed transaction */
99701 +       while (1) {
99702 +               tx_head = alloc_io_head(&prev_tx);
99703 +               if (!tx_head)
99704 +                       return RETERR(-ENOMEM);
99705 +
99706 +               ret = jload(tx_head);
99707 +               if (ret < 0) {
99708 +                       drop_io_head(tx_head);
99709 +                       return ret;
99710 +               }
99711 +
99712 +               ret = check_tx_head(tx_head);
99713 +               if (ret) {
99714 +                       jrelse(tx_head);
99715 +                       drop_io_head(tx_head);
99716 +                       return ret;
99717 +               }
99718 +
99719 +               T = (struct tx_header *) jdata(tx_head);
99720 +
99721 +               prev_tx = d64tocpu(&T->prev_tx);
99722 +
99723 +               if (prev_tx == last_flushed_tx)
99724 +                       break;
99725 +
99726 +               jrelse(tx_head);
99727 +               drop_io_head(tx_head);
99728 +       }
99729 +
99730 +       total = d32tocpu(&T->total);
99731 +       log_rec_block = d64tocpu(&T->next_block);
99732 +
99733 +       ON_TRACE(TRACE_REPLAY,
99734 +                "not flushed transaction found (head block %s, %u wander records)\n",
99735 +                sprint_address(jnode_get_block(tx_head)), total);
99736 +
99737 +       pin_jnode_data(tx_head);
99738 +       jrelse(tx_head);
99739 +
99740 +       ret = replay_transaction(s, tx_head, &log_rec_block, jnode_get_block(tx_head), total - 1);
99741 +
99742 +       unpin_jnode_data(tx_head);
99743 +       drop_io_head(tx_head);
99744 +
99745 +       if (ret)
99746 +               return ret;
99747 +       return -E_REPEAT;
99748 +}
99749 +
99750 +/* The current reiser4 journal implementation is optimized not to capture the
99751 +   super block when only certain super block fields are modified. Currently
99752 +   that set is (<free block count>, <OID allocator>). These fields are logged
99753 +   in a special way: they are stored in each transaction head block at atom
99754 +   commit time and written to the journal footer block at atom flush time.
99755 +   To get this information from the journal footer block into the in-memory
99756 +   super block there is a special function,
99757 +   reiser4_journal_recover_sb_data(), which should be called after the disk
99758 +   format plugin re-reads the super block after journal replaying.
99759 +*/
99760 +
99761 +/* Restore free block counter and oid allocator state of the in-memory super
99762 +   block from the journal footer. Returns 0 or the error of loading/checking it. */
99763 +reiser4_internal int
99764 +reiser4_journal_recover_sb_data(struct super_block *s)
99765 +{
99766 +       reiser4_super_info_data *sbinfo = get_super_private(s);
99767 +       struct journal_footer *footer;
99768 +       int err;
99769 +
99770 +       assert("zam-673", sbinfo->journal_footer != NULL);
99771 +
99772 +       err = jload(sbinfo->journal_footer);
99773 +       if (err)
99774 +               return err;
99775 +
99776 +       err = check_journal_footer(sbinfo->journal_footer);
99777 +       if (err == 0) {
99778 +               footer = (struct journal_footer *) jdata(sbinfo->journal_footer);
99779 +
99780 +               /* a zero last_flushed_tx means no transaction was flushed yet */
99781 +               if (d64tocpu(&footer->last_flushed_tx) != 0) {
99782 +                       /* free block counter logged with that transaction */
99783 +                       reiser4_set_free_blocks(s, d64tocpu(&footer->free_blocks));
99784 +
99785 +                       /* oid allocator state: files in use and next oid */
99786 +                       oid_init_allocator(s,
99787 +                                          d64tocpu(&footer->nr_files),
99788 +                                          d64tocpu(&footer->next_oid));
99789 +               }
99790 +       }
99791 +
99792 +       /* pairs with the jload() above */
99793 +       jrelse(sbinfo->journal_footer);
99794 +       return err;
99795 +}
99796 +
99797 +/* Replay the journal of @s: check the control blocks, remember the last
99798 +   committed transaction and replay all committed but not flushed ones. */
99799 +reiser4_internal int
99800 +reiser4_journal_replay(struct super_block *s)
99801 +{
99802 +       reiser4_super_info_data *sbinfo = get_super_private(s);
99803 +       struct journal_header *jh_data;
99804 +       jnode *header_node, *footer_node;
99805 +       int replayed = 0;
99806 +       int err;
99807 +
99808 +       assert("zam-582", sbinfo != NULL);
99809 +
99810 +       header_node = sbinfo->journal_header;
99811 +       footer_node = sbinfo->journal_footer;
99812 +
99813 +       if (header_node == NULL || footer_node == NULL) {
99814 +               /* the disk layout may not support journal structures at all;
99815 +                  that is only worth a warning */
99816 +               warning("zam-583",
99817 +                       "journal control blocks were not loaded by disk layout plugin.  "
99818 +                       "journal replaying is not possible.\n");
99819 +               return 0;
99820 +       }
99821 +
99822 +       /* the footer is loaded here only to run its consistency check */
99823 +       err = jload(footer_node);
99824 +       if (err < 0)
99825 +               return err;
99826 +
99827 +       err = check_journal_footer(footer_node);
99828 +       jrelse(footer_node);
99829 +       if (err)
99830 +               return err;
99831 +
99832 +       /* store last committed transaction info in reiser4 in-memory super
99833 +          block */
99834 +       err = jload(header_node);
99835 +       if (err < 0)
99836 +               return err;
99837 +
99838 +       err = check_journal_header(header_node);
99839 +       if (err) {
99840 +               jrelse(header_node);
99841 +               return err;
99842 +       }
99843 +
99844 +       jh_data = (struct journal_header *) jdata(header_node);
99845 +       sbinfo->last_committed_tx = d64tocpu(&jh_data->last_committed_tx);
99846 +
99847 +       jrelse(header_node);
99848 +
99849 +       /* replay_oldest_transaction() returns -E_REPEAT after each replayed
99850 +          transaction, so keep calling it until it reports something else */
99851 +       for (;;) {
99852 +               err = replay_oldest_transaction(s);
99853 +               if (err != -E_REPEAT)
99854 +                       break;
99855 +               replayed++;
99856 +       }
99857 +
99858 +       ON_TRACE(TRACE_REPLAY, "%d transactions replayed ret = %d", replayed, err);
99859 +
99860 +       return err;
99861 +}
99862 +/* Load a journal control block (journal header or footer) located at @block.
99863 +   On success *node holds the pinned jnode; on failure *node is NULL. */
99864 +static int
99865 +load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
99866 +{
99867 +       jnode *head = alloc_io_head(block);
99868 +       int err;
99869 +
99870 +       *node = head;
99871 +       if (head == NULL)
99872 +               return RETERR(-ENOMEM);
99873 +
99874 +       err = jload(head);
99875 +       if (err != 0) {
99876 +               drop_io_head(head);
99877 +               *node = NULL;
99878 +               return err;
99879 +       }
99880 +       /* pin the node's data, then release what jload() acquired */
99881 +       pin_jnode_data(head);
99882 +       jrelse(head);
99883 +       return 0;
99884 +}
99885 +
99886 +/* unpin and drop a journal control block, if loaded; *node is reset to NULL */
99887 +static void
99888 +unload_journal_control_block(jnode ** node)
99889 +{
99890 +       if (*node == NULL)
99891 +               return;
99892 +       unpin_jnode_data(*node);
99893 +       drop_io_head(*node);
99894 +       *node = NULL;
99895 +}
99896 +
99897 +/* Release the journal header and footer jnodes held by the super block @s. */
99898 +reiser4_internal void
99899 +done_journal_info(struct super_block *s)
99900 +{
99901 +       reiser4_super_info_data *sbinfo = get_super_private(s);
99902 +
99903 +       assert("zam-476", sbinfo != NULL);
99904 +       /* each call also resets the corresponding pointer to NULL */
99905 +       unload_journal_control_block(&sbinfo->journal_header);
99906 +       unload_journal_control_block(&sbinfo->journal_footer);
99907 +}
99908 +
99909 +/* Load the journal header and footer blocks named by sbinfo->jloc. Returns 0
99910 +   or an error code, in which case neither control block stays loaded. */
99911 +reiser4_internal int
99912 +init_journal_info(struct super_block *s)
99913 +{
99914 +       reiser4_super_info_data *sbinfo = get_super_private(s);
99915 +       journal_location *loc;
99916 +       int err;
99917 +
99918 +       loc = &sbinfo->jloc;
99919 +
99920 +       assert("zam-651", loc != NULL);
99921 +       assert("zam-652", loc->header != 0);
99922 +       assert("zam-653", loc->footer != 0);
99923 +
99924 +       err = load_journal_control_block(&sbinfo->journal_header, &loc->header);
99925 +       if (err != 0)
99926 +               return err;
99927 +
99928 +       err = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
99929 +       if (err == 0)
99930 +               return 0;
99931 +
99932 +       /* the footer failed to load: undo the header load as well */
99933 +       unload_journal_control_block(&sbinfo->journal_header);
99934 +       return err;
99935 +}
99936 +
99937 +/* Make Linus happy.
99938 +   Local variables:
99939 +   c-indentation-style: "K&R"
99940 +   mode-name: "LC"
99941 +   c-basic-offset: 8
99942 +   tab-width: 8
99943 +   fill-column: 80
99944 +   End:
99945 +*/
99946 diff -rupN linux-2.6.8-rc3/fs/reiser4/wander.h linux-2.6.8-rc3-a/fs/reiser4/wander.h
99947 --- linux-2.6.8-rc3/fs/reiser4/wander.h 1970-01-01 03:00:00.000000000 +0300
99948 +++ linux-2.6.8-rc3-a/fs/reiser4/wander.h       2004-08-05 21:20:52.855708509 +0400
99949 @@ -0,0 +1,135 @@
99950 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
99951 +
99952 +#if !defined (__FS_REISER4_WANDER_H__)
99953 +#define __FS_REISER4_WANDER_H__
99954 +
99955 +#include "dformat.h"
99956 +
99957 +#include <linux/fs.h>          /* for struct super_block  */
99958 +
99959 +/* REISER4 JOURNAL ON-DISK DATA STRUCTURES   */
99960 +
99961 +#define TX_HEADER_MAGIC  "TxMagic4"
99962 +#define WANDER_RECORD_MAGIC "LogMagc4"
99963 +
99964 +#define TX_HEADER_MAGIC_SIZE  (8)
99965 +#define WANDER_RECORD_MAGIC_SIZE (8)
99966 +
99967 +/* journal header block format */
99968 +struct journal_header {
99969 +       /* block number of the head of the last committed transaction */
99970 +       d64 last_committed_tx;
99971 +};
99972 +
99973 +typedef struct journal_location {
99974 +       reiser4_block_nr footer;        /* block number of the journal footer */
99975 +       reiser4_block_nr header;        /* block number of the journal header */
99976 +} journal_location;
99977 +
99978 +/* The wander.c head comment describes usage and semantic of all these structures */
99979 +/* journal footer block format */
99980 +struct journal_footer {
99981 +       /* last flushed transaction location. */
99982 +       /* This block number is no longer valid after the transaction it
99983 +          points to gets flushed; it is used only at journal replaying time
99984 +          to detect the end of the on-disk list of committed transactions
99985 +          which were not flushed completely */
99986 +       d64 last_flushed_tx;
99987 +
99988 +       /* free block counter is written to the journal footer at transaction
99989 +          flushing, not to the super block, because the free blocks counter is
99990 +          logged in a different way than super block fields (root pointer,
99991 +          for example). */
99992 +       d64 free_blocks;
99993 +
99994 +       /* number of used OIDs and maximal used OID are logged separately from
99995 +          the super block */
99996 +       d64 nr_files;
99997 +       d64 next_oid;
99998 +};
99999 +
100000 +/* Each wander record (except the first one) has a unified format: a wander
100001 +   record header followed by an array of log entries */
100002 +struct wander_record_header {
100003 +       /* when there is no predefined location for wander records, this magic
100004 +          string should help reiser4fsck. */
100005 +       char magic[WANDER_RECORD_MAGIC_SIZE];
100006 +
100007 +       /* transaction id */
100008 +       d64 id;
100009 +
100010 +       /* total number of wander records in current transaction  */
100011 +       d32 total;
100012 +
100013 +       /* this block number in transaction */
100014 +       d32 serial;
100015 +
100016 +       /* location of the next wander record; journal replay follows this link */
100017 +       d64 next_block;
100018 +};
100019 +
100020 +/* The first wander record (transaction head) of a written transaction has a
100021 +   special format */
100022 +struct tx_header {
100023 +       /* magic string makes the first block in a transaction different from
100024 +          other logged blocks; it should help fsck. */
100025 +       char magic[TX_HEADER_MAGIC_SIZE];
100026 +
100027 +       /* transaction id */
100028 +       d64 id;
100029 +
100030 +       /* total number of records (including this first tx head) in the
100031 +          transaction */
100032 +       d32 total;
100033 +
100034 +       /* aligns the next field to an 8-byte boundary; this field is always zero */
100035 +       d32 padding;
100036 +
100037 +       /* block number of previous transaction head */
100038 +       d64 prev_tx;
100039 +
100040 +       /* location of this transaction's first wander record (replay starts here) */
100041 +       d64 next_block;
100042 +
100043 +       /* committed version of the free blocks counter */
100044 +       d64 free_blocks;
100045 +
100046 +       /* number of used OIDs (nr_files) and maximal used OID are logged
100047 +          separately from the super block */
100048 +       d64 nr_files;
100049 +       d64 next_oid;
100050 +};
100051 +
100052 +/* A transaction gets written to disk as a set of wander records (each wander
100053 +   record occupies one fs block) */
100054 +
100055 +/* After its header, the rest of a wander record is filled with these log
100056 +   entries; unused space is filled with zeroes */
100057 +struct wander_entry {
100058 +       d64 original;           /* block original location */
100059 +       d64 wandered;           /* block wandered location */
100060 +};
100061 +
100062 +/* REISER4 JOURNAL WRITER FUNCTIONS   */
100063 +
100064 +extern int reiser4_write_logs(long *);
100065 +extern int reiser4_journal_replay(struct super_block *);
100066 +extern int reiser4_journal_recover_sb_data(struct super_block *);
100067 +
100068 +extern int init_journal_info(struct super_block *);
100069 +extern void done_journal_info(struct super_block *);
100070 +
100071 +extern int write_jnode_list (capture_list_head*, flush_queue_t*, long*, int);
100072 +
100073 +#endif                         /* __FS_REISER4_WANDER_H__ */
100074 +
100075 +/* Make Linus happy.
100076 +   Local variables:
100077 +   c-indentation-style: "K&R"
100078 +   mode-name: "LC"
100079 +   c-basic-offset: 8
100080 +   tab-width: 8
100081 +   fill-column: 80
100082 +   scroll-step: 1
100083 +   End:
100084 +*/
100085 diff -rupN linux-2.6.8-rc3/fs/reiser4/writeout.c linux-2.6.8-rc3-a/fs/reiser4/writeout.c
100086 --- linux-2.6.8-rc3/fs/reiser4/writeout.c       1970-01-01 03:00:00.000000000 +0300
100087 +++ linux-2.6.8-rc3-a/fs/reiser4/writeout.c     2004-08-05 21:20:52.980682149 +0400
100088 @@ -0,0 +1,24 @@
100089 +/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README  */
100090 +
100091 +#include "reiser4.h"
100092 +#include "debug.h"
100093 +#include "context.h"
100094 +#include "txnmgr.h"
100095 +#include "writeout.h"
100096 +
100097 +reiser4_internal int get_writeout_flags(void)
100098 +{
100099 +       /* page-reclaim write-out is chosen when ->entd is set or rapid flush mode is on */
100100 +       const int reclaim = get_current_context()->entd || get_rapid_flush_mode();
100101 +       return reclaim ? WRITEOUT_FOR_PAGE_RECLAIM : 0;
100102 +}
100103 +
100104 +/* Make Linus happy.
100105 +   Local variables:
100106 +   c-indentation-style: "K&R"
100107 +   mode-name: "LC"
100108 +   c-basic-offset: 8
100109 +   tab-width: 8
100110 +   fill-column: 80
100111 +   End:
100112 +*/
100113 diff -rupN linux-2.6.8-rc3/fs/reiser4/writeout.h linux-2.6.8-rc3-a/fs/reiser4/writeout.h
100114 --- linux-2.6.8-rc3/fs/reiser4/writeout.h       1970-01-01 03:00:00.000000000 +0300
100115 +++ linux-2.6.8-rc3-a/fs/reiser4/writeout.h     2004-08-05 21:20:53.374599063 +0400
100116 @@ -0,0 +1,21 @@
100117 +/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README  */
100118 +
100119 +#if !defined (__FS_REISER4_WRITEOUT_H__)
100120 +#define __FS_REISER4_WRITEOUT_H__
100121 +
100122 +/* flag bits; get_writeout_flags() returns 0 or WRITEOUT_FOR_PAGE_RECLAIM */
100123 +#define WRITEOUT_SINGLE_STREAM (0x1)
100124 +#define WRITEOUT_FOR_PAGE_RECLAIM  (0x2)
100125 +
100126 +extern int get_writeout_flags(void);
100127 +#endif /* __FS_REISER4_WRITEOUT_H__ */
100128 +
100129 +/* Make Linus happy.
100130 +   Local variables:
100131 +   c-indentation-style: "K&R"
100132 +   mode-name: "LC"
100133 +   c-basic-offset: 8
100134 +   tab-width: 8
100135 +   fill-column: 80
100136 +   End:
100137 +*/
100138 diff -rupN linux-2.6.8-rc3/fs/reiser4/znode.c linux-2.6.8-rc3-a/fs/reiser4/znode.c
100139 --- linux-2.6.8-rc3/fs/reiser4/znode.c  1970-01-01 03:00:00.000000000 +0300
100140 +++ linux-2.6.8-rc3-a/fs/reiser4/znode.c        2004-08-05 21:20:53.125651571 +0400
100141 @@ -0,0 +1,1360 @@
100142 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
100143 + * reiser4/README */
100144 +/* Znode manipulation functions. */
100145 +/* Znode is the in-memory header for a tree node. It is stored
100146 +   separately from the node itself so that it does not get written to
100147 +   disk.  In this respect znode is like buffer head or page head. We
100148 +   also use znodes for additional reiser4 specific purposes:
100149 +
100150 +    . they are organized into tree structure which is a part of whole
100151 +      reiser4 tree.
100152 +    . they are used to implement node grained locking
100153 +    . they are used to keep additional state associated with a
100154 +      node
100155 +    . they contain links to lists used by the transaction manager
100156 +
100157 +   Znode is attached to some variable "block number" which is instance of
100158 +   fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without
100159 +   appropriate node being actually loaded in memory. Existence of znode itself
100160 +   is regulated by reference count (->x_count) in it. Each time thread
100161 +   acquires reference to znode through call to zget(), ->x_count is
100162 +   incremented and decremented on call to zput().  Data (content of node) are
100163 +   brought in memory through call to zload(), which also increments ->d_count
100164 +   reference counter.  zload can block waiting on IO.  Call to zrelse()
100165 +   decreases this counter. Also, ->c_count keeps track of number of child
100166 +   znodes and prevents parent znode from being recycled until all of its
100167 +   children are. ->c_count is decremented whenever child goes out of existence
100168 +   (being actually recycled in zdestroy()) which can be some time after last
100169 +   reference to this child dies if we support some form of LRU cache for
100170 +   znodes.
100171 +
100172 +*/
100173 +/* EVERY ZNODE'S STORY
100174 +
100175 +   1. His infancy.
100176 +
100177 +   Once upon a time, the znode was born deep inside of zget() by call to
100178 +   zalloc(). At the return from zget() znode had:
100179 +
100180 +    . reference counter (x_count) of 1
100181 +    . assigned block number, marked as used in bitmap
100182 +    . pointer to parent znode. Root znode parent pointer points
100183 +      to its father: "fake" znode. This, in turn, has NULL parent pointer.
100184 +    . hash table linkage
100185 +    . no data loaded from disk
100186 +    . no node plugin
100187 +    . no sibling linkage
100188 +
100189 +   2. His childhood
100190 +
100191 +   Each node is either brought into memory as a result of tree traversal, or
100192 +   created afresh, creation of the root being a special case of the latter. In
100193 +   either case it's inserted into sibling list. This will typically require
100194 +   some ancillary tree traversing, but ultimately both sibling pointers will
100195 +   exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
100196 +   zjnode.state.
100197 +
100198 +   3. His youth.
100199 +
100200 +   If znode is bound to already existing node in a tree, its content is read
100201 +   from the disk by call to zload(). At that moment, JNODE_LOADED bit is set
100202 +   in zjnode.state and zdata() function starts to return non null for this
100203 +   znode. zload() further calls zparse() that determines which node layout
100204 +   this node is rendered in, and sets ->nplug on success.
100205 +
100206 +   If znode is for new node just created, memory for it is allocated and
100207 +   zinit_new() function is called to initialise data, according to selected
100208 +   node layout.
100209 +
100210 +   4. His maturity.
100211 +
100212 +   After this point, znode lingers in memory for some time. Threads can
100213 +   acquire references to znode either by blocknr through call to zget(), or by
100214 +   following a pointer to unallocated znode from internal item. Each time
100215 +   reference to znode is obtained, x_count is increased. Thread can read/write
100216 +   lock znode. Znode data can be loaded through calls to zload(), d_count will
100217 +   be increased appropriately. If all references to znode are released
100218 +   (x_count drops to 0), znode is not recycled immediately. Rather, it is
100219 +   still cached in the hash table in the hope that it will be accessed
100220 +   shortly.
100221 +
100222 +   There are two ways in which znode existence can be terminated:
100223 +
100224 +    . sudden death: node bound to this znode is removed from the tree
100225 +    . overpopulation: znode is purged out of memory due to memory pressure
100226 +
100227 +   5. His death.
100228 +
100229 +   Death is complex process.
100230 +
100231 +   When we irrevocably commit ourselves to decision to remove node from the
100232 +   tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
100233 +   znode. This is done either in ->kill_hook() of internal item or in
100234 +   kill_root() function when tree root is removed.
100235 +
100236 +   At this moment znode still has:
100237 +
100238 +    . locks held on it, necessarily write ones
100239 +    . references to it
100240 +    . disk block assigned to it
100241 +    . data loaded from the disk
100242 +    . pending requests for lock
100243 +
100244 +   But once JNODE_HEARD_BANSHEE bit is set, last call to unlock_znode() does node
100245 +   deletion. Node deletion includes two phases. First all ways to get
100246 +   references to that znode (sibling and parent links and hash lookup using
100247 +   block number stored in parent node) should be deleted -- it is done through
100248 +   sibling_list_remove(), also we assume that nobody uses down link from
100249 +   parent node due to its nonexistence or proper parent node locking and
100250 +   nobody uses parent pointers from children due to absence of them. Second we
100251 +   invalidate all pending lock requests which still are on znode's lock
100252 +   request queue, this is done by invalidate_lock(). Another JNODE_IS_DYING
100253 +   znode status bit is used to invalidate pending lock requests. Once it is set
100254 +   all requesters are forced to return -EINVAL from
100255 +   longterm_lock_znode(). Future locking attempts are not possible because all
100256 +   ways to get references to that znode are removed already. Last, node is
100257 +   uncaptured from transaction.
100258 +
100259 +   When last reference to the dying znode is just about to be released,
100260 +   block number for this lock is released and znode is removed from the
100261 +   hash table.
100262 +
100263 +   Now znode can be recycled.
100264 +
100265 +   [it's possible to free bitmap block and remove znode from the hash
100266 +   table when last lock is released. This will result in having
100267 +   referenced but completely orphaned znode]
100268 +
100269 +   6. Limbo
100270 +
100271 +   As has been mentioned above, znodes with reference counter 0 are
100272 +   still cached in a hash table. Once memory pressure increases they are
100273 +   purged out of there [this requires something like LRU list for
100274 +   efficient implementation. LRU list would also greatly simplify
100275 +   implementation of coord cache that would in this case morph to just
100276 +   scanning some initial segment of LRU list]. Data loaded into
100277 +   unreferenced znode are flushed back to the durable storage if
100278 +   necessary and memory is freed. Znodes themselves can be recycled at
100279 +   this point too.
100280 +
100281 +*/
100282 +
100283 +#include "debug.h"
100284 +#include "dformat.h"
100285 +#include "key.h"
100286 +#include "coord.h"
100287 +#include "plugin/plugin_header.h"
100288 +#include "plugin/node/node.h"
100289 +#include "plugin/plugin.h"
100290 +#include "txnmgr.h"
100291 +#include "jnode.h"
100292 +#include "znode.h"
100293 +#include "block_alloc.h"
100294 +#include "tree.h"
100295 +#include "tree_walk.h"
100296 +#include "super.h"
100297 +#include "reiser4.h"
100298 +#include "prof.h"
100299 +
100300 +#include <linux/pagemap.h>
100301 +#include <linux/spinlock.h>
100302 +#include <linux/slab.h>
100303 +#include <linux/err.h>
100304 +
100305 +/* hash table support */
100306 +
100307 +/* compare two block numbers for equality: non-zero iff *b1 == *b2; both pointers are asserted non-NULL. Used by hash-table macros */
100308 +static inline int
100309 +blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
100310 +{
100311 +       assert("nikita-534", b1 != NULL);
100312 +       assert("nikita-535", b2 != NULL);
100313 +
100314 +       return *b1 == *b2;
100315 +}
100316 +
100317 +/* Hash znode by block number: the bits of *b selected by (REISER4_ZNODE_HASH_TABLE_SIZE - 1); @table is not consulted. Used by hash-table macros */
100318 +/* Audited by: umka (2002.06.11) */
100319 +static inline __u32
100320 +blknrhashfn(z_hash_table *table, const reiser4_block_nr * b)
100321 +{
100322 +       assert("nikita-536", b != NULL);
100323 +       /* NOTE(review): masking only spreads over all buckets if the table size is a power of two -- presumably it is; confirm */
100324 +       return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);
100325 +}
100326 +
100327 +/* The hash table definition: instantiates the "z" hash of znodes with blknrhashfn/blknreq as hash and equality functions. KMALLOC/KFREE are defined only around the instantiation. */
100328 +#define KMALLOC(size) reiser4_kmalloc((size), GFP_KERNEL)
100329 +#define KFREE(ptr, size) reiser4_kfree(ptr)
100330 +TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z, blknrhashfn, blknreq);
100331 +#undef KFREE
100332 +#undef KMALLOC
100333 +
100334 +/* slab for znodes: created in znodes_init(), destroyed in znodes_done() */
100335 +static kmem_cache_t *znode_slab;
100336 +/* set by znodes_init(): one less than the smallest k with (1 << k) >= sizeof(znode) */
100337 +int znode_shift_order;
100338 +
100339 +/* ZNODE INITIALIZATION */
100340 +
100341 +/* call this once on reiser4 initialisation: creates the znode slab and computes znode_shift_order. Returns 0, or RETERR(-ENOMEM) if the slab cannot be created */
100342 +reiser4_internal int
100343 +znodes_init(void)
100344 +{
100345 +       znode_slab = kmem_cache_create("znode", sizeof (znode), 0,
100346 +                                      SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
100347 +                                      NULL, NULL);
100348 +       if (znode_slab == NULL) {
100349 +               return RETERR(-ENOMEM);
100350 +       } else {
100351 +               for (znode_shift_order = 0;
100352 +                    (1 << znode_shift_order) < sizeof(znode);
100353 +                    ++ znode_shift_order)
100354 +                       ; /* empty body: stop at the smallest order with (1 << order) >= sizeof(znode) */
100355 +               -- znode_shift_order; /* ... and step back by one */
100356 +               return 0;
100357 +       }
100358 +}
100359 +
100360 +/* call this before unloading reiser4: destroys the znode slab and returns the result of kmem_cache_destroy() */
100361 +reiser4_internal int
100362 +znodes_done(void)
100363 +{
100364 +       return kmem_cache_destroy(znode_slab);
100365 +}
100366 +
100367 +/* call this to initialise tree of znodes: sets up the dk lock and both znode hash tables of @tree. Returns 0 or the first non-zero z_hash_init() result */
100368 +reiser4_internal int
100369 +znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
100370 +{
100371 +       int result;
100372 +       assert("umka-050", tree != NULL);
100373 +
100374 +       rw_dk_init(tree);
100375 +       /* table searched for block numbers that get_htable() does not classify as unallocated */
100376 +       result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE,
100377 +                            reiser4_stat(tree->super, hashes.znode));
100378 +       if (result != 0)
100379 +               return result;
100380 +       result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE,
100381 +                            reiser4_stat(tree->super, hashes.zfake));
100382 +       return result; /* NOTE(review): zhash_table is not released here if this second init failed -- confirm the caller cleans up */
100383 +}
100384 +
100385 +#if REISER4_DEBUG
100386 +extern void jnode_done(jnode * node, reiser4_tree * tree);
100387 +#endif
100388 +
100389 +/* free this znode: assert that it is fully detached (no page, no lock owners or requestors, not captured, no siblings) and return it to znode_slab */
100390 +reiser4_internal void
100391 +zfree(znode * node /* znode to free */ )
100392 +{
100393 +       trace_stamp(TRACE_ZNODES);
100394 +       assert("nikita-465", node != NULL);
100395 +       assert("nikita-2120", znode_page(node) == NULL);
100396 +       assert("nikita-2301", owners_list_empty(&node->lock.owners));
100397 +       assert("nikita-2302", requestors_list_empty(&node->lock.requestors));
100398 +       assert("nikita-2663", capture_list_is_clean(ZJNODE(node)) && NODE_LIST(ZJNODE(node)) == NOT_CAPTURED);
100399 +       assert("nikita-2773", !JF_ISSET(ZJNODE(node), JNODE_EFLUSH));
100400 +       assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
100401 +       assert("nikita-3293", !znode_is_right_connected(node));
100402 +       assert("nikita-3294", !znode_is_left_connected(node));
100403 +       assert("nikita-3295", node->left == NULL);
100404 +       assert("nikita-3296", node->right == NULL);
100405 +
100406 +
100407 +       /* not yet phash_jnode_destroy(ZJNODE(node)); */
100408 +
100409 +       /* poison memory: fill the whole znode with 0xde bytes (wrapped in ON_DEBUG) before it goes back to the slab. */
100410 +       ON_DEBUG(xmemset(node, 0xde, sizeof *node));
100411 +       kmem_cache_free(znode_slab, node);
100412 +}
100413 +
100414 +/* call this to free tree of znodes: drops every znode left in both hash tables of @tree, then releases the tables */
100415 +reiser4_internal void
100416 +znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
100417 +{
100418 +       znode *node;
100419 +       znode *next;
100420 +       z_hash_table *ztable;
100421 +
100422 +       /* scan znode hash-tables and kill all znodes, then free hash tables
100423 +        * themselves. */
100424 +
100425 +       assert("nikita-795", tree != NULL);
100426 +
100427 +       IF_TRACE(TRACE_ZWEB, UNDER_RW_VOID(tree, tree, read,
100428 +                                          print_znodes("umount", tree)));
100429 +
100430 +       ztable = &tree->zhash_table;
100431 +       /* child count and parent link are cleared by hand before each zdrop(); no znode may still be x-referenced (asserted) */
100432 +       for_all_in_htable(ztable, z, node, next) {
100433 +               node->c_count = 0;
100434 +               node->in_parent.node = NULL;
100435 +               assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
100436 +               zdrop(node);
100437 +       }
100438 +
100439 +       z_hash_done(&tree->zhash_table);
100440 +
100441 +       ztable = &tree->zfake_table;
100442 +       /* same procedure for the second table */
100443 +       for_all_in_htable(ztable, z, node, next) {
100444 +               node->c_count = 0;
100445 +               node->in_parent.node = NULL;
100446 +               assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
100447 +               zdrop(node);
100448 +       }
100449 +
100450 +       z_hash_done(&tree->zfake_table);
100451 +}
100452 +
100453 +/* ZNODE STRUCTURES */
100454 +
100455 +/* allocate fresh znode from znode_slab with @gfp_flag; returns whatever kmem_cache_alloc() returned, unchecked and uninitialised (see zinit()) */
100456 +reiser4_internal znode *
100457 +zalloc(int gfp_flag /* allocation flag */ )
100458 +{
100459 +       znode *node;
100460 +
100461 +       trace_stamp(TRACE_ZNODES);
100462 +       node = kmem_cache_alloc(znode_slab, gfp_flag);
100463 +       return node;
100464 +}
100465 +
100466 +/* Initialize fields of znode: zero it, then set up its jnode, lock and parent coord
100467 +   @node:    znode to initialize;
100468 +   @parent:  parent znode;
100469 +   @tree:    tree we are in. */
100470 +reiser4_internal void
100471 +zinit(znode * node, const znode * parent, reiser4_tree * tree)
100472 +{
100473 +       assert("nikita-466", node != NULL);
100474 +       assert("umka-268", current_tree != NULL);
100475 +       /* wipe everything first: all fields not set below start out as zero */
100476 +       xmemset(node, 0, sizeof *node);
100477 +
100478 +       assert("umka-051", tree != NULL);
100479 +
100480 +       jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK);
100481 +       reiser4_init_lock(&node->lock);
100482 +       init_parent_coord(&node->in_parent, parent);
100483 +       ON_DEBUG_MODIFY(node->cksum = 0);
100484 +}
100485 +
100486 +/*
100487 + * remove znode from indices (cbk cache, parent's c_count, hash table). This is called by jput() when last reference on
100488 + * znode is released. Tree write lock must be held and @node must have no children (both asserted).
100489 + */
100490 +reiser4_internal void
100491 +znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree)
100492 +{
100493 +       assert("nikita-2108", node != NULL);
100494 +       assert("nikita-470", node->c_count == 0);
100495 +       assert("zam-879", rw_tree_is_write_locked(tree));
100496 +
100497 +       /* remove reference to this znode from cbk cache */
100498 +       cbk_cache_invalidate(node, tree);
100499 +
100500 +       /* update c_count of parent */
100501 +       if (znode_parent(node) != NULL) {
100502 +               assert("nikita-472", znode_parent(node)->c_count > 0);
100503 +               /* father, onto your hands I forward my spirit... */
100504 +               znode_parent(node)->c_count --;
100505 +               node->in_parent.node = NULL;
100506 +       } else {
100507 +               /* orphaned znode?! Root? */
100508 +       }
100509 +
100510 +       /* remove znode from hash-table */
100511 +       z_hash_remove_rcu(znode_get_htable(node), node);
100512 +}
100513 +
100514 +/* zdrop() -- Remove znode from the tree.
100515 +
100516 +   This is called when znode is removed from the memory. Thin wrapper: passes the embedded jnode to jdrop(). */
100517 +reiser4_internal void
100518 +zdrop(znode * node /* znode to finish with */ )
100519 +{
100520 +       jdrop(ZJNODE(node));
100521 +}
100522 +
100523 +/*
100524 + * put znode into right place in the hash table. This is called by relocate
100525 + * code. Takes the tree write lock itself; always returns 0.
100526 + */
100527 +reiser4_internal int
100528 +znode_rehash(znode * node /* node to rehash */ ,
100529 +            const reiser4_block_nr * new_block_nr /* new block number */ )
100530 +{
100531 +       z_hash_table *oldtable;
100532 +       z_hash_table *newtable;
100533 +       reiser4_tree *tree;
100534 +
100535 +       assert("nikita-2018", node != NULL);
100536 +
100537 +       tree = znode_get_tree(node);
100538 +       oldtable = znode_get_htable(node); /* table for the current block number */
100539 +       newtable = get_htable(tree, new_block_nr); /* may be a different table than oldtable */
100540 +
100541 +       WLOCK_TREE(tree);
100542 +       /* remove znode from hash-table */
100543 +       z_hash_remove_rcu(oldtable, node);
100544 +
100545 +       /* assertion no longer valid due to RCU */
100546 +       /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
100547 +
100548 +       /* update blocknr: both the block number proper and the hash key copy */
100549 +       znode_set_block(node, new_block_nr);
100550 +       node->zjnode.key.z = *new_block_nr;
100551 +
100552 +       /* insert it into hash */
100553 +       z_hash_insert_rcu(newtable, node);
100554 +       WUNLOCK_TREE(tree);
100555 +       return 0;
100556 +}
100557 +
100558 +/* ZNODE LOOKUP, GET, PUT */
100559 +
100560 +/* zlook() - get znode with given block_nr in a hash table or return NULL
100561 +
100562 +   The table is chosen by get_htable() and searched under rcu_read_lock().
100563 +   If a znode is found, its x_count is incremented and it is passed through
100564 +   znode_rip_check(), whose result is what gets returned.
100565 +*/
100566 +reiser4_internal znode *
100567 +zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
100568 +{
100569 +       znode        *result;
100570 +       __u32         hash;
100571 +       z_hash_table *htable;
100572 +
100573 +       trace_stamp(TRACE_ZNODES);
100574 +
100575 +       assert("jmacd-506", tree != NULL);
100576 +       assert("jmacd-507", blocknr != NULL);
100577 +
100578 +       htable = get_htable(tree, blocknr);
100579 +       hash   = blknrhashfn(htable, blocknr);
100580 +
100581 +       rcu_read_lock();
100582 +       result = z_hash_find_index(htable, hash, blocknr);
100583 +
100584 +       if (result != NULL) {
100585 +               add_x_ref(ZJNODE(result));
100586 +               result = znode_rip_check(tree, result); /* presumably NULL for a znode that is being destroyed -- confirm in znode_rip_check() */
100587 +       }
100588 +       rcu_read_unlock();
100589 +
100590 +       return result;
100591 +}
100592 +
100593 +/* return hash table where znode with block @blocknr is (or should be)
100594 + * stored: zfake_table if is_disk_addr_unallocated(@blocknr), zhash_table otherwise */
100595 +reiser4_internal z_hash_table *
100596 +get_htable(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
100597 +{
100598 +       z_hash_table *table;
100599 +       if (is_disk_addr_unallocated(blocknr))
100600 +               table = &tree->zfake_table;
100601 +       else
100602 +               table = &tree->zhash_table;
100603 +       return table;
100604 +}
100605 +
100606 +/* return hash table where znode @node is (or should be) stored, judged by @node's own tree and current block number */
100607 +reiser4_internal z_hash_table *
100608 +znode_get_htable(const znode *node)
100609 +{
100610 +       return get_htable(znode_get_tree(node), znode_get_block(node));
100611 +}
100612 +
100613 +/* zget() - get znode from hash table, allocating it if necessary.
100614 +
100615 +   First a call to zlook, locating a x-referenced znode if one
100616 +   exists.  If znode is not found, allocate new one and return.  Result
100617 +   is returned with x_count reference increased.
100618 +
100619 +   LOCKS TAKEN:   TREE_LOCK, ZNODE_LOCK
100620 +   LOCK ORDERING: NONE
100621 +*/
100622 +reiser4_internal znode *
100623 +zget(reiser4_tree * tree,
100624 +     const reiser4_block_nr * const blocknr,
100625 +     znode * parent,
100626 +     tree_level level,
100627 +     int gfp_flag)
100628 +{
100629 +       znode *result;
100630 +       __u32 hashi;
100631 +
100632 +       z_hash_table *zth;
100633 +
100634 +       trace_stamp(TRACE_ZNODES);
100635 +
100636 +       assert("jmacd-512", tree != NULL);
100637 +       assert("jmacd-513", blocknr != NULL);
100638 +       assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
100639 +
100640 +       zth = get_htable(tree, blocknr);
100641 +       hashi = blknrhashfn(zth, blocknr);
100642 +
100643 +       /* NOTE-NIKITA address-as-unallocated-blocknr still is not
100644 +          implemented. */
100645 +
100646 +       z_hash_prefetch_bucket(zth, hashi);
100647 +
100648 +       rcu_read_lock();
100649 +       /* Find a matching BLOCKNR in the hash table.  If the znode is found,
100650 +          we obtain an reference (x_count) but the znode remains unlocked.
100651 +          Have to worry about race conditions later. */
100652 +       result = z_hash_find_index(zth, hashi, blocknr);
100653 +       /* According to the current design, the hash table lock protects new
100654 +          znode references. */
100655 +       if (result != NULL) {
100656 +               add_x_ref(ZJNODE(result));
100657 +               /* NOTE-NIKITA it should be so, but special case during
100658 +                  creation of new root makes such assertion highly
100659 +                  complicated.  */
100660 +               assert("nikita-2131", 1 || znode_parent(result) == parent ||
100661 +                      (ZF_ISSET(result, JNODE_ORPHAN) && (znode_parent(result) == NULL)));
100662 +               result = znode_rip_check(tree, result);
100663 +       }
100664 +
100665 +       rcu_read_unlock();
100666 +
100667 +       if (!result) {
100668 +               znode * shadow;
100669 +
100670 +               result = zalloc(gfp_flag);
100671 +               if (!result) {
100672 +                       return ERR_PTR(RETERR(-ENOMEM));
100673 +               }
100674 +
100675 +               zinit(result, parent, tree);
100676 +               ZJNODE(result)->blocknr = *blocknr;
100677 +               ZJNODE(result)->key.z = *blocknr;
100678 +               result->level = level;
100679 +
100680 +               WLOCK_TREE(tree);
100681 +
100682 +               shadow = z_hash_find_index(zth, hashi, blocknr);
100683 +               if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
100684 +                       jnode_list_remove(ZJNODE(result));
100685 +                       zfree(result);
100686 +                       result = shadow;
100687 +               } else {
100688 +                       result->version = znode_build_version(tree);
100689 +                       z_hash_insert_index_rcu(zth, hashi, result);
100690 +
100691 +                       if (parent != NULL)
100692 +                               ++ parent->c_count;
100693 +               }
100694 +
100695 +               add_x_ref(ZJNODE(result));
100696 +
100697 +               WUNLOCK_TREE(tree);
100698 +       }
100699 +
100700 +#if REISER4_DEBUG
100701 +       if (!blocknr_is_fake(blocknr) && *blocknr != 0)
100702 +               reiser4_check_block(blocknr, 1);
100703 +#endif
100704 +       /* Check for invalid tree level, return -EIO */
100705 +       if (unlikely(znode_get_level(result) != level)) {
100706 +               warning("jmacd-504",
100707 +                       "Wrong level for cached block %llu: %i expecting %i",
100708 +                       *blocknr, znode_get_level(result), level);
100709 +               zput(result);
100710 +               return ERR_PTR(RETERR(-EIO));
100711 +       }
100712 +
100713 +       assert("nikita-1227", znode_invariant(result));
100714 +
100715 +       return result;
100716 +}
100717 +
100718 +/* ZNODE PLUGINS/DATA */
100719 +
100720 +/* "guess" plugin for node loaded from the disk: tree->nplug if REISER4_ONE_NODE_PLUGIN is set, else looked up by the plugin id that is
100721 +   stored at the fixed offset from the beginning of the node (common_node_header). All code after that second return is unreachable. */
100722 +static node_plugin *
100723 +znode_guess_plugin(const znode * node  /* znode to guess
100724 +                                        * plugin of */ )
100725 +{
100726 +       reiser4_tree * tree;
100727 +
100728 +       assert("nikita-1053", node != NULL);
100729 +       assert("nikita-1055", zdata(node) != NULL);
100730 +
100731 +       tree = znode_get_tree(node);
100732 +       assert("umka-053", tree != NULL);
100733 +
100734 +       if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) {
100735 +               return tree->nplug;
100736 +       } else {
100737 +               return node_plugin_by_disk_id
100738 +                       (tree, &((common_node_header *) zdata(node))->plugin_id);
100739 +#ifdef GUESS_EXISTS
100740 +               reiser4_plugin *plugin;
100741 +
100742 +               /* NOTE-NIKITA add locking here when dynamic plugins will be
100743 +                * implemented */
100744 +               for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
100745 +                       if ((plugin->u.node.guess != NULL) && plugin->u.node.guess(node))
100746 +                               return plugin;
100747 +               }
100748 +#endif
100749 +               warning("nikita-1057", "Cannot guess node plugin");
100750 +               print_znode("node", node);
100751 +               return NULL;
100752 +       }
100753 +}
100754 +
100755 +/* parse node header and install ->nplug. No-op returning 0 if ->nplug is already set; RETERR(-EIO) if no plugin is found; otherwise the plugin's ->parse() result */
100756 +reiser4_internal int
100757 +zparse(znode * node /* znode to parse */ )
100758 +{
100759 +       int result;
100760 +
100761 +       assert("nikita-1233", node != NULL);
100762 +       assert("nikita-2370", zdata(node) != NULL);
100763 +
100764 +       if (node->nplug == NULL) {
100765 +               node_plugin *nplug;
100766 +
100767 +               nplug = znode_guess_plugin(node);
100768 +               if (likely(nplug != NULL)) {
100769 +                       result = nplug->parse(node);
100770 +                       if (likely(result == 0))
100771 +                               node->nplug = nplug; /* installed only after a successful parse */
100772 +               } else {
100773 +                       result = RETERR(-EIO);
100774 +               }
100775 +       } else
100776 +               result = 0;
100777 +       return result;
100778 +}
100779 +
100780 +/* zload with readahead: if @info is not NULL, formatted_readahead() is called first; then the data are loaded by jload(), whose result is returned */
100781 +reiser4_internal int
100782 +zload_ra(znode * node /* znode to load */, ra_info_t *info)
100783 +{
100784 +       int result;
100785 +
100786 +       assert("nikita-484", node != NULL);
100787 +       assert("nikita-1377", znode_invariant(node));
100788 +       assert("jmacd-7771", !znode_above_root(node));
100789 +       assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
100790 +       assert("nikita-3016", schedulable());
100791 +
100792 +       if (info)
100793 +               formatted_readahead(node, info);
100794 +
100795 +       result = jload(ZJNODE(node));
100796 +       ON_DEBUG_MODIFY(znode_pre_write(node));
100797 +       assert("nikita-1378", znode_invariant(node));
100798 +       return result;
100799 +}
100800 +
100801 +/* load content of node into memory: zload_ra() without readahead info; returns its result */
100802 +reiser4_internal int zload(znode * node)
100803 +{
100804 +       return zload_ra(node, 0);
100805 +}
100806 +
100807 +/* call node plugin to initialise newly allocated node. Delegates to jinit_new() on the embedded jnode, passing @gfp_flags through, and returns its result */
100808 +reiser4_internal int
100809 +zinit_new(znode * node /* znode to initialise */, int gfp_flags )
100810 +{
100811 +       return jinit_new(ZJNODE(node), gfp_flags);
100812 +}
100813 +
100814 +/* drop reference to node data. When last reference is dropped, data are
100815 +   unloaded. Checks the znode invariant, then delegates to jrelse() on the embedded jnode. */
100816 +reiser4_internal void
100817 +zrelse(znode * node /* znode to release references to */ )
100818 +{
100819 +       assert("nikita-1381", znode_invariant(node));
100820 +
100821 +       jrelse(ZJNODE(node));
100822 +}
100823 +
100824 +/* returns free space in node, as reported by the ->free_space() method of the node's plugin */
100825 +reiser4_internal unsigned
100826 +znode_free_space(znode * node /* znode to query */ )
100827 +{
100828 +       assert("nikita-852", node != NULL);
100829 +       return node_plugin_by_node(node)->free_space(node);
100830 +}
100831 +
100832 +/* right delimiting key of znode: returns pointer to ->rd_key; dk lock must be held (asserted) */
100833 +reiser4_internal reiser4_key *
100834 +znode_get_rd_key(znode * node /* znode to query */ )
100835 +{
100836 +       assert("nikita-958", node != NULL);
100837 +       assert("nikita-1661", rw_dk_is_locked(znode_get_tree(node)));
100838 +       assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
100839 +
100840 +       return &node->rd_key;
100841 +}
100842 +
100843 +/* left delimiting key of znode: returns pointer to ->ld_key; dk lock must be held (asserted) */
100844 +reiser4_internal reiser4_key *
100845 +znode_get_ld_key(znode * node /* znode to query */ )
100846 +{
100847 +       assert("nikita-974", node != NULL);
100848 +       assert("nikita-1662", rw_dk_is_locked(znode_get_tree(node)));
100849 +       assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
100850 +
100851 +       return &node->ld_key;
100852 +}
100853 +
100854 +/* update right-delimiting key of @node: copies *@key into ->rd_key and returns pointer to it; dk lock must be write-locked (asserted) */
100855 +reiser4_internal reiser4_key *
100856 +znode_set_rd_key(znode * node, const reiser4_key * key)
100857 +{
100858 +       assert("nikita-2937", node != NULL);
100859 +       assert("nikita-2939", key != NULL);
100860 +       assert("nikita-2938", rw_dk_is_write_locked(znode_get_tree(node)));
100861 +       assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
100862 +       assert("nikita-2944",
100863 +              znode_is_any_locked(node) ||
100864 +              znode_get_level(node) != LEAF_LEVEL ||
100865 +              keyge(key, znode_get_rd_key(node)) ||
100866 +              keyeq(znode_get_rd_key(node), min_key()));
100867 +       /* asserted above: an unlocked leaf may only have its right key grown, or set while it still equals min_key() */
100868 +       node->rd_key = *key;
100869 +       return &node->rd_key;
100870 +}
100871 +
100872 +/* update left-delimiting key of @node: copies *@key into ->ld_key and returns pointer to it; dk lock must be write-locked (asserted) */
100873 +reiser4_internal reiser4_key *
100874 +znode_set_ld_key(znode * node, const reiser4_key * key)
100875 +{
100876 +       assert("nikita-2940", node != NULL);
100877 +       assert("nikita-2941", key != NULL);
100878 +       assert("nikita-2942", rw_dk_is_write_locked(znode_get_tree(node)));
100879 +       assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
100880 +       assert("nikita-2943",
100881 +              znode_is_any_locked(node) ||
100882 +              keyeq(znode_get_ld_key(node), min_key()));
100883 +       /* asserted above: an unlocked znode may only get its left key set while that key still equals min_key() */
100884 +       node->ld_key = *key;
100885 +       return &node->ld_key;
100886 +}
100887 +
100888 +/* true if @key is inside key range for @node, both ends inclusive; the delimiting key getters used here assert that the dk lock is held */
100889 +reiser4_internal int
100890 +znode_contains_key(znode * node /* znode to look in */ ,
100891 +                  const reiser4_key * key /* key to look for */ )
100892 +{
100893 +       assert("nikita-1237", node != NULL);
100894 +       assert("nikita-1238", key != NULL);
100895 +
100896 +       /* left_delimiting_key <= key <= right_delimiting_key */
100897 +       return keyle(znode_get_ld_key(node), key) && keyle(key, znode_get_rd_key(node));
100898 +}
100899 +
100900 +/* same as znode_contains_key(), but takes the dk lock of @node's tree for reading (UNDER_RW) around the check */
100901 +reiser4_internal int
100902 +znode_contains_key_lock(znode * node /* znode to look in */ ,
100903 +                       const reiser4_key * key /* key to look for */ )
100904 +{
100905 +       assert("umka-056", node != NULL);
100906 +       assert("umka-057", key != NULL);
100907 +
100908 +       return UNDER_RW(dk, znode_get_tree(node),
100909 +                       read, znode_contains_key(node, key));
100910 +}
100911 +
100912 +/* get parent pointer, assuming tree is not locked: returns ->in_parent.node without any lock-counter assertion */
100913 +reiser4_internal znode *
100914 +znode_parent_nolock(const znode * node /* child znode */ )
100915 +{
100916 +       assert("nikita-1444", node != NULL);
100917 +       return node->in_parent.node;
100918 +}
100919 +
100920 +/* get parent pointer of znode; asserts that the rw_locked_tree lock counter is positive */
100921 +reiser4_internal znode *
100922 +znode_parent(const znode * node /* child znode */ )
100923 +{
100924 +       assert("nikita-1226", node != NULL);
100925 +       assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
100926 +       return znode_parent_nolock(node);
100927 +}
100928 +
100929 +/* detect uber znode used to protect in-superblock tree root pointer: true iff its block number is UBER_TREE_ADDR */
100930 +reiser4_internal int
100931 +znode_above_root(const znode * node /* znode to query */ )
100932 +{
100933 +       assert("umka-059", node != NULL);
100934 +
100935 +       return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
100936 +}
100937 +
100938 +/* check that @node is root---that its block number is recorded in the tree as
100939 +   that of root node */
100940 +reiser4_internal int
100941 +znode_is_true_root(const znode * node /* znode to query */ )
100942 +{
100943 +       assert("umka-060", node != NULL);
100944 +       assert("umka-061", current_tree != NULL); /* NOTE(review): the comparison below uses znode_get_tree(node), not current_tree */
100945 +
100946 +       return disk_addr_eq(znode_get_block(node), &znode_get_tree(node)->root_block);
100947 +}
100948 +
100949 +/* check that @node is root: true iff its level equals the height of its tree */
100950 +reiser4_internal int
100951 +znode_is_root(const znode * node /* znode to query */ )
100952 +{
100953 +       assert("nikita-1206", node != NULL);
100954 +
100955 +       return znode_get_level(node) == znode_get_tree(node)->height;
100956 +}
100957 +
100958 +/* Returns true if @node was just created by zget() and wasn't ever loaded
100959 +   into memory. Implemented as "znode_page(node) is NULL". */
100960 +/* NIKITA-HANS: yes */
100961 +reiser4_internal int
100962 +znode_just_created(const znode * node)
100963 +{
100964 +       assert("nikita-2188", node != NULL);
100965 +       return (znode_page(node) == NULL);
100966 +}
100967 +
100968 +/* obtain updated ->znode_epoch (tree->znode_epoch is pre-incremented inside UNDER_SPIN(epoch, ...)). See seal.c for description. */
100969 +reiser4_internal __u64
100970 +znode_build_version(reiser4_tree * tree)
100971 +{
100972 +       return UNDER_SPIN(epoch, tree, ++tree->znode_epoch);
100973 +}
100974 +
100975 +/*
100976 + * relocate znode to the new block number @blk. Caller keeps @node and @parent
100977 + * long-term locked, and loaded. Returns -EIO if find_child_ptr() does not find @node in @parent.
100978 + */
100979 +static int
100980 +relocate_locked(znode * node, znode * parent, reiser4_block_nr * blk)
100981 +{
100982 +       coord_t  inparent;
100983 +       int      result;
100984 +
100985 +       assert("nikita-3127", node != NULL);
100986 +       assert("nikita-3128", parent != NULL);
100987 +       assert("nikita-3129", blk != NULL);
100988 +       assert("nikita-3130", znode_is_any_locked(node));
100989 +       assert("nikita-3131", znode_is_any_locked(parent));
100990 +       assert("nikita-3132", znode_is_loaded(node));
100991 +       assert("nikita-3133", znode_is_loaded(parent));
100992 +
100993 +       result = find_child_ptr(parent, node, &inparent);
100994 +       if (result == NS_FOUND) {
100995 +               int grabbed;
100996 +
100997 +               grabbed = get_current_context()->grabbed_blocks; /* mark handed to grabbed2free_mark() below */
100998 +               /* for a node and its parent */
100999 +               result = reiser4_grab_space_force((__u64)2, BA_RESERVED);
101000 +               if (result == 0) {
101001 +                       item_plugin *iplug;
101002 +
101003 +                       iplug = item_plugin_by_coord(&inparent);
101004 +                       assert("nikita-3126", iplug->f.update != NULL);
101005 +                       iplug->f.update(&inparent, blk); /* item at @inparent is updated with @blk, so the parent is dirtied next */
101006 +                       znode_make_dirty(inparent.node);
101007 +                       result = znode_rehash(node, blk); /* on this path the function's result is that of znode_rehash() */
101008 +               }
101009 +               grabbed2free_mark(grabbed); /* called whether or not the grab above succeeded */
101010 +       } else
101011 +               result = RETERR(-EIO);
101012 +       return result;
101013 +}
101014 +
101015 +/*
101016 + * relocate znode to the new block number @blk. Used for speculative
101017 + * relocation of bad blocks. Returns 0, or the first error met while locking, loading or in relocate_locked().
101018 + */
101019 +reiser4_internal int
101020 +znode_relocate(znode * node, reiser4_block_nr * blk)
101021 +{
101022 +       lock_handle lh;
101023 +       int         result;
101024 +
101025 +       assert("nikita-3120", node != NULL);
101026 +       assert("nikita-3121", atomic_read(&ZJNODE(node)->x_count) > 0);
101027 +       assert("nikita-3122", blk != NULL);
101028 +       assert("nikita-3123", lock_stack_isclean(get_current_lock_stack()));
101029 +       assert("nikita-3124", schedulable());
101030 +       assert("nikita-3125", !znode_is_root(node));
101031 +
101032 +       init_lh(&lh);
101033 +       result = longterm_lock_znode(&lh, node,
101034 +                                    ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI);
101035 +       if (result == 0) {
101036 +               lock_handle parent;
101037 +
101038 +               result = reiser4_get_parent(&parent, node, ZNODE_READ_LOCK, 1); /* NOTE(review): read lock, yet relocate_locked() updates and dirties the parent -- confirm */
101039 +               if (result == 0) {
101040 +                       result = zload(node);
101041 +                       if (result == 0) {
101042 +                               result = zload(parent.node);
101043 +                               if (result == 0) {
101044 +                                       result = relocate_locked(node,
101045 +                                                                parent.node,
101046 +                                                                blk);
101047 +                                       zrelse(parent.node);
101048 +                               }
101049 +                               zrelse(node); /* each successful zload() above is paired with a zrelse() */
101050 +                       }
101051 +                       done_lh(&parent);
101052 +               }
101053 +               done_lh(&lh);
101054 +       }
101055 +       return result;
101056 +}
101057 +
101058 +reiser4_internal void
101059 +init_load_count(load_count * dh)
101060 +{
101061 +       assert("nikita-2105", dh != NULL);
101062 +       xmemset(dh, 0, sizeof *dh); /* whole handle is zero-filled; done_load_count() treats ->node == NULL as "nothing held" */
101063 +}
101064 +
101065 +reiser4_internal void
101066 +done_load_count(load_count * dh)
101067 +{
101068 +       assert("nikita-2106", dh != NULL);
101069 +       if (dh->node != NULL) {
101070 +               for (; dh->d_ref > 0; --dh->d_ref) /* one zrelse() per reference counted in ->d_ref */
101071 +                       zrelse(dh->node);
101072 +               dh->node = NULL; /* handle no longer refers to any node */
101073 +       }
101074 +}
101075 +
101076 +reiser4_internal int
101077 +incr_load_count_znode(load_count * dh, znode * node)
101078 +{
101079 +       assert("nikita-2107", dh != NULL);
101080 +       assert("nikita-2158", node != NULL);
101081 +       assert("nikita-2109", ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
101082 +       /* the handle is pointed at @node, then one load reference is taken; result is that of incr_load_count() */
101083 +       dh->node = node;
101084 +       return incr_load_count(dh);
101085 +}
101086 +
101087 +reiser4_internal int
101088 +incr_load_count(load_count * dh)
101089 +{
101090 +       int result;
101091 +
101092 +       assert("nikita-2110", dh != NULL);
101093 +       assert("nikita-2111", dh->node != NULL);
101094 +
101095 +       result = zload(dh->node);
101096 +       if (result == 0)
101097 +               ++dh->d_ref; /* only successful zload() calls are counted */
101098 +       return result; /* zload() result is passed through unchanged */
101099 +}
101100 +
101101 +reiser4_internal int
101102 +incr_load_count_jnode(load_count * dh, jnode * node)
101103 +{
101104 +       if (jnode_is_znode(node)) {
101105 +               return incr_load_count_znode(dh, JZNODE(node));
101106 +       }
101107 +       return 0; /* a jnode that is not a znode leaves @dh untouched and is reported as success */
101108 +}
101109 +
101110 +reiser4_internal void
101111 +copy_load_count(load_count * new, load_count * old)
101112 +{
101113 +       int ret = 0;
101114 +       done_load_count(new); /* drop whatever @new held before */
101115 +       new->node = old->node;
101116 +       new->d_ref = 0;
101117 +       /* take load references on new->node until @new holds as many as @old, or a load fails */
101118 +       while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
101119 +       }
101120 +
101121 +       assert("jmacd-87589", ret == 0); /* NOTE(review): a failed load is only caught here; the function returns void */
101122 +}
101123 +
101124 +reiser4_internal void
101125 +move_load_count(load_count * new, load_count * old)
101126 +{
101127 +       done_load_count(new); /* drop whatever @new held before */
101128 +       new->node = old->node; /* node and reference count are handed over as they are; no zload()/zrelse() here */
101129 +       new->d_ref = old->d_ref;
101130 +       old->node = NULL; /* @old is left empty */
101131 +       old->d_ref = 0;
101132 +}
101133 +
101134 +/* convert parent pointer into coord: @coord is set to item pcoord->item_pos of pcoord->node, with ->between = AT_UNIT */
101135 +reiser4_internal void
101136 +parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
101137 +{
101138 +       assert("nikita-3204", pcoord != NULL);
101139 +       assert("nikita-3205", coord != NULL);
101140 +
101141 +       coord_init_first_unit_nocheck(coord, pcoord->node);
101142 +       coord_set_item_pos(coord, pcoord->item_pos);
101143 +       coord->between = AT_UNIT;
101144 +}
101145 +
101146 +/* pack coord into parent_coord_t: only ->node and ->item_pos of @coord are copied */
101147 +reiser4_internal void
101148 +coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
101149 +{
101150 +       assert("nikita-3206", pcoord != NULL);
101151 +       assert("nikita-3207", coord != NULL);
101152 +
101153 +       pcoord->node = coord->node;
101154 +       pcoord->item_pos = coord->item_pos;
101155 +}
101156 +
101157 +/* Initialize a parent hint pointer. (parent hint pointer is a field in znode,
101158 +   look for comments there) */
101159 +reiser4_internal void
101160 +init_parent_coord(parent_coord_t * pcoord, const znode * node)
101161 +{
101162 +       pcoord->node = (znode *) node; /* the cast drops the const qualifier */
101163 +       pcoord->item_pos = (unsigned short)~0; /* all-ones value; presumably means "position not known" -- confirm */
101164 +}
101165 +
101166 +
101167 +#if REISER4_DEBUG_NODE_INVARIANT
101168 +int jnode_invariant_f(const jnode * node, char const **msg); /* evaluated first in the chain below */
101169 +
101170 +/* debugging aid: znode invariant. Each _ergo/_equi/_check stores its own text in *msg before it is evaluated. */
101171 +static int
101172 +znode_invariant_f(const znode * node /* znode to check */ ,
101173 +                 char const **msg      /* where to store error
101174 +                                        * message, if any */ )
101175 +{
101176 +#define _ergo(ant, con)                                                \
101177 +       ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
101178 +
101179 +#define _equi(e1, e2)                                          \
101180 +       ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
101181 +
101182 +#define _check(exp) ((*msg) = #exp, (exp))
101183 +
101184 +       return
101185 +               jnode_invariant_f(ZJNODE(node), msg) &&
101186 +
101187 +               /* [znode-fake] invariant */
101188 +
101189 +               /* fake znode doesn't have a parent, and */
101190 +               _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
101191 +               /* there is another way to express this very check, and */
101192 +               _ergo(znode_above_root(node),
101193 +                     znode_parent(node) == NULL) &&
101194 +               /* it has special block number, and */
101195 +               _ergo(znode_get_level(node) == 0,
101196 +                     disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
101197 +               /* it is the only znode with such block number, and */
101198 +               _ergo(!znode_above_root(node) && znode_is_loaded(node),
101199 +                     !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
101200 +               /* it is parent of the tree root node */
101201 +               _ergo(znode_is_true_root(node), znode_above_root(znode_parent(node))) &&
101202 +
101203 +               /* [znode-level] invariant */
101204 +
101205 +               /* level of parent znode is one larger than that of child,
101206 +                  except for the fake znode, and */
101207 +               _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
101208 +                     znode_get_level(znode_parent(node)) ==
101209 +                     znode_get_level(node) + 1) &&
101210 +               /* left neighbor is at the same level, and */
101211 +               _ergo(znode_is_left_connected(node) && node->left != NULL,
101212 +                     znode_get_level(node) == znode_get_level(node->left)) &&
101213 +               /* right neighbor is at the same level */
101214 +               _ergo(znode_is_right_connected(node) && node->right != NULL,
101215 +                     znode_get_level(node) == znode_get_level(node->right)) &&
101216 +
101217 +               /* [znode-connected] invariant */
101218 +
101219 +               _ergo(node->left != NULL, znode_is_left_connected(node)) &&
101220 +               _ergo(node->right != NULL, znode_is_right_connected(node)) &&
101221 +               _ergo(!znode_is_root(node) && node->left != NULL,
101222 +                     znode_is_right_connected(node->left) &&
101223 +                     node->left->right == node) &&
101224 +               _ergo(!znode_is_root(node) && node->right != NULL,
101225 +                     znode_is_left_connected(node->right) &&
101226 +                     node->right->left == node) &&
101227 +
101228 +               /* [znode-c_count] invariant */
101229 +
101230 +               /* for any znode, c_count of its parent is greater than 0 */
101231 +               _ergo(znode_parent(node) != NULL &&
101232 +                     !znode_above_root(znode_parent(node)),
101233 +                     znode_parent(node)->c_count > 0) &&
101234 +               /* leaves don't have children */
101235 +               _ergo(znode_get_level(node) == LEAF_LEVEL,
101236 +                     node->c_count == 0) &&
101237 +
101238 +               _check(node->zjnode.jnodes.prev != NULL) &&
101239 +               _check(node->zjnode.jnodes.next != NULL) &&
101240 +               /* orphan doesn't have a parent */
101241 +               _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) &&
101242 +
101243 +               /* [znode-modify] invariant */
101244 +
101245 +               /* if znode is not write-locked, its checksum remains
101246 +                * invariant */
101247 +               /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
101248 +                * cannot check this. */
101249 +               /*
101250 +               UNDER_RW(zlock, (zlock *)&node->lock,
101251 +                        read, _ergo(!znode_is_wlocked(node),
101252 +                                    znode_at_read(node))) &&
101253 +               */
101254 +               /* [znode-refs] invariant */
101255 +
101256 +               /* only referenced znode can be long-term locked */
101257 +               _ergo(znode_is_locked(node),
101258 +                     atomic_read(&ZJNODE(node)->x_count) != 0);
101259 +}
101260 +
101261 +/* debugging aid: check znode invariant under the znode spin lock and tree read lock; issue a warning if it fails */
101262 +int
101263 +znode_invariant(const znode * node /* znode to check */ )
101264 +{
101265 +       char const *failed_msg;
101266 +       int result;
101267 +
101268 +       assert("umka-063", node != NULL);
101269 +       assert("umka-064", current_tree != NULL);
101270 +
101271 +       spin_lock_znode((znode *) node);
101272 +       RLOCK_TREE(znode_get_tree(node));
101273 +       result = znode_invariant_f(node, &failed_msg);
101274 +       if (!result) {
101275 +               /* print_znode("corrupted node", node); */
101276 +               warning("jmacd-555", "Condition %s failed", failed_msg);
101277 +       }
101278 +       RUNLOCK_TREE(znode_get_tree(node));
101279 +       spin_unlock_znode((znode *) node);
101280 +       return result; /* result of znode_invariant_f(): zero when a check failed */
101281 +}
101282 +/* REISER4_DEBUG_NODE_INVARIANT */
101283 +#endif
101284 +
101285 +/*
101286 + * Node dirtying debug.
101287 + *
101288 + * Whenever formatted node is modified, it should be marked dirty (through
101289 + * call to znode_make_dirty()) before exclusive long term lock (necessary to
101290 + * modify node) is released. This is critical for correct operation of seal.c
101291 + * code.
101292 + *
101293 + * As this is an error easy to make, special debugging mode was implemented to
101294 + * catch it.
101295 + *
101296 + * In this mode new field ->cksum is added to znode. This field contains
101297 + * checksum (adler32-like) of znode content calculated when znode is loaded into
101298 + * memory and re-calculated whenever znode_make_dirty() is called on it.
101299 + *
101300 + * Whenever long term lock on znode is released, and znode wasn't marked
101301 + * dirty, checksum of its content is calculated and compared with value stored
101302 + * in ->cksum. If they differ, call to znode_make_dirty() is missing.
101303 + *
101304 + * This debugging mode (tunable through fs/Kconfig) is very CPU consuming and
101305 + * hence, unsuitable for normal operation.
101306 + *
101307 + */
101308 +
101309 +#if REISER4_DEBUG_MODIFY
101310 +__u32 znode_checksum(const znode * node) /* checksum over znode_size(node) bytes at page_address(znode_page(node)) */
101311 +{
101312 +       int i, size = znode_size(node);
101313 +       __u32 l = 0;
101314 +       __u32 h = 0;
101315 +       const char *data = page_address(znode_page(node)); /* plain char: whether bytes are added as signed is implementation-defined */
101316 +
101317 +       /* Checksum is similar to adler32... */
101318 +       for (i = 0; i < size; i += 1) {
101319 +               l += data[i]; /* running sum of bytes */
101320 +               h += l; /* running sum of the running sums */
101321 +       }
101322 +
101323 +       return (h << 16) | (l & 0xffff); /* low 16 bits come from l, bits above from h */
101324 +}
101325 +
101326 +static inline int znode_has_data(const znode *z) /* true iff @z has a page and zdata(z) equals that page's address */
101327 +{
101328 +       return znode_page(z) != NULL && page_address(znode_page(z)) == zdata(z);
101329 +}
101330 +
101331 +void znode_set_checksum(jnode * node, int locked_p) /* (re)compute ->cksum; does nothing for a jnode that is not a znode */
101332 +{
101333 +       if (jnode_is_znode(node)) {
101334 +               znode *z;
101335 +
101336 +               z = JZNODE(node);
101337 +
101338 +               if (!locked_p) /* non-zero @locked_p skips LOCK_JNODE/UNLOCK_JNODE; presumably the caller already holds the lock */
101339 +                       LOCK_JNODE(node);
101340 +               if (znode_has_data(z))
101341 +                       z->cksum = znode_checksum(z);
101342 +               else
101343 +                       z->cksum = 0; /* the checks in this file ignore a recorded checksum of 0 */
101344 +               if (!locked_p)
101345 +                       UNLOCK_JNODE(node);
101346 +       }
101347 +}
101348 +
101349 +void
101350 +znode_pre_write(znode * node)
101351 +{
101352 +       assert("umka-066", node != NULL);
101353 +       /* under the znode spin lock: record a checksum if none is recorded yet (->cksum == 0) and the node is not dirty */
101354 +       spin_lock_znode(node);
101355 +       if (znode_has_data(node)) {
101356 +               if (node->cksum == 0 && !znode_is_dirty(node))
101357 +                       node->cksum = znode_checksum(node);
101358 +       }
101359 +       spin_unlock_znode(node);
101360 +}
101361 +
101362 +void
101363 +znode_post_write(znode * node)
101364 +{
101365 +       __u32 cksum;
101366 +
101367 +       assert("umka-067", node != NULL);
101368 +       /* panic if the node has data and a non-zero recorded checksum that no longer matches its content */
101369 +       if (znode_has_data(node)) {
101370 +               cksum = znode_checksum(node);
101371 +
101372 +               if (cksum != node->cksum && node->cksum != 0) /* a recorded checksum of 0 is never treated as a mismatch */
101373 +                       reiser4_panic("jmacd-1081",
101374 +                                     "changed znode is not dirty: %llu",
101375 +                                     (unsigned long long) node->zjnode.blocknr); /* cast so the argument always matches %llu */
101376 +       }
101377 +}
101378 +
101379 +int
101380 +znode_at_read(const znode * node)
101381 +{
101382 +       __u32 cksum;
101383 +
101384 +       assert("umka-067", node != NULL); /* NOTE(review): same label as the assert in znode_post_write() -- confirm labels need not be unique */
101385 +       /* returns 1 unless the node has data and a non-zero recorded checksum that no longer matches its content */
101386 +       if (znode_has_data(node)) {
101387 +               cksum = znode_checksum((znode *)node);
101388 +
101389 +               if (cksum != node->cksum && node->cksum != 0) {
101390 +                       reiser4_panic("nikita-3561",
101391 +                                     "znode is changed: %llu",
101392 +                                     (unsigned long long) node->zjnode.blocknr); /* cast so the argument always matches %llu */
101393 +                       return 0; /* only reached if reiser4_panic() returns */
101394 +               }
101395 +       }
101396 +       return 1;
101397 +}
101398 +#endif
101399 +
101400 +#if REISER4_DEBUG_OUTPUT
101401 +
101402 +/* debugging aid: output more human readable information about @node than
101403 +   info_znode(): also parent, left and right neighbors and both delimiting keys. */
101404 +reiser4_internal void
101405 +print_znode(const char *prefix /* prefix to print */ ,
101406 +           const znode * node /* node to print */ )
101407 +{
101408 +       if (node == NULL) {
101409 +               printk("%s: null\n", prefix);
101410 +               return;
101411 +       }
101412 +
101413 +       info_znode(prefix, node);
101414 +       if (!jnode_is_znode(ZJNODE(node)))
101415 +               return;
101416 +       info_znode("\tparent", znode_parent_nolock(node)); /* _nolock variant: carries no tree-lock assertion */
101417 +       info_znode("\tleft", node->left);
101418 +       info_znode("\tright", node->right);
101419 +       print_key("\tld", &node->ld_key);
101420 +       print_key("\trd", &node->rd_key);
101421 +       printk("\n");
101422 +}
101423 +
101424 +/* debugging aid: output human readable information about @node; prints nothing when @node is NULL */
101425 +reiser4_internal void
101426 +info_znode(const char *prefix /* prefix to print */ ,
101427 +          const znode * node /* node to print */ )
101428 +{
101429 +       if (node == NULL) {
101430 +               return;
101431 +       }
101432 +       info_jnode(prefix, ZJNODE(node));
101433 +       if (!jnode_is_znode(ZJNODE(node)))
101434 +               return;
101435 +       /* znode-specific part: child count, number of readers of the long term lock, item count */
101436 +       printk("c_count: %i, readers: %i, items: %i\n",
101437 +              node->c_count, node->lock.nr_readers, node->nr_items);
101438 +}
101439 +
101440 +/* print all znodes in @tree (both ->zhash_table and ->zfake_table); NULL @tree means current_tree */
101441 +reiser4_internal void
101442 +print_znodes(const char *prefix, reiser4_tree * tree)
101443 +{
101444 +       znode *node;
101445 +       znode *next;
101446 +       z_hash_table *htable;
101447 +       int tree_lock_taken;
101448 +
101449 +       if (tree == NULL)
101450 +               tree = current_tree;
101451 +
101452 +       /* this is debugging function. It can be called by reiser4_panic()
101453 +          with tree spin-lock already held. Trylock is not exactly what we
101454 +          want here, but it is passable.
101455 +       */
101456 +       tree_lock_taken = write_trylock_tree(tree);
101457 +
101458 +       htable = &tree->zhash_table;
101459 +       for_all_in_htable(htable, z, node, next) {
101460 +               info_znode(prefix, node);
101461 +       }
101462 +
101463 +       htable = &tree->zfake_table;
101464 +       for_all_in_htable(htable, z, node, next) {
101465 +               info_znode(prefix, node);
101466 +       }
101467 +
101468 +       if (tree_lock_taken) /* unlock only when the trylock above reported success */
101469 +               WUNLOCK_TREE(tree);
101470 +}
101471 +#endif
101472 +
101473 +#if defined(REISER4_DEBUG) || defined(REISER4_DEBUG_MODIFY) || defined(REISER4_DEBUG_OUTPUT)
101474 +/* NOTE(review): other guards in this file test the macro value (#if REISER4_DEBUG); defined() also holds for 0 -- confirm */
101475 +/* return non-0 iff data are loaded into znode; forwards to jnode_is_loaded() on the embedded jnode */
101476 +reiser4_internal int
101477 +znode_is_loaded(const znode * node /* znode to query */ )
101478 +{
101479 +       assert("nikita-497", node != NULL);
101480 +       return jnode_is_loaded(ZJNODE(node));
101481 +}
101482 +
101483 +#endif
101484 +
101485 +#if REISER4_DEBUG
101486 +reiser4_internal unsigned long
101487 +znode_times_locked(const znode *z)
101488 +{
101489 +       return z->times_locked; /* field exists in struct znode only under REISER4_DEBUG */
101490 +}
101491 +#endif
101492 +
101493 +/* Make Linus happy.
101494 +   Local variables:
101495 +   c-indentation-style: "K&R"
101496 +   mode-name: "LC"
101497 +   c-basic-offset: 8
101498 +   tab-width: 8
101499 +   fill-column: 120
101500 +   End:
101501 +*/
101502 diff -rupN linux-2.6.8-rc3/fs/reiser4/znode.h linux-2.6.8-rc3-a/fs/reiser4/znode.h
101503 --- linux-2.6.8-rc3/fs/reiser4/znode.h  1970-01-01 03:00:00.000000000 +0300
101504 +++ linux-2.6.8-rc3-a/fs/reiser4/znode.h        2004-08-05 21:20:53.076661904 +0400
101505 @@ -0,0 +1,473 @@
101506 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
101507 + * reiser4/README */
101508 +
101509 +/* Declaration of znode (Zam's node). See znode.c for more details. */
101510 +
101511 +#ifndef __ZNODE_H__
101512 +#define __ZNODE_H__
101513 +
101514 +#include "forward.h"
101515 +#include "debug.h"
101516 +#include "dformat.h"
101517 +#include "spin_macros.h"
101518 +#include "key.h"
101519 +#include "coord.h"
101520 +#include "type_safe_list.h"
101521 +#include "plugin/node/node.h"
101522 +#include "jnode.h"
101523 +#include "lock.h"
101524 +#include "readahead.h"
101525 +
101526 +#include <linux/types.h>
101527 +#include <linux/spinlock.h>
101528 +#include <linux/pagemap.h>     /* for PAGE_CACHE_SIZE */
101529 +#include <asm/atomic.h>
101530 +#include <asm/semaphore.h>
101531 +
101532 +/* znode tracks its position within parent (internal item in a parent node,
101533 + * that contains znode's block number). */
101534 +typedef struct parent_coord {
101535 +       znode       *node; /* the parent znode */
101536 +       pos_in_node_t  item_pos; /* position of that item within ->node; init_parent_coord() sets it to all-ones */
101537 +} parent_coord_t;
101538 +
101539 +/* &znode - node in a reiser4 tree.
101540 +
101541 +   NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
101542 +   cacheline pressure.
101543 +
101544 +   Locking:
101545 +
101546 +   Long term: data in a disk node attached to this znode are protected
101547 +   by long term, deadlock aware lock ->lock;
101548 +
101549 +   Spin lock: the following fields are protected by the spin lock:
101550 +
101551 +    ->lock
101552 +
101553 +   Following fields are protected by the global tree lock:
101554 +
101555 +    ->left
101556 +    ->right
101557 +    ->in_parent
101558 +    ->c_count
101559 +
101560 +   Following fields are protected by the global delimiting key lock (dk_lock):
101561 +
101562 +    ->ld_key (to update ->ld_key long-term lock on the node is also required)
101563 +    ->rd_key
101564 +
101565 +   Following fields are protected by the long term lock:
101566 +
101567 +    ->nr_items
101568 +
101569 +   ->node_plugin is never changed once set. This means that after code made
101570 +   itself sure that field is valid it can be accessed without any additional
101571 +   locking.
101572 +
101573 +   ->level is immutable.
101574 +
101575 +   Invariants involving this data-type:
101576 +
101577 +      [znode-fake]
101578 +      [znode-level]
101579 +      [znode-connected]
101580 +      [znode-c_count]
101581 +      [znode-refs]
101582 +      [jnode-refs]
101583 +      [jnode-queued]
101584 +      [znode-modify]
101585 +
101586 +    For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks.
101587 +    Suggestions for how to do that are desired.*/
101588 +struct znode {
101589 +       /* Embedded jnode. */
101590 +       jnode zjnode;
101591 +
101592 +       /* contains two subfields, node and item_pos (see parent_coord_t).
101593 +
101594 +          item_pos is only a hint that is cached to
101595 +          speed up lookups during balancing. It is not required to be up to
101596 +          date. Synched in find_child_ptr().
101597 +
101598 +          This value allows us to avoid expensive binary searches.
101599 +
101600 +          in_parent->node points to the parent of this node, and is NOT a
101601 +          hint.
101602 +       */
101603 +       parent_coord_t in_parent;
101604 +
101605 +       /*
101606 +        * sibling list pointers
101607 +        */
101608 +
101609 +       /* left-neighbor */
101610 +       znode *left;
101611 +       /* right-neighbor */
101612 +       znode *right;
101613 +       /* long term lock on node content. This lock supports deadlock
101614 +          detection. See lock.c
101615 +       */
101616 +       zlock lock;
101617 +
101618 +       /* You cannot remove from memory a node that has children in
101619 +          memory. This is because we rely on the fact that parent of given
101620 +          node can always be reached without blocking for io. When reading a
101621 +          node into memory you must increase the c_count of its parent, when
101622 +          removing it from memory you must decrease the c_count.  This makes
101623 +          the code simpler, and the cases where it is suboptimal are truly
101624 +          obscure.
101625 +       */
101626 +       int c_count;
101627 +
101628 +       /* plugin of node attached to this znode. NULL if znode is not
101629 +          loaded. */
101630 +       node_plugin *nplug;
101631 +
101632 +       /* version of znode data. This is increased on each modification. This
101633 +        * is necessary to implement seals (see seal.[ch]) efficiently. */
101634 +       __u64 version;
101635 +
101636 +       /* left delimiting key. Necessary to efficiently perform
101637 +          balancing with node-level locking. Kept in memory only. */
101638 +       reiser4_key ld_key;
101639 +       /* right delimiting key. */
101640 +       reiser4_key rd_key;
101641 +
101642 +       /* znode's tree level */
101643 +       __u16 level;
101644 +       /* number of items in this node. This field is modified by node
101645 +        * plugin. */
101646 +       __u16 nr_items;
101647 +#if REISER4_DEBUG_MODIFY
101648 +       /* In debugging mode, used to detect loss of znode_set_dirty()
101649 +          notification. */
101650 +       spinlock_t cksum_lock;
101651 +       __u32 cksum;
101652 +#endif
101653 +
101654 +#if REISER4_DEBUG
101655 +       void *creator;
101656 +       reiser4_key first_key;
101657 +       unsigned long times_locked; /* debug-only counter, read through znode_times_locked() */
101658 +#endif
101659 +#if REISER4_STATS
101660 +       int last_lookup_pos;
101661 +#endif
101662 +} __attribute__((aligned(16)));
101663 +
101664 +/* In general I think these macros should not be exposed. Each applies a lock predicate to (node)->lock. */
101665 +#define znode_is_locked(node)          (lock_is_locked(&(node)->lock))
101666 +#define znode_is_rlocked(node)         (lock_is_rlocked(&(node)->lock))
101667 +#define znode_is_wlocked(node)         (lock_is_wlocked(&(node)->lock))
101668 +#define znode_is_wlocked_once(node)    (lock_is_wlocked_once(&(node)->lock))
101669 +#define znode_can_be_rlocked(node)     (lock_can_be_rlocked(&(node)->lock))
101670 +#define is_lock_compatible(node, mode) (lock_mode_compatible(&(node)->lock, (mode)))
101671 +
101672 +/* Macros for accessing the znode state: each forwards to the matching JF_* macro on ZJNODE(p). */
101673 +#define        ZF_CLR(p,f)             JF_CLR  (ZJNODE(p), (f))
101674 +#define        ZF_ISSET(p,f)           JF_ISSET(ZJNODE(p), (f))
101675 +#define        ZF_SET(p,f)             JF_SET  (ZJNODE(p), (f))
101676 +
101677 +extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block,
101678 +                  znode * parent, tree_level level, int gfp_flag);
101679 +extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
101680 +extern int zload(znode * node); /* callers in znode.c treat 0 as success and pair each success with zrelse() */
101681 +extern int zload_ra(znode * node, ra_info_t *info);
101682 +extern int zinit_new(znode * node, int gfp_flags);
101683 +extern void zrelse(znode * node); /* counterpart of a successful zload() */
101684 +extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
101685 +
101686 +/* size of data in znode: always PAGE_CACHE_SIZE; @node is only checked against NULL */
101687 +static inline unsigned
101688 +znode_size(const znode * node UNUSED_ARG /* znode to query */ )
101689 +{
101690 +       assert("nikita-1416", node != NULL);
101691 +       return PAGE_CACHE_SIZE;
101692 +}
101693 +
101694 +extern void parent_coord_to_coord(const parent_coord_t *pcoord, coord_t *coord);
101695 +extern void coord_to_parent_coord(const coord_t *coord, parent_coord_t *pcoord);
101696 +extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
101697 +
101698 +extern unsigned znode_free_space(znode * node);
101699 +
101700 +extern reiser4_key *znode_get_rd_key(znode * node);
101701 +extern reiser4_key *znode_get_ld_key(znode * node);
101702 +/* the setters copy *key into the znode and return a pointer to the stored copy */
101703 +extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
101704 +extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
101705 +
101706 +/* `connected' state checks: each one tests a JNODE_*_CONNECTED flag through ZF_ISSET() */
101707 +static inline int
101708 +znode_is_right_connected(const znode * node)
101709 +{
101710 +       return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
101711 +}
101712 +
101713 +static inline int
101714 +znode_is_left_connected(const znode * node)
101715 +{
101716 +       return ZF_ISSET(node, JNODE_LEFT_CONNECTED); /* left-hand counterpart of znode_is_right_connected() */
101717 +}
101718 +
101719 +static inline int
101720 +znode_is_connected(const znode * node)
101721 +{
101722 +       return znode_is_right_connected(node) && znode_is_left_connected(node); /* connected on both sides */
101723 +}
101724 +
101725 +extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
101726 +extern void znode_remove(znode *, reiser4_tree *);
101727 +extern znode *znode_parent(const znode * node);
101728 +extern znode *znode_parent_nolock(const znode * node);	/* NOTE(review): presumably expects the caller to hold the needed lock -- confirm */
101729 +extern int znode_above_root(const znode * node);
101730 +extern int znode_is_true_root(const znode * node);
101731 +extern void zdrop(znode * node);
101732 +extern int znodes_init(void);
101733 +extern int znodes_done(void);
101734 +extern int znodes_tree_init(reiser4_tree * ztree);	/* takes a tree, unlike znodes_init(); presumably per-tree setup -- confirm */
101735 +extern void znodes_tree_done(reiser4_tree * ztree);
101736 +extern int znode_contains_key(znode * node, const reiser4_key * key);
101737 +extern int znode_contains_key_lock(znode * node, const reiser4_key * key);	/* presumably the self-locking variant -- confirm */
101738 +extern unsigned znode_save_free_space(znode * node);
101739 +extern unsigned znode_recover_free_space(znode * node);
101740 +
101741 +extern int znode_just_created(const znode * node);
101742 +
101743 +extern void zfree(znode * node);
101744 +
101745 +#if REISER4_DEBUG_MODIFY /* modification-tracking hooks are real functions only in this configuration */
101746 +extern void znode_pre_write(znode * node);
101747 +extern void znode_post_write(znode * node);
101748 +extern void znode_set_checksum(jnode * node, int locked_p);
101749 +extern int  znode_at_read(const znode * node);
101750 +#else /* !REISER4_DEBUG_MODIFY: hooks expand to noop, znode_at_read() is the constant 1 */
101751 +#define znode_pre_write(n) noop
101752 +#define znode_post_write(n) noop
101753 +#define znode_set_checksum(n, l) noop
101754 +#define znode_at_read(n) (1)
101755 +#endif /* REISER4_DEBUG_MODIFY */
101756 +
101757 +#if REISER4_DEBUG_OUTPUT /* debugging printers exist only in this configuration */
101758 +extern void print_znode(const char *prefix, const znode * node);
101759 +extern void info_znode(const char *prefix, const znode * node);
101760 +extern void print_znodes(const char *prefix, reiser4_tree * tree);
101761 +extern void print_lock_stack(const char *prefix, lock_stack * owner);
101762 +#else /* !REISER4_DEBUG_OUTPUT: every printer expands to noop and its arguments are not evaluated */
101763 +#define print_znode( p, n ) noop
101764 +#define info_znode( p, n ) noop
101765 +#define print_znodes( p, t ) noop
101766 +#define print_lock_stack( p, o ) noop
101767 +#endif /* REISER4_DEBUG_OUTPUT */
101768 +
101769 +/* znode-flavoured aliases for jnode operations: each one converts its argument with ZJNODE() and
101770 +   forwards to the jnode primitive, so znode-specific code does not have to spell out the conversion. */
101771 +#define znode_page(x)               jnode_page ( ZJNODE(x) )
101772 +#define zdata(x)                    jdata ( ZJNODE(x) )
101773 +#define znode_get_block(x)          jnode_get_block ( ZJNODE(x) )
101774 +#define znode_created(x)            jnode_created ( ZJNODE(x) )
101775 +#define znode_set_created(x)        jnode_set_created ( ZJNODE(x) )
101776 +#define znode_squeezable(x)         jnode_squeezable (ZJNODE(x))
101777 +#define znode_set_squeezable(x)     jnode_set_squeezable (ZJNODE(x))
101778 +
101779 +#define znode_is_dirty(x)           jnode_is_dirty    ( ZJNODE(x) )
101780 +#define znode_check_dirty(x)        jnode_check_dirty ( ZJNODE(x) )
101781 +#define znode_make_clean(x)         jnode_make_clean   ( ZJNODE(x) )
101782 +#define znode_set_block(x, b)       jnode_set_block ( ZJNODE(x), (b) )
101783 +
101784 +#define spin_lock_znode(x)          LOCK_JNODE ( ZJNODE(x) )	/* lock wrappers below all act on ZJNODE(x) */
101785 +#define spin_unlock_znode(x)        UNLOCK_JNODE ( ZJNODE(x) )
101786 +#define spin_trylock_znode(x)       spin_trylock_jnode ( ZJNODE(x) )
101787 +#define spin_znode_is_locked(x)     spin_jnode_is_locked ( ZJNODE(x) )
101788 +#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
101789 +
101790 +#if REISER4_DEBUG /* declared for debugging builds only */
101791 +extern int znode_x_count_is_protected(const znode * node);
101792 +#endif /* REISER4_DEBUG */
101793 +
101794 +#if REISER4_DEBUG_NODE_INVARIANT
101795 +extern int znode_invariant(const znode * node);
101796 +#else /* invariant checking compiled out: the check is the constant 1, so assertions on it always hold */
101797 +#define znode_invariant(n) (1)
101798 +#endif /* REISER4_DEBUG_NODE_INVARIANT */
101799 +
101800 +/* acquire reference to @node: jref() on ZJNODE(@node); the jref() result is returned converted back with JZNODE() */
101801 +static inline znode *
101802 +zref(znode * node)
101803 +{
101804 +       /* change of x_count from 0 to 1 is protected by tree spin-lock */
101805 +       return JZNODE(jref(ZJNODE(node)));
101806 +}
101807 +
101808 +/* release reference to @node: asserts znode_invariant(@node), then jput() on ZJNODE(@node) */
101809 +static inline void
101810 +zput(znode * node)
101811 +{
101812 +       assert("nikita-3564", znode_invariant(node));
101813 +       jput(ZJNODE(node));
101814 +}
101815 +
101816 +/* get the level field for a znode (plain read of node->level, no locking done here) */
101817 +static inline tree_level
101818 +znode_get_level(const znode * node)
101819 +{
101820 +       return node->level;
101821 +}
101822 +
101823 +/* tree level of @node: the znode's own level, LEAF_LEVEL for any other jnode */
101824 +static inline tree_level
101825 +jnode_get_level(const jnode * node)
101826 +{
101827 +       /* unformatted nodes are all at the LEAF_LEVEL and for "semi-formatted"
101828 +          nodes like bitmaps the level doesn't matter, so answer LEAF_LEVEL */
101829 +       if (!jnode_is_znode(node))
101830 +               return LEAF_LEVEL;
101831 +
101832 +       return znode_get_level(JZNODE(node));
101833 +}
101834 +
101835 +/* nonzero iff @node is a znode at LEAF_LEVEL or an unformatted block */
101836 +static inline int jnode_is_leaf(const jnode * node)
101837 +{
101838 +       if (!jnode_is_znode(node))
101839 +               /* not a znode: only unformatted blocks count as leaves */
101840 +               return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
101841 +
101842 +       return znode_get_level(JZNODE(node)) == LEAF_LEVEL;
101843 +}
101844 +
101845 +/* return znode's tree, i.e. jnode_get_tree() of ZJNODE(@node); @node is asserted to be non-NULL */
101846 +static inline reiser4_tree *
101847 +znode_get_tree(const znode * node)
101848 +{
101849 +       assert("nikita-2692", node != NULL);
101850 +       return jnode_get_tree(ZJNODE(node));
101851 +}
101852 +
101853 +/* resolve race with zput: NULL when jnode_rip_sync() returns NULL, otherwise its result as a znode */
101854 +static inline znode *
101855 +znode_rip_check(reiser4_tree *tree, znode * node)
101856 +{
101857 +       jnode *synced;
101858 +
101859 +       synced = jnode_rip_sync(tree, ZJNODE(node));
101860 +       if (likely(synced != NULL))
101861 +               /* the expected case: hand the jnode back as a znode */
101862 +               return JZNODE(synced);
101863 +
101864 +       return NULL;
101865 +}
101866 +
101867 +#if defined(REISER4_DEBUG) || defined(REISER4_DEBUG_MODIFY) || defined(REISER4_DEBUG_OUTPUT)
101868 +int znode_is_loaded(const znode * node /* znode to query */ );
101869 +#endif /* NOTE(review): this guard uses defined(), the other guards in this header test the macro's value -- confirm intended */
101870 +
101871 +extern z_hash_table *get_htable(reiser4_tree * tree,
101872 +                               const reiser4_block_nr * const blocknr);	/* hash table selected by tree and block number */
101873 +extern z_hash_table *znode_get_htable(const znode *node);	/* same return type, selected from a znode */
101874 +
101875 +extern __u64 znode_build_version(reiser4_tree * tree);
101876 +
101877 +extern int znode_relocate(znode * node, reiser4_block_nr * blk);
101878 +
101879 +/* Data-handles.  A data handle (load_count) manages pairing calls to zload() and zrelse().
101880 +   The data for a node must be loaded in many places.  Simply calling zload() everywhere
101881 +   would work, but the difficulty arises when the loaded data must be released again with
101882 +   zrelse(): in a function with many possible error/return paths, it requires extra work
101883 +   to figure out which exit paths must call zrelse() and which must not.  The data handle
101884 +   automatically calls zrelse() for every zload() that it is responsible for.  In that
101885 +   sense, it acts much like a lock_handle.
101886 +*/
101887 +typedef struct load_count {
101888 +       znode *node;	/* current node of the handle; NULL after init_load_count() */
101889 +       int d_ref;	/* presumably the number of zload()s held through this handle -- TODO confirm */
101890 +} load_count;
101891 +
101892 +extern void init_load_count(load_count * lc);  /* Initialize a load_count; set the current node to NULL. */
101893 +extern void done_load_count(load_count * dh);  /* Finalize a load_count: call zrelse() if necessary. */
101894 +extern int incr_load_count(load_count * dh);   /* Call zload() on the current node. */
101895 +extern int incr_load_count_znode(load_count * dh, znode * node);       /* Make the argument znode the current node, then call zload(). */
101896 +extern int incr_load_count_jnode(load_count * dh, jnode * node);       /* If the argument jnode is formatted, do the same as
101897 +                                                                          * incr_load_count_znode(); otherwise do nothing (unformatted nodes
101898 +                                                                          * don't require zload()/zrelse() treatment). */
101899 +extern void move_load_count(load_count * new, load_count * old);       /* Move the contents of a load_count.  Old handle is released. */
101900 +extern void copy_load_count(load_count * new, load_count * old);       /* Copy the contents of a load_count.  Old handle remains held. */
101901 +
101902 +/* Variable initializers for load_count, e.g. `load_count lc = INIT_LOAD_COUNT;'.  Both expand to a load_count compound literal (an object, not a pointer) with d_ref == 0. */
101903 +#define INIT_LOAD_COUNT ( load_count ){ .node = NULL, .d_ref = 0 }
101904 +#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
101905 +/* A convenience macro for assertions or debug-only code that needs loaded node data only to perform
101906 +   the check: zload()s @node, evaluates @exp, then zrelse()s.  Yields ( long )( exp ), or the non-zero
101907 +   zload() result if loading failed (then @exp is not evaluated).  @node is evaluated exactly once. */
101908 +#define WITH_DATA( node, exp )                         \
101909 +({                                                     \
101910 +       long __with_dh_result;                          \
101911 +       znode *__with_dh_node;                          \
101912 +                                                       \
101913 +       __with_dh_node = ( node );                      \
101914 +       __with_dh_result = zload( __with_dh_node );     \
101915 +       if( __with_dh_result == 0 ) {                   \
101916 +               __with_dh_result = ( long )( exp );     \
101917 +               zrelse( __with_dh_node );               \
101918 +       }                                               \
101919 +       __with_dh_result;                               \
101920 +})
101921 +
101922 +/* Same as WITH_DATA, but yields @ret instead of the zload() result when zload() fails; the value is an int, not a long. */
101923 +#define WITH_DATA_RET( node, ret, exp )                        \
101924 +({                                                     \
101925 +       int __with_dh_result;                           \
101926 +       znode *__with_dh_node;                          \
101927 +                                                       \
101928 +       __with_dh_node = ( node );                      \
101929 +       __with_dh_result = zload( __with_dh_node );     \
101930 +       if( __with_dh_result == 0 ) {                   \
101931 +               __with_dh_result = ( int )( exp );      \
101932 +               zrelse( __with_dh_node );               \
101933 +       } else                                          \
101934 +               __with_dh_result = ( ret );             \
101935 +       __with_dh_result;                               \
101936 +})
101937 +/* WITH_DATA on @coord's node, after calling coord_clear_iplug() on @coord; @coord is evaluated exactly once. */
101938 +#define WITH_COORD(coord, exp)                 \
101939 +({                                             \
101940 +       coord_t *__coord;                       \
101941 +                                               \
101942 +       __coord = (coord);                      \
101943 +       coord_clear_iplug(__coord);             \
101944 +       WITH_DATA(__coord->node, exp);          \
101945 +})
101946 +
101947 +/* STORE_COUNTERS snapshots *lock_counters(); CHECK_COUNTERS asserts that only x_refs/t_refs/d_refs changed since. */
101948 +#if REISER4_DEBUG_SPIN_LOCKS
101949 +#define STORE_COUNTERS                                         \
101950 +       lock_counters_info __entry_counters = *lock_counters()
101951 +#define CHECK_COUNTERS                                         \
101952 +ON_DEBUG_CONTEXT(                                              \
101953 +({                                                             \
101954 +       __entry_counters.x_refs = lock_counters() -> x_refs;    \
101955 +       __entry_counters.t_refs = lock_counters() -> t_refs;    \
101956 +       __entry_counters.d_refs = lock_counters() -> d_refs;    \
101957 +       assert("nikita-2159",                                   \
101958 +              !memcmp(&__entry_counters, lock_counters(),      \
101959 +                      sizeof __entry_counters));               \
101960 +}) )
101961 +
101962 +#else /* !REISER4_DEBUG_SPIN_LOCKS: STORE_COUNTERS expands to nothing, CHECK_COUNTERS to noop */
101963 +#define STORE_COUNTERS
101964 +#define CHECK_COUNTERS noop
101965 +#endif /* REISER4_DEBUG_SPIN_LOCKS */
101966 +
101967 +/* __ZNODE_H__ */
101968 +#endif
101969 +
101970 +/* Make Linus happy.
101971 +   Local variables:
101972 +   c-indentation-style: "K&R"
101973 +   mode-name: "LC"
101974 +   c-basic-offset: 8
101975 +   tab-width: 8
101976 +   fill-column: 120
101977 +   End:
101978 +*/