1 Patchset: 2.6.26-ext4-7
3 This patch was created by combining the ext4-pushed-post-2.6.27-rc1.gz
4 patches with the stable patches in 2.6.27-rc3-ext4-1 series.
6 Documentation/filesystems/ext4.txt | 131 ++-
8 fs/ext4/acl.c | 188 ++--
9 fs/ext4/balloc.c | 221 +++--
11 fs/ext4/ext4.h | 64 +-
12 fs/ext4/ext4_extents.h | 5 +-
13 fs/ext4/ext4_i.h | 10 +-
14 fs/ext4/ext4_jbd2.h | 29 +-
15 fs/ext4/ext4_sb.h | 5 +-
16 fs/ext4/extents.c | 277 +++---
17 fs/ext4/file.c | 20 +-
19 fs/ext4/group.h | 2 +-
20 fs/ext4/ialloc.c | 169 +++-
21 fs/ext4/inode.c | 1931 ++++++++++++++++++++++++++++++------
22 fs/ext4/mballoc.c | 744 +++++++++++----
23 fs/ext4/mballoc.h | 10 +-
24 fs/ext4/migrate.c | 3 +-
25 fs/ext4/namei.c | 45 +-
26 fs/ext4/resize.c | 134 ++-
27 fs/ext4/super.c | 451 ++++++---
28 fs/ext4/xattr.c | 4 +-
29 fs/ext4/xattr_trusted.c | 4 +-
30 fs/ext4/xattr_user.c | 4 +-
31 fs/jbd2/checkpoint.c | 1 -
32 fs/jbd2/commit.c | 308 +++----
33 fs/jbd2/journal.c | 54 +-
34 fs/jbd2/transaction.c | 365 +++----
36 include/linux/fs.h | 2 +
37 include/linux/jbd2.h | 73 +-
38 include/linux/mpage.h | 10 +
39 include/linux/percpu_counter.h | 12 +-
40 include/linux/writeback.h | 1 +
41 lib/percpu_counter.c | 7 +-
43 mm/page-writeback.c | 3 +
44 38 files changed, 3822 insertions(+), 1542 deletions(-)
46 diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
47 index 0c5086d..0d53949 100644
48 --- a/Documentation/filesystems/ext4.txt
49 +++ b/Documentation/filesystems/ext4.txt
50 @@ -13,72 +13,99 @@ Mailing list: linux-ext4@vger.kernel.org
51 1. Quick usage instructions:
52 ===========================
54 - - Grab updated e2fsprogs from
55 - ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/
56 - This is a patchset on top of e2fsprogs-1.39, which can be found at
57 + - Compile and install the latest version of e2fsprogs (as of this
58 + writing version 1.41) from:
60 + http://sourceforge.net/project/showfiles.php?group_id=2406
64 ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
66 - - It's still mke2fs -j /dev/hda1
67 + or grab the latest git repository from:
69 + git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
71 + - Note that it is highly important to install the mke2fs.conf file
72 + that comes with the e2fsprogs 1.41.x sources in /etc/mke2fs.conf. If
73 + you have edited the /etc/mke2fs.conf file installed on your system,
74 + you will need to merge your changes with the version from e2fsprogs
77 + - Create a new filesystem using the ext4dev filesystem type:
79 + # mke2fs -t ext4dev /dev/hda1
81 + Or configure an existing ext3 filesystem to support extents and set
82 + the test_fs flag to indicate that it's ok for an in-development
83 + filesystem to touch this filesystem:
85 - - mount /dev/hda1 /wherever -t ext4dev
86 + # tune2fs -O extents -E test_fs /dev/hda1
88 - - To enable extents,
89 + If the filesystem was created with 128 byte inodes, it can be
90 + converted to use 256 byte for greater efficiency via:
92 - mount /dev/hda1 /wherever -t ext4dev -o extents
93 + # tune2fs -I 256 /dev/hda1
95 - - The filesystem is compatible with the ext3 driver until you add a file
96 - which has extents (ie: `mount -o extents', then create a file).
97 + (Note: we currently do not have tools to convert an ext4dev
98 + filesystem back to ext3; so please do not try this on production
101 - NOTE: The "extents" mount flag is temporary. It will soon go away and
102 - extents will be enabled by the "-o extents" flag to mke2fs or tune2fs
105 + # mount -t ext4dev /dev/hda1 /wherever
107 - When comparing performance with other filesystems, remember that
108 - ext3/4 by default offers higher data integrity guarantees than most. So
109 - when comparing with a metadata-only journalling filesystem, use `mount -o
110 - data=writeback'. And you might as well use `mount -o nobh' too along
111 - with it. Making the journal larger than the mke2fs default often helps
112 - performance with metadata-intensive workloads.
113 + ext3/4 by default offers higher data integrity guarantees than most.
114 + So when comparing with a metadata-only journalling filesystem, such
115 + as ext3, use `mount -o data=writeback'. And you might as well use
116 + `mount -o nobh' too along with it. Making the journal larger than
117 + the mke2fs default often helps performance with metadata-intensive
123 2.1 Currently available
125 -* ability to use filesystems > 16TB
126 +* ability to use filesystems > 16TB (e2fsprogs support not available yet)
127 * extent format reduces metadata overhead (RAM, IO for access, transactions)
128 * extent format more robust in face of on-disk corruption due to magics,
129 * internal redunancy in tree
131 -2.1 Previously available, soon to be enabled by default by "mkefs.ext4":
133 -* dir_index and resize inode will be on by default
134 -* large inodes will be used by default for fast EAs, nsec timestamps, etc
135 +* improved file allocation (multi-block alloc)
136 +* fix 32000 subdirectory limit
137 +* nsec timestamps for mtime, atime, ctime, create time
138 +* inode version field on disk (NFSv4, Lustre)
139 +* reduced e2fsck time via uninit_bg feature
140 +* journal checksumming for robustness, performance
141 +* persistent file preallocation (e.g for streaming media, databases)
142 +* ability to pack bitmaps and inode tables into larger virtual groups via the
144 +* large file support
145 +* Inode allocation using large virtual block groups via flex_bg
146 +* delayed allocation
147 +* large block (up to pagesize) support
148 +* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
151 2.2 Candidate features for future inclusion
153 -There are several under discussion, whether they all make it in is
154 -partly a function of how much time everyone has to work on them:
155 +* Online defrag (patches available but not well tested)
156 +* reduced mke2fs time via lazy itable initialization in conjunction with
157 + the uninit_bg feature (capability to do this is available in e2fsprogs
158 + but a kernel thread to do lazy zeroing of unused inode table blocks
159 + after filesystem is first mounted is required for safety)
161 -* improved file allocation (multi-block alloc, delayed alloc; basically done)
162 -* fix 32000 subdirectory limit (patch exists, needs some e2fsck work)
163 -* nsec timestamps for mtime, atime, ctime, create time (patch exists,
164 - needs some e2fsck work)
165 -* inode version field on disk (NFSv4, Lustre; prototype exists)
166 -* reduced mke2fs/e2fsck time via uninitialized groups (prototype exists)
167 -* journal checksumming for robustness, performance (prototype exists)
168 -* persistent file preallocation (e.g for streaming media, databases)
169 +There are several others under discussion, whether they all make it in is
170 +partly a function of how much time everyone has to work on them. Features like
171 +metadata checksumming have been discussed and planned for a bit but no patches
172 +exist yet so I'm not sure they're in the near-term roadmap.
174 -Features like metadata checksumming have been discussed and planned for
175 -a bit but no patches exist yet so I'm not sure they're in the near-term
177 +The big performance win will come with mballoc, delalloc and flex_bg
178 +grouping of bitmaps and inode tables. Some test results available here:
180 -The big performance win will come with mballoc and delalloc. CFS has
181 -been using mballoc for a few years already with Lustre, and IBM + Bull
182 -did a lot of benchmarking on it. The reason it isn't in the first set of
183 -patches is partly a manageability issue, and partly because it doesn't
184 -directly affect the on-disk format (outside of much better allocation)
185 -so it isn't critical to get into the first round of changes. I believe
186 -Alex is working on a new set of patches right now.
187 + - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html
188 + - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html
192 @@ -222,9 +249,11 @@ stripe=n Number of filesystem blocks that mballoc will try
193 to use for allocation size and alignment. For RAID5/6
194 systems this should be the number of data
195 disks * RAID chunk size in file system blocks.
197 +delalloc (*) Deferring block allocation until write-out time.
198 +nodelalloc Disable delayed allocation. Blocks are allocated
199 + when data is copied from user to page cache.
203 There are 3 different data modes:
206 @@ -236,10 +265,10 @@ typically provide the best ext4 performance.
209 In data=ordered mode, ext4 only officially journals metadata, but it logically
210 -groups metadata and data blocks into a single unit called a transaction. When
211 -it's time to write the new metadata out to disk, the associated data blocks
212 -are written first. In general, this mode performs slightly slower than
213 -writeback but significantly faster than journal mode.
214 +groups metadata information related to data changes with the data blocks into a
215 +single unit called a transaction. When it's time to write the new metadata
216 +out to disk, the associated data blocks are written first. In general,
217 +this mode performs slightly slower than writeback but significantly faster than journal mode.
220 data=journal mode provides full data and metadata journaling. All new data is
221 @@ -247,7 +276,8 @@ written to the journal first, and then to its final location.
222 In the event of a crash, the journal can be replayed, bringing both data and
223 metadata into a consistent state. This mode is the slowest except when data
224 needs to be read from and written to disk at the same time where it
225 -outperforms all others modes.
226 +outperforms all other modes. Currently ext4 does not have delayed
227 +allocation support if this data journalling mode is selected.
231 @@ -256,7 +286,8 @@ kernel source: <file:fs/ext4/>
234 programs: http://e2fsprogs.sourceforge.net/
235 - http://ext2resize.sourceforge.net
237 useful links: http://fedoraproject.org/wiki/ext3-devel
238 http://www.bullopensource.org/ext4/
239 + http://ext4.wiki.kernel.org/index.php/Main_Page
240 + http://fedoraproject.org/wiki/Features/Ext4
241 diff --git a/fs/buffer.c b/fs/buffer.c
242 index 0f51c0f..5fa1512 100644
245 @@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
247 clear_buffer_dirty(bh);
248 set_buffer_uptodate(bh);
249 - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
250 + } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
251 + buffer_dirty(bh)) {
252 WARN_ON(bh->b_size != blocksize);
253 err = get_block(inode, block, bh, 1);
256 + clear_buffer_delay(bh);
257 if (buffer_new(bh)) {
258 /* blockdev mappings never come here */
259 clear_buffer_new(bh);
260 @@ -1774,7 +1776,8 @@ recover:
262 /* Recovery: lock and submit the mapped buffers */
264 - if (buffer_mapped(bh) && buffer_dirty(bh)) {
265 + if (buffer_mapped(bh) && buffer_dirty(bh) &&
266 + !buffer_delay(bh)) {
268 mark_buffer_async_write(bh);
270 @@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
271 struct page *page, void *fsdata)
273 struct inode *inode = mapping->host;
274 + int i_size_changed = 0;
276 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
278 @@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
280 if (pos+copied > inode->i_size) {
281 i_size_write(inode, pos+copied);
282 - mark_inode_dirty(inode);
283 + i_size_changed = 1;
287 page_cache_release(page);
290 + * Don't mark the inode dirty under page lock. First, it unnecessarily
291 + * makes the holding time of page lock longer. Second, it forces lock
292 + * ordering of page lock and transaction start for journaling
295 + if (i_size_changed)
296 + mark_inode_dirty(inode);
300 EXPORT_SYMBOL(generic_write_end);
301 diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
302 index 3c8dab8..a234b54 100644
305 @@ -40,34 +40,35 @@ ext4_acl_from_disk(const void *value, size_t size)
306 acl = posix_acl_alloc(count, GFP_NOFS);
308 return ERR_PTR(-ENOMEM);
309 - for (n=0; n < count; n++) {
310 + for (n = 0; n < count; n++) {
311 ext4_acl_entry *entry =
312 (ext4_acl_entry *)value;
313 if ((char *)value + sizeof(ext4_acl_entry_short) > end)
315 acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
316 acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
317 - switch(acl->a_entries[n].e_tag) {
319 - case ACL_GROUP_OBJ:
322 - value = (char *)value +
323 - sizeof(ext4_acl_entry_short);
324 - acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
329 - value = (char *)value + sizeof(ext4_acl_entry);
330 - if ((char *)value > end)
332 - acl->a_entries[n].e_id =
333 - le32_to_cpu(entry->e_id);
338 + switch (acl->a_entries[n].e_tag) {
340 + case ACL_GROUP_OBJ:
343 + value = (char *)value +
344 + sizeof(ext4_acl_entry_short);
345 + acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
350 + value = (char *)value + sizeof(ext4_acl_entry);
351 + if ((char *)value > end)
353 + acl->a_entries[n].e_id =
354 + le32_to_cpu(entry->e_id);
362 @@ -96,27 +97,26 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
363 return ERR_PTR(-ENOMEM);
364 ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
365 e = (char *)ext_acl + sizeof(ext4_acl_header);
366 - for (n=0; n < acl->a_count; n++) {
367 + for (n = 0; n < acl->a_count; n++) {
368 ext4_acl_entry *entry = (ext4_acl_entry *)e;
369 entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
370 entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
371 - switch(acl->a_entries[n].e_tag) {
375 - cpu_to_le32(acl->a_entries[n].e_id);
376 - e += sizeof(ext4_acl_entry);
380 - case ACL_GROUP_OBJ:
383 - e += sizeof(ext4_acl_entry_short);
388 + switch (acl->a_entries[n].e_tag) {
391 + entry->e_id = cpu_to_le32(acl->a_entries[n].e_id);
392 + e += sizeof(ext4_acl_entry);
396 + case ACL_GROUP_OBJ:
399 + e += sizeof(ext4_acl_entry_short);
406 return (char *)ext_acl;
407 @@ -167,23 +167,23 @@ ext4_get_acl(struct inode *inode, int type)
408 if (!test_opt(inode->i_sb, POSIX_ACL))
412 - case ACL_TYPE_ACCESS:
413 - acl = ext4_iget_acl(inode, &ei->i_acl);
414 - if (acl != EXT4_ACL_NOT_CACHED)
416 - name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
419 - case ACL_TYPE_DEFAULT:
420 - acl = ext4_iget_acl(inode, &ei->i_default_acl);
421 - if (acl != EXT4_ACL_NOT_CACHED)
423 - name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
427 - return ERR_PTR(-EINVAL);
429 + case ACL_TYPE_ACCESS:
430 + acl = ext4_iget_acl(inode, &ei->i_acl);
431 + if (acl != EXT4_ACL_NOT_CACHED)
433 + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
436 + case ACL_TYPE_DEFAULT:
437 + acl = ext4_iget_acl(inode, &ei->i_default_acl);
438 + if (acl != EXT4_ACL_NOT_CACHED)
440 + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
444 + return ERR_PTR(-EINVAL);
446 retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
448 @@ -201,14 +201,14 @@ ext4_get_acl(struct inode *inode, int type)
453 - case ACL_TYPE_ACCESS:
454 - ext4_iset_acl(inode, &ei->i_acl, acl);
457 - case ACL_TYPE_DEFAULT:
458 - ext4_iset_acl(inode, &ei->i_default_acl, acl);
461 + case ACL_TYPE_ACCESS:
462 + ext4_iset_acl(inode, &ei->i_acl, acl);
465 + case ACL_TYPE_DEFAULT:
466 + ext4_iset_acl(inode, &ei->i_default_acl, acl);
471 @@ -232,31 +232,31 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
472 if (S_ISLNK(inode->i_mode))
476 - case ACL_TYPE_ACCESS:
477 - name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
479 - mode_t mode = inode->i_mode;
480 - error = posix_acl_equiv_mode(acl, &mode);
484 - inode->i_mode = mode;
485 - ext4_mark_inode_dirty(handle, inode);
490 + case ACL_TYPE_ACCESS:
491 + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
493 + mode_t mode = inode->i_mode;
494 + error = posix_acl_equiv_mode(acl, &mode);
498 + inode->i_mode = mode;
499 + ext4_mark_inode_dirty(handle, inode);
507 - case ACL_TYPE_DEFAULT:
508 - name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
509 - if (!S_ISDIR(inode->i_mode))
510 - return acl ? -EACCES : 0;
512 + case ACL_TYPE_DEFAULT:
513 + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
514 + if (!S_ISDIR(inode->i_mode))
515 + return acl ? -EACCES : 0;
524 value = ext4_acl_to_disk(acl, &size);
525 @@ -269,14 +269,14 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
530 - case ACL_TYPE_ACCESS:
531 - ext4_iset_acl(inode, &ei->i_acl, acl);
534 - case ACL_TYPE_DEFAULT:
535 - ext4_iset_acl(inode, &ei->i_default_acl, acl);
538 + case ACL_TYPE_ACCESS:
539 + ext4_iset_acl(inode, &ei->i_acl, acl);
542 + case ACL_TYPE_DEFAULT:
543 + ext4_iset_acl(inode, &ei->i_default_acl, acl);
548 diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
549 index 9cc80b9..e9fa960 100644
550 --- a/fs/ext4/balloc.c
551 +++ b/fs/ext4/balloc.c
552 @@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
553 ext4_group_t block_group)
555 ext4_group_t actual_group;
556 - ext4_get_group_no_and_offset(sb, block, &actual_group, 0);
557 + ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
558 if (actual_group == block_group)
561 @@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
562 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
564 } else { /* For META_BG_BLOCK_GROUPS */
565 - int group_rel = (block_group -
566 - le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
567 - EXT4_DESC_PER_BLOCK(sb);
568 - if (group_rel == 0 || group_rel == 1 ||
569 - (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
571 + bit_max += ext4_bg_num_gdb(sb, block_group);
574 if (block_group == sbi->s_groups_count - 1) {
575 @@ -295,7 +290,7 @@ err_out:
579 - * read_block_bitmap()
580 + * ext4_read_block_bitmap()
582 * @block_group: given block group
584 @@ -305,7 +300,7 @@ err_out:
585 * Return buffer_head on success or NULL in case of failure.
588 -read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
589 +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
591 struct ext4_group_desc * desc;
592 struct buffer_head * bh = NULL;
593 @@ -319,25 +314,28 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
595 ext4_error(sb, __func__,
596 "Cannot read block bitmap - "
597 - "block_group = %d, block_bitmap = %llu",
598 - (int)block_group, (unsigned long long)bitmap_blk);
599 + "block_group = %lu, block_bitmap = %llu",
600 + block_group, bitmap_blk);
603 if (bh_uptodate_or_lock(bh))
606 + spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
607 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
608 ext4_init_block_bitmap(sb, bh, block_group, desc);
609 set_buffer_uptodate(bh);
611 + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
614 + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
615 if (bh_submit_read(bh) < 0) {
617 ext4_error(sb, __func__,
618 "Cannot read block bitmap - "
619 - "block_group = %d, block_bitmap = %llu",
620 - (int)block_group, (unsigned long long)bitmap_blk);
621 + "block_group = %lu, block_bitmap = %llu",
622 + block_group, bitmap_blk);
625 ext4_valid_block_bitmap(sb, desc, block_group, bh);
626 @@ -409,8 +407,7 @@ restart:
629 printk("Window map complete.\n");
634 #define rsv_window_dump(root, verbose) \
635 __rsv_window_dump((root), (verbose), __func__)
636 @@ -694,7 +691,7 @@ do_more:
640 - bitmap_bh = read_block_bitmap(sb, block_group);
641 + bitmap_bh = ext4_read_block_bitmap(sb, block_group);
644 desc = ext4_get_group_desc (sb, block_group, &gd_bh);
645 @@ -810,6 +807,13 @@ do_more:
646 spin_unlock(sb_bgl_lock(sbi, block_group));
647 percpu_counter_add(&sbi->s_freeblocks_counter, count);
649 + if (sbi->s_log_groups_per_flex) {
650 + ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
651 + spin_lock(sb_bgl_lock(sbi, flex_group));
652 + sbi->s_flex_groups[flex_group].free_blocks += count;
653 + spin_unlock(sb_bgl_lock(sbi, flex_group));
656 /* We dirtied the bitmap block */
657 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
658 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
659 @@ -1598,23 +1602,38 @@ out:
662 * ext4_has_free_blocks()
663 - * @sbi: in-core super block structure.
664 + * @sbi: in-core super block structure.
665 + * @nblocks: number of needed blocks
667 - * Check if filesystem has at least 1 free block available for allocation.
668 + * Check if filesystem has free blocks available for allocation.
669 + * Return the number of blocks available for allocation for this request
670 + * On success, return nblocks
672 -static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
673 +ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
674 + ext4_fsblk_t nblocks)
676 - ext4_fsblk_t free_blocks, root_blocks;
677 + ext4_fsblk_t free_blocks;
678 + ext4_fsblk_t root_blocks = 0;
680 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
681 - root_blocks = ext4_r_blocks_count(sbi->s_es);
682 - if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
684 + if (!capable(CAP_SYS_RESOURCE) &&
685 sbi->s_resuid != current->fsuid &&
686 - (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
687 + (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
688 + root_blocks = ext4_r_blocks_count(sbi->s_es);
690 + if (free_blocks - root_blocks < FBC_BATCH)
692 + percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
694 + if (free_blocks <= root_blocks)
695 + /* we don't have free space */
700 + if (free_blocks - root_blocks < nblocks)
701 + return free_blocks - root_blocks;
707 * ext4_should_retry_alloc()
708 @@ -1630,7 +1649,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
710 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
712 - if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
713 + if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
716 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
717 @@ -1639,20 +1658,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
721 - * ext4_new_blocks_old() -- core block(s) allocation function
722 + * ext4_old_new_blocks() -- core block bitmap based block allocation function
724 * @handle: handle to this transaction
726 * @goal: given target block(filesystem wide)
727 * @count: target number of blocks to allocate
730 - * ext4_new_blocks uses a goal block to assist allocation. It tries to
731 - * allocate block(s) from the block group contains the goal block first. If that
732 - * fails, it will try to allocate block(s) from other block groups without
733 - * any specific goal block.
734 + * ext4_old_new_blocks uses a goal block to assist allocation and look up
735 + * the block bitmap directly to do block allocation. It tries to
736 + * allocate block(s) from the block group contains the goal block first. If
737 + * that fails, it will try to allocate block(s) from other block groups
738 + * without any specific goal block.
740 + * This function is called when -o nomballoc mount option is enabled
743 -ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
744 +ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
745 ext4_fsblk_t goal, unsigned long *count, int *errp)
747 struct buffer_head *bitmap_bh = NULL;
748 @@ -1676,13 +1699,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
749 ext4_group_t ngroups;
750 unsigned long num = *count;
756 printk("ext4_new_block: nonexistent device");
761 + if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
763 + * With delalloc we already reserved the blocks
765 + *count = ext4_has_free_blocks(sbi, *count);
769 + return 0; /*return with ENOSPC error */
774 * Check quota for allocation of this block.
776 @@ -1706,11 +1742,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
777 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
778 my_rsv = &block_i->rsv_window_node;
780 - if (!ext4_has_free_blocks(sbi)) {
786 * First, test whether the goal block is free.
788 @@ -1734,7 +1765,7 @@ retry_alloc:
791 if (free_blocks > 0) {
792 - bitmap_bh = read_block_bitmap(sb, group_no);
793 + bitmap_bh = ext4_read_block_bitmap(sb, group_no);
796 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
797 @@ -1770,7 +1801,7 @@ retry_alloc:
801 - bitmap_bh = read_block_bitmap(sb, group_no);
802 + bitmap_bh = ext4_read_block_bitmap(sb, group_no);
806 @@ -1882,7 +1913,15 @@ allocated:
807 le16_add_cpu(&gdp->bg_free_blocks_count, -num);
808 gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
809 spin_unlock(sb_bgl_lock(sbi, group_no));
810 - percpu_counter_sub(&sbi->s_freeblocks_counter, num);
811 + if (!EXT4_I(inode)->i_delalloc_reserved_flag)
812 + percpu_counter_sub(&sbi->s_freeblocks_counter, num);
814 + if (sbi->s_log_groups_per_flex) {
815 + ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
816 + spin_lock(sb_bgl_lock(sbi, flex_group));
817 + sbi->s_flex_groups[flex_group].free_blocks -= num;
818 + spin_unlock(sb_bgl_lock(sbi, flex_group));
821 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
822 err = ext4_journal_dirty_metadata(handle, gdp_bh);
823 @@ -1915,46 +1954,104 @@ out:
827 -ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
828 - ext4_fsblk_t goal, int *errp)
829 +#define EXT4_META_BLOCK 0x1
831 +static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
832 + ext4_lblk_t iblock, ext4_fsblk_t goal,
833 + unsigned long *count, int *errp, int flags)
835 struct ext4_allocation_request ar;
838 if (!test_opt(inode->i_sb, MBALLOC)) {
839 - unsigned long count = 1;
840 - ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
842 + return ext4_old_new_blocks(handle, inode, goal, count, errp);
845 memset(&ar, 0, sizeof(ar));
846 + /* Fill with neighbour allocated blocks */
852 + ar.logical = iblock;
854 + if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
855 + /* enable in-core preallocation for data block allocation */
856 + ar.flags = EXT4_MB_HINT_DATA;
858 + /* disable in-core preallocation for non-regular files */
861 ret = ext4_mb_new_blocks(handle, &ar, errp);
866 -ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
868 + * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
870 + * @handle: handle to this transaction
871 + * @inode: file inode
872 + * @goal: given target block(filesystem wide)
873 + * @count: total number of blocks need
874 + * @errp: error code
876 + * Return 1st allocated block number on success, *count stores total account
877 + * error stores in errp pointer
879 +ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
880 ext4_fsblk_t goal, unsigned long *count, int *errp)
882 - struct ext4_allocation_request ar;
885 - if (!test_opt(inode->i_sb, MBALLOC)) {
886 - ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
888 + ret = do_blk_alloc(handle, inode, 0, goal,
889 + count, errp, EXT4_META_BLOCK);
891 + * Account for the allocated meta blocks
894 + spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
895 + EXT4_I(inode)->i_allocated_meta_blocks += *count;
896 + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
899 - memset(&ar, 0, sizeof(ar));
903 - ret = ext4_mb_new_blocks(handle, &ar, errp);
909 + * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
911 + * @handle: handle to this transaction
912 + * @inode: file inode
913 + * @goal: given target block(filesystem wide)
914 + * @errp: error code
916 + * Return allocated block number on success
918 +ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
919 + ext4_fsblk_t goal, int *errp)
921 + unsigned long count = 1;
922 + return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
926 + * ext4_new_blocks() -- allocate data blocks
928 + * @handle: handle to this transaction
929 + * @inode: file inode
930 + * @goal: given target block(filesystem wide)
931 + * @count: total number of blocks need
932 + * @errp: error code
934 + * Return 1st allocated block number on success, *count stores total account
935 + * error stores in errp pointer
938 +ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
939 + ext4_lblk_t iblock, ext4_fsblk_t goal,
940 + unsigned long *count, int *errp)
942 + return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
946 * ext4_count_free_blocks() -- count filesystem free blocks
947 @@ -1986,7 +2083,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
949 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
951 - bitmap_bh = read_block_bitmap(sb, i);
952 + bitmap_bh = ext4_read_block_bitmap(sb, i);
953 if (bitmap_bh == NULL)
956 diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
957 index 2bf0331..ec8e33b 100644
960 @@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp,
961 struct buffer_head *bh = NULL;
964 - err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
965 + err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
968 pgoff_t index = map_bh.b_blocknr >>
969 (PAGE_CACHE_SHIFT - inode->i_blkbits);
970 @@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root)
973 /* Do the node's children first */
974 - if ((n)->rb_left) {
979 @@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root)
980 parent->rb_right = NULL;
983 - root->rb_node = NULL;
987 -static struct dir_private_info *create_dir_info(loff_t pos)
988 +static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
990 struct dir_private_info *p;
992 - p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
993 + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
996 - p->root.rb_node = NULL;
997 - p->curr_node = NULL;
998 - p->extra_fname = NULL;
1000 p->curr_hash = pos2maj_hash(pos);
1001 p->curr_minor_hash = pos2min_hash(pos);
1006 @@ -416,7 +411,7 @@ static int call_filldir(struct file * filp, void * dirent,
1007 get_dtype(sb, fname->file_type));
1009 filp->f_pos = curr_pos;
1010 - info->extra_fname = fname->next;
1011 + info->extra_fname = fname;
1014 fname = fname->next;
1015 @@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp,
1019 - info = create_dir_info(filp->f_pos);
1020 + info = ext4_htree_create_dir_info(filp->f_pos);
1023 filp->private_data = info;
1024 @@ -455,11 +450,21 @@ static int ext4_dx_readdir(struct file * filp,
1025 * If there are any leftover names on the hash collision
1026 * chain, return them first.
1028 - if (info->extra_fname &&
1029 - call_filldir(filp, dirent, filldir, info->extra_fname))
1031 + if (info->extra_fname) {
1032 + if (call_filldir(filp, dirent, filldir, info->extra_fname))
1035 - if (!info->curr_node)
1036 + info->extra_fname = NULL;
1037 + info->curr_node = rb_next(info->curr_node);
1038 + if (!info->curr_node) {
1039 + if (info->next_hash == ~0) {
1040 + filp->f_pos = EXT4_HTREE_EOF;
1043 + info->curr_hash = info->next_hash;
1044 + info->curr_minor_hash = 0;
1046 + } else if (!info->curr_node)
1047 info->curr_node = rb_first(&info->root);
1050 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
1051 index 8158083..2950032 100644
1052 --- a/fs/ext4/ext4.h
1053 +++ b/fs/ext4/ext4.h
1058 - * The second extended filesystem constants/structures
1059 + * The fourth extended filesystem constants/structures
1064 #define ext4_debug(f, a...) \
1066 printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
1067 - __FILE__, __LINE__, __FUNCTION__); \
1068 + __FILE__, __LINE__, __func__); \
1069 printk (KERN_DEBUG f, ## a); \
1073 #define EXT4_MB_HINT_GOAL_ONLY 256
1074 /* goal is meaningful */
1075 #define EXT4_MB_HINT_TRY_GOAL 512
1076 +/* blocks already pre-reserved by delayed allocation */
1077 +#define EXT4_MB_DELALLOC_RESERVED 1024
1080 struct ext4_allocation_request {
1081 /* target inode for block we're allocating */
1082 @@ -170,6 +173,15 @@ struct ext4_group_desc
1083 __u32 bg_reserved2[3];
1087 + * Structure of a flex block group info
1090 +struct flex_groups {
1091 + __u32 free_inodes;
1092 + __u32 free_blocks;
1095 #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
1096 #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
1097 #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
1098 @@ -527,6 +539,7 @@ do { \
1099 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
1100 #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
1101 #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
1102 +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
1103 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
1104 #ifndef _LINUX_EXT2_FS_H
1105 #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
1106 @@ -647,7 +660,10 @@ struct ext4_super_block {
1107 __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
1108 __le64 s_mmp_block; /* Block for multi-mount protection */
1109 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
1110 - __u32 s_reserved[163]; /* Padding to the end of the block */
1111 + __u8 s_log_groups_per_flex; /* FLEX_BG group size */
1112 + __u8 s_reserved_char_pad2;
1113 + __le16 s_reserved_pad;
1114 + __u32 s_reserved[162]; /* Padding to the end of the block */
1118 @@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
1119 extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
1120 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
1121 ext4_group_t group);
1122 -extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode,
1123 +extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
1124 ext4_fsblk_t goal, int *errp);
1125 -extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
1126 +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
1127 ext4_fsblk_t goal, unsigned long *count, int *errp);
1128 -extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
1129 +extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1130 + ext4_lblk_t iblock, ext4_fsblk_t goal,
1131 + unsigned long *count, int *errp);
1132 +extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
1133 ext4_fsblk_t goal, unsigned long *count, int *errp);
1134 +extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
1135 + ext4_fsblk_t nblocks);
1136 extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
1137 ext4_fsblk_t block, unsigned long count, int metadata);
1138 extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
1139 @@ -1016,6 +1037,10 @@ extern int __init init_ext4_mballoc(void);
1140 extern void exit_ext4_mballoc(void);
1141 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
1142 unsigned long, unsigned long, int, unsigned long *);
1143 +extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
1144 + ext4_group_t i, struct ext4_group_desc *desc);
1145 +extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
1146 + ext4_grpblk_t add);
1150 @@ -1033,19 +1058,25 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
1151 extern struct inode *ext4_iget(struct super_block *, unsigned long);
1152 extern int ext4_write_inode (struct inode *, int);
1153 extern int ext4_setattr (struct dentry *, struct iattr *);
1154 +extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
1155 + struct kstat *stat);
1156 extern void ext4_delete_inode (struct inode *);
1157 extern int ext4_sync_inode (handle_t *, struct inode *);
1158 extern void ext4_discard_reservation (struct inode *);
1159 extern void ext4_dirty_inode(struct inode *);
1160 extern int ext4_change_inode_journal_flag(struct inode *, int);
1161 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1162 +extern int ext4_can_truncate(struct inode *inode);
1163 extern void ext4_truncate (struct inode *);
1164 extern void ext4_set_inode_flags(struct inode *);
1165 extern void ext4_get_inode_flags(struct ext4_inode_info *);
1166 extern void ext4_set_aops(struct inode *inode);
1167 extern int ext4_writepage_trans_blocks(struct inode *);
1168 -extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
1169 +extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
1170 +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
1171 +extern int ext4_block_truncate_page(handle_t *handle,
1172 struct address_space *mapping, loff_t from);
1173 +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
1176 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1177 @@ -1159,10 +1190,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
1181 +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
1182 + ext4_group_t block_group)
1184 + return block_group >> sbi->s_log_groups_per_flex;
1187 +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
1189 + return 1 << sbi->s_log_groups_per_flex;
1192 #define ext4_std_error(sb, errno) \
1195 - __ext4_std_error((sb), __FUNCTION__, (errno)); \
1196 + __ext4_std_error((sb), __func__, (errno)); \
1200 @@ -1187,11 +1229,13 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
1202 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1203 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1204 +extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1206 extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1208 unsigned long max_blocks, struct buffer_head *bh_result,
1209 int create, int extend_disksize);
1210 -extern void ext4_ext_truncate(struct inode *, struct page *);
1211 +extern void ext4_ext_truncate(struct inode *);
1212 extern void ext4_ext_init(struct super_block *);
1213 extern void ext4_ext_release(struct super_block *);
1214 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1215 @@ -1199,7 +1243,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1216 extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
1217 sector_t block, unsigned long max_blocks,
1218 struct buffer_head *bh, int create,
1219 - int extend_disksize);
1220 + int extend_disksize, int flag);
1221 #endif /* __KERNEL__ */
1223 #endif /* _EXT4_H */
1224 diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
1225 index 75333b5..d33dc56 100644
1226 --- a/fs/ext4/ext4_extents.h
1227 +++ b/fs/ext4/ext4_extents.h
1228 @@ -212,10 +212,13 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
1229 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
1232 +extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
1233 extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
1234 extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
1235 extern int ext4_extent_tree_init(handle_t *, struct inode *);
1236 -extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *);
1237 +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
1239 + struct ext4_ext_path *path);
1240 extern int ext4_ext_try_to_merge(struct inode *inode,
1241 struct ext4_ext_path *path,
1242 struct ext4_extent *);
1243 diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
1244 index 26a4ae2..ef7409f 100644
1245 --- a/fs/ext4/ext4_i.h
1246 +++ b/fs/ext4/ext4_i.h
1247 @@ -79,7 +79,7 @@ struct ext4_ext_cache {
1251 - * third extended file system inode data in memory
1252 + * fourth extended file system inode data in memory
1254 struct ext4_inode_info {
1255 __le32 i_data[15]; /* unconverted */
1256 @@ -150,6 +150,7 @@ struct ext4_inode_info {
1258 struct rw_semaphore i_data_sem;
1259 struct inode vfs_inode;
1260 + struct jbd2_inode jinode;
1262 unsigned long i_ext_generation;
1263 struct ext4_ext_cache i_cached_extent;
1264 @@ -162,6 +163,13 @@ struct ext4_inode_info {
1266 struct list_head i_prealloc_list;
1267 spinlock_t i_prealloc_lock;
1269 + /* allocation reservation info for delalloc */
1270 + unsigned long i_reserved_data_blocks;
1271 + unsigned long i_reserved_meta_blocks;
1272 + unsigned long i_allocated_meta_blocks;
1273 + unsigned short i_delalloc_reserved_flag;
1274 + spinlock_t i_block_reservation_lock;
1277 #endif /* _EXT4_I */
1278 diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
1279 index 9255a7d..b455c68 100644
1280 --- a/fs/ext4/ext4_jbd2.h
1281 +++ b/fs/ext4/ext4_jbd2.h
1283 EXT4_XATTR_TRANS_BLOCKS - 2 + \
1284 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
1287 + * Define the number of metadata blocks we need to account to modify data.
1289 + * This include super block, inode block, quota blocks and xattr blocks
1291 +#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
1292 + 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
1294 /* Delete operations potentially hit one directory's namespace plus an
1295 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
1296 * generous. We can grow the delete transaction later if necessary. */
1297 @@ -142,19 +150,17 @@ int __ext4_journal_dirty_metadata(const char *where,
1298 handle_t *handle, struct buffer_head *bh);
1300 #define ext4_journal_get_undo_access(handle, bh) \
1301 - __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
1302 + __ext4_journal_get_undo_access(__func__, (handle), (bh))
1303 #define ext4_journal_get_write_access(handle, bh) \
1304 - __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
1305 + __ext4_journal_get_write_access(__func__, (handle), (bh))
1306 #define ext4_journal_revoke(handle, blocknr, bh) \
1307 - __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
1308 + __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
1309 #define ext4_journal_get_create_access(handle, bh) \
1310 - __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
1311 + __ext4_journal_get_create_access(__func__, (handle), (bh))
1312 #define ext4_journal_dirty_metadata(handle, bh) \
1313 - __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
1314 + __ext4_journal_dirty_metadata(__func__, (handle), (bh))
1315 #define ext4_journal_forget(handle, bh) \
1316 - __ext4_journal_forget(__FUNCTION__, (handle), (bh))
1318 -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
1319 + __ext4_journal_forget(__func__, (handle), (bh))
1321 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
1322 int __ext4_journal_stop(const char *where, handle_t *handle);
1323 @@ -165,7 +171,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
1326 #define ext4_journal_stop(handle) \
1327 - __ext4_journal_stop(__FUNCTION__, (handle))
1328 + __ext4_journal_stop(__func__, (handle))
1330 static inline handle_t *ext4_journal_current_handle(void)
1332 @@ -192,6 +198,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
1333 return jbd2_journal_force_commit(journal);
1336 +static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
1338 + return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
1342 int ext4_force_commit(struct super_block *sb);
1344 diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
1345 index 5802e69..6300226 100644
1346 --- a/fs/ext4/ext4_sb.h
1347 +++ b/fs/ext4/ext4_sb.h
1349 #include <linux/rbtree.h>
1352 - * third extended-fs super-block data in memory
1353 + * fourth extended-fs super-block data in memory
1355 struct ext4_sb_info {
1356 unsigned long s_desc_size; /* Size of a group descriptor in bytes */
1357 @@ -143,6 +143,9 @@ struct ext4_sb_info {
1359 /* locality groups */
1360 struct ext4_locality_group *s_locality_groups;
1362 + unsigned int s_log_groups_per_flex;
1363 + struct flex_groups *s_flex_groups;
1366 #endif /* _EXT4_SB */
1367 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
1368 index 47929c4..b24d3c5 100644
1369 --- a/fs/ext4/extents.c
1370 +++ b/fs/ext4/extents.c
1371 @@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
1372 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
1375 -static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
1376 +static int ext4_ext_journal_restart(handle_t *handle, int needed)
1380 if (handle->h_buffer_credits > needed)
1382 - if (!ext4_journal_extend(handle, needed))
1384 - err = ext4_journal_restart(handle, needed);
1388 + err = ext4_journal_extend(handle, needed);
1391 + return ext4_journal_restart(handle, needed);
1395 @@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
1396 return bg_start + colour + block;
1400 + * Allocation for a meta data block
1403 -ext4_ext_new_block(handle_t *handle, struct inode *inode,
1404 +ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
1405 struct ext4_ext_path *path,
1406 struct ext4_extent *ex, int *err)
1408 ext4_fsblk_t goal, newblock;
1410 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
1411 - newblock = ext4_new_block(handle, inode, goal, err);
1412 + newblock = ext4_new_meta_block(handle, inode, goal, err);
1416 @@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode)
1421 + * Calculate the number of metadata blocks needed
1422 + * to allocate @blocks
1423 + * Worse case is one block per extent
1425 +int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
1427 + int lcap, icap, rcap, leafs, idxs, num;
1428 + int newextents = blocks;
1430 + rcap = ext4_ext_space_root_idx(inode);
1431 + lcap = ext4_ext_space_block(inode);
1432 + icap = ext4_ext_space_block_idx(inode);
1434 + /* number of new leaf blocks needed */
1435 + num = leafs = (newextents + lcap - 1) / lcap;
1438 + * Worse case, we need separate index block(s)
1439 + * to link all new leaf blocks
1441 + idxs = (leafs + icap - 1) / icap;
1444 + idxs = (idxs + icap - 1) / icap;
1445 + } while (idxs > rcap);
1451 ext4_ext_max_entries(struct inode *inode, int depth)
1453 @@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
1457 + path[0].p_bh = NULL;
1460 /* walk through the tree */
1461 @@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
1464 path[ppos].p_depth = i;
1465 - path[ppos].p_hdr = eh;
1466 path[ppos].p_ext = NULL;
1467 path[ppos].p_idx = NULL;
1470 ext4_ext_binsearch(inode, path + ppos, block);
1471 + /* if not an empty leaf */
1472 + if (path[ppos].p_ext)
1473 + path[ppos].p_block = ext_pblock(path[ppos].p_ext);
1475 ext4_ext_show_path(inode, path);
1477 @@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
1478 /* allocate all needed blocks */
1479 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
1480 for (a = 0; a < depth - at; a++) {
1481 - newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
1482 + newblock = ext4_ext_new_meta_block(handle, inode, path,
1486 ablocks[a] = newblock;
1487 @@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1488 ext4_fsblk_t newblock;
1491 - newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
1492 + newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
1496 @@ -981,6 +1017,8 @@ repeat:
1497 /* if we found index with free entry, then use that
1498 * entry: create all needed subtree and add new leaf */
1499 err = ext4_ext_split(handle, inode, path, newext, i);
1504 ext4_ext_drop_refs(path);
1505 @@ -1403,7 +1441,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
1508 * get the next allocated block if the extent in the path
1509 - * is before the requested block(s)
1510 + * is before the requested block(s)
1513 b2 = ext4_ext_next_allocated_block(path);
1514 @@ -1709,54 +1747,61 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1518 - * ext4_ext_calc_credits_for_insert:
1519 - * This routine returns max. credits that the extent tree can consume.
1520 - * It should be OK for low-performance paths like ->writepage()
1521 - * To allow many writing processes to fit into a single transaction,
1522 - * the caller should calculate credits under i_data_sem and
1523 - * pass the actual path.
1524 + * ext4_ext_calc_credits_for_single_extent:
1525 + * This routine returns max. credits that needed to insert an extent
1526 + * to the extent tree.
1527 + * When pass the actual path, the caller should calculate credits
1528 + * under i_data_sem.
1530 -int ext4_ext_calc_credits_for_insert(struct inode *inode,
1531 +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
1532 struct ext4_ext_path *path)
1534 - int depth, needed;
1537 + int depth = ext_depth(inode);
1540 /* probably there is space in leaf? */
1541 - depth = ext_depth(inode);
1542 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
1543 - < le16_to_cpu(path[depth].p_hdr->eh_max))
1546 + < le16_to_cpu(path[depth].p_hdr->eh_max)) {
1549 - * given 32-bit logical block (4294967296 blocks), max. tree
1550 - * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
1551 - * Let's also add one more level for imbalance.
1555 - /* allocation of new data block(s) */
1558 + * There are some space in the leaf tree, no
1559 + * need to account for leaf block credit
1561 + * bitmaps and block group descriptor blocks
1562 + * and other metadat blocks still need to be
1565 + /* 1 bitmap, 1 block group descriptor */
1566 + ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
1571 - * tree can be full, so it would need to grow in depth:
1572 - * we need one credit to modify old root, credits for
1573 - * new root will be added in split accounting
1576 + return ext4_chunk_trans_blocks(inode, nrblocks);
1580 - * Index split can happen, we would need:
1581 - * allocate intermediate indexes (bitmap + group)
1582 - * + change two blocks at each level, but root (already included)
1584 - needed += (depth * 2) + (depth * 2);
1586 + * How many index/leaf blocks need to change/allocate to modify nrblocks?
1588 + * if nrblocks are fit in a single extent (chunk flag is 1), then
1589 + * in the worse case, each tree level index/leaf need to be changed
1590 + * if the tree split due to insert a new extent, then the old tree
1591 + * index/leaf need to be updated too
1593 + * If the nrblocks are discontiguous, they could cause
1594 + * the whole tree split more than once, but this is really rare.
1596 +int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
1599 + int depth = ext_depth(inode);
1601 - /* any allocation modifies superblock */
1604 + index = depth * 2;
1606 + index = depth * 3;
1612 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1613 @@ -1872,22 +1917,22 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1614 BUG_ON(b != ex_ee_block + ex_ee_len - 1);
1617 - /* at present, extent can't cross block group: */
1618 - /* leaf + bitmap + group desc + sb + inode */
1621 + * 3 for leaf, sb, and inode plus 2 (bmap and group
1622 + * descriptor) for each block group; assume two block
1623 + * groups plus ex_ee_len/blocks_per_block_group for
1626 + credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
1627 if (ex == EXT_FIRST_EXTENT(eh)) {
1629 credits += (ext_depth(inode)) + 1;
1631 -#ifdef CONFIG_QUOTA
1632 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1635 - handle = ext4_ext_journal_restart(handle, credits);
1636 - if (IS_ERR(handle)) {
1637 - err = PTR_ERR(handle);
1638 + err = ext4_ext_journal_restart(handle, credits);
1643 err = ext4_ext_get_access(handle, inode, path + depth);
1645 @@ -2287,7 +2332,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1646 unsigned int newdepth;
1647 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
1648 if (allocated <= EXT4_EXT_ZERO_LEN) {
1649 - /* Mark first half uninitialized.
1651 + * iblock == ee_block is handled by the zerouout
1652 + * at the beginning.
1653 + * Mark first half uninitialized.
1654 * Mark second half initialized and zero out the
1655 * initialized extent
1657 @@ -2310,7 +2358,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1658 ex->ee_len = orig_ex.ee_len;
1659 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
1660 ext4_ext_dirty(handle, inode, path + depth);
1661 - /* zeroed the full extent */
1662 + /* blocks available from iblock */
1666 @@ -2338,6 +2386,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1667 err = PTR_ERR(path);
1670 + /* get the second half extent details */
1671 ex = path[depth].p_ext;
1672 err = ext4_ext_get_access(handle, inode,
1674 @@ -2367,6 +2416,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1675 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
1676 ext4_ext_dirty(handle, inode, path + depth);
1677 /* zeroed the full extent */
1678 + /* blocks available from iblock */
1682 @@ -2382,23 +2432,22 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1684 orig_ex.ee_len = cpu_to_le16(ee_len -
1685 ext4_ext_get_actual_len(ex3));
1686 - if (newdepth != depth) {
1688 - ext4_ext_drop_refs(path);
1689 - path = ext4_ext_find_extent(inode, iblock, path);
1690 - if (IS_ERR(path)) {
1691 - err = PTR_ERR(path);
1694 - eh = path[depth].p_hdr;
1695 - ex = path[depth].p_ext;
1696 - if (ex2 != &newex)
1699 - err = ext4_ext_get_access(handle, inode, path + depth);
1703 + ext4_ext_drop_refs(path);
1704 + path = ext4_ext_find_extent(inode, iblock, path);
1705 + if (IS_ERR(path)) {
1706 + err = PTR_ERR(path);
1709 + eh = path[depth].p_hdr;
1710 + ex = path[depth].p_ext;
1711 + if (ex2 != &newex)
1714 + err = ext4_ext_get_access(handle, inode, path + depth);
1718 allocated = max_blocks;
1720 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
1721 @@ -2416,6 +2465,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1722 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
1723 ext4_ext_dirty(handle, inode, path + depth);
1724 /* zero out the first half */
1725 + /* blocks available from iblock */
1729 @@ -2529,6 +2579,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1730 int err = 0, depth, ret;
1731 unsigned long allocated = 0;
1732 struct ext4_allocation_request ar;
1735 __clear_bit(BH_New, &bh_result->b_state);
1736 ext_debug("blocks %u/%lu requested for inode %u\n",
1737 @@ -2616,8 +2667,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1739 if (allocated > max_blocks)
1740 allocated = max_blocks;
1741 - /* mark the buffer unwritten */
1742 - __set_bit(BH_Unwritten, &bh_result->b_state);
1743 + set_buffer_unwritten(bh_result);
1747 @@ -2716,14 +2766,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1751 - if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
1752 - EXT4_I(inode)->i_disksize = inode->i_size;
1754 /* previous routine could use block we allocated */
1755 newblock = ext_pblock(&newex);
1756 allocated = ext4_ext_get_actual_len(&newex);
1758 - __set_bit(BH_New, &bh_result->b_state);
1759 + if (extend_disksize) {
1760 + disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
1761 + if (disksize > i_size_read(inode))
1762 + disksize = i_size_read(inode);
1763 + if (disksize > EXT4_I(inode)->i_disksize)
1764 + EXT4_I(inode)->i_disksize = disksize;
1767 + set_buffer_new(bh_result);
1769 /* Cache only when it is _not_ an uninitialized extent */
1770 if (create != EXT4_CREATE_UNINITIALIZED_EXT)
1771 @@ -2733,7 +2788,7 @@ out:
1772 if (allocated > max_blocks)
1773 allocated = max_blocks;
1774 ext4_ext_show_leaf(inode, path);
1775 - __set_bit(BH_Mapped, &bh_result->b_state);
1776 + set_buffer_mapped(bh_result);
1777 bh_result->b_bdev = inode->i_sb->s_bdev;
1778 bh_result->b_blocknr = newblock;
1780 @@ -2744,7 +2799,7 @@ out2:
1781 return err ? err : allocated;
1784 -void ext4_ext_truncate(struct inode * inode, struct page *page)
1785 +void ext4_ext_truncate(struct inode *inode)
1787 struct address_space *mapping = inode->i_mapping;
1788 struct super_block *sb = inode->i_sb;
1789 @@ -2755,33 +2810,27 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
1791 * probably first extent we're gonna free will be last in block
1793 - err = ext4_writepage_trans_blocks(inode) + 3;
1794 + err = ext4_writepage_trans_blocks(inode);
1795 handle = ext4_journal_start(inode, err);
1796 - if (IS_ERR(handle)) {
1798 - clear_highpage(page);
1799 - flush_dcache_page(page);
1800 - unlock_page(page);
1801 - page_cache_release(page);
1803 + if (IS_ERR(handle))
1808 - ext4_block_truncate_page(handle, page, mapping, inode->i_size);
1809 + if (inode->i_size & (sb->s_blocksize - 1))
1810 + ext4_block_truncate_page(handle, mapping, inode->i_size);
1812 + if (ext4_orphan_add(handle, inode))
1815 down_write(&EXT4_I(inode)->i_data_sem);
1816 ext4_ext_invalidate_cache(inode);
1818 - ext4_mb_discard_inode_preallocations(inode);
1819 + ext4_discard_reservation(inode);
1822 * TODO: optimization is possible here.
1823 * Probably we need not scan at all,
1824 * because page truncation is enough.
1826 - if (ext4_orphan_add(handle, inode))
1829 /* we have to know where to truncate from in crash case */
1830 EXT4_I(inode)->i_disksize = inode->i_size;
1831 @@ -2798,6 +2847,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
1835 + up_write(&EXT4_I(inode)->i_data_sem);
1837 * If this was a simple ftruncate() and the file will remain alive,
1838 * then we need to clear up the orphan record which we created above.
1839 @@ -2808,33 +2858,11 @@ out_stop:
1841 ext4_orphan_del(handle, inode);
1843 - up_write(&EXT4_I(inode)->i_data_sem);
1844 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
1845 ext4_mark_inode_dirty(handle, inode);
1846 ext4_journal_stop(handle);
1850 - * ext4_ext_writepage_trans_blocks:
1851 - * calculate max number of blocks we could modify
1852 - * in order to allocate new block for an inode
1854 -int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
1858 - needed = ext4_ext_calc_credits_for_insert(inode, NULL);
1860 - /* caller wants to allocate num blocks, but note it includes sb */
1861 - needed = needed * num - (num - 1);
1863 -#ifdef CONFIG_QUOTA
1864 - needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1870 static void ext4_falloc_update_inode(struct inode *inode,
1871 int mode, loff_t new_size, int update_ctime)
1873 @@ -2895,10 +2923,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
1874 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
1877 - * credits to insert 1 extent into extent tree + buffers to be able to
1878 - * modify 1 super block, 1 block bitmap and 1 group descriptor.
1879 + * credits to insert 1 extent into extent tree
1881 - credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
1882 + credits = ext4_chunk_trans_blocks(inode, max_blocks);
1883 mutex_lock(&inode->i_mutex);
1885 while (ret >= 0 && ret < max_blocks) {
1886 @@ -2911,7 +2938,7 @@ retry:
1888 ret = ext4_get_blocks_wrap(handle, inode, block,
1889 max_blocks, &map_bh,
1890 - EXT4_CREATE_UNINITIALIZED_EXT, 0);
1891 + EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
1895 diff --git a/fs/ext4/file.c b/fs/ext4/file.c
1896 index 4159be6..430eb79 100644
1897 --- a/fs/ext4/file.c
1898 +++ b/fs/ext4/file.c
1899 @@ -123,6 +123,23 @@ force_commit:
1903 +static struct vm_operations_struct ext4_file_vm_ops = {
1904 + .fault = filemap_fault,
1905 + .page_mkwrite = ext4_page_mkwrite,
1908 +static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
1910 + struct address_space *mapping = file->f_mapping;
1912 + if (!mapping->a_ops->readpage)
1914 + file_accessed(file);
1915 + vma->vm_ops = &ext4_file_vm_ops;
1916 + vma->vm_flags |= VM_CAN_NONLINEAR;
1920 const struct file_operations ext4_file_operations = {
1921 .llseek = generic_file_llseek,
1922 .read = do_sync_read,
1923 @@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = {
1924 #ifdef CONFIG_COMPAT
1925 .compat_ioctl = ext4_compat_ioctl,
1927 - .mmap = generic_file_mmap,
1928 + .mmap = ext4_file_mmap,
1929 .open = generic_file_open,
1930 .release = ext4_release_file,
1931 .fsync = ext4_sync_file,
1932 @@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = {
1933 const struct inode_operations ext4_file_inode_operations = {
1934 .truncate = ext4_truncate,
1935 .setattr = ext4_setattr,
1936 + .getattr = ext4_getattr,
1937 #ifdef CONFIG_EXT4DEV_FS_XATTR
1938 .setxattr = generic_setxattr,
1939 .getxattr = generic_getxattr,
1940 diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
1941 index 1c8ba48..a45c373 100644
1942 --- a/fs/ext4/fsync.c
1943 +++ b/fs/ext4/fsync.c
1945 #include <linux/sched.h>
1946 #include <linux/writeback.h>
1947 #include <linux/jbd2.h>
1948 +#include <linux/blkdev.h>
1950 #include "ext4_jbd2.h"
1953 int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
1955 struct inode *inode = dentry->d_inode;
1956 + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
1959 J_ASSERT(ext4_journal_current_handle() == NULL);
1960 @@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
1961 .nr_to_write = 0, /* sys_fsync did this */
1963 ret = sync_inode(inode, &wbc);
1964 + if (journal && (journal->j_flags & JBD2_BARRIER))
1965 + blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
1969 diff --git a/fs/ext4/group.h b/fs/ext4/group.h
1970 index 7eb0604..c2c0a8d 100644
1971 --- a/fs/ext4/group.h
1972 +++ b/fs/ext4/group.h
1973 @@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
1974 struct ext4_group_desc *gdp);
1975 extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
1976 struct ext4_group_desc *gdp);
1977 -struct buffer_head *read_block_bitmap(struct super_block *sb,
1978 +struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
1979 ext4_group_t block_group);
1980 extern unsigned ext4_init_block_bitmap(struct super_block *sb,
1981 struct buffer_head *bh,
1982 diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
1983 index c6efbab..f344834 100644
1984 --- a/fs/ext4/ialloc.c
1985 +++ b/fs/ext4/ialloc.c
1986 @@ -97,34 +97,44 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
1987 * Return buffer_head of bitmap on success or NULL.
1989 static struct buffer_head *
1990 -read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
1991 +ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
1993 struct ext4_group_desc *desc;
1994 struct buffer_head *bh = NULL;
1995 + ext4_fsblk_t bitmap_blk;
1997 desc = ext4_get_group_desc(sb, block_group, NULL);
2001 + bitmap_blk = ext4_inode_bitmap(sb, desc);
2002 + bh = sb_getblk(sb, bitmap_blk);
2003 + if (unlikely(!bh)) {
2004 + ext4_error(sb, __func__,
2005 + "Cannot read inode bitmap - "
2006 + "block_group = %lu, inode_bitmap = %llu",
2007 + block_group, bitmap_blk);
2010 + if (bh_uptodate_or_lock(bh))
2013 + spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
2014 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
2015 - bh = sb_getblk(sb, ext4_inode_bitmap(sb, desc));
2016 - if (!buffer_uptodate(bh)) {
2018 - if (!buffer_uptodate(bh)) {
2019 - ext4_init_inode_bitmap(sb, bh, block_group,
2021 - set_buffer_uptodate(bh);
2023 - unlock_buffer(bh);
2026 - bh = sb_bread(sb, ext4_inode_bitmap(sb, desc));
2027 + ext4_init_inode_bitmap(sb, bh, block_group, desc);
2028 + set_buffer_uptodate(bh);
2029 + unlock_buffer(bh);
2030 + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
2034 - ext4_error(sb, "read_inode_bitmap",
2035 + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
2036 + if (bh_submit_read(bh) < 0) {
2038 + ext4_error(sb, __func__,
2039 "Cannot read inode bitmap - "
2040 "block_group = %lu, inode_bitmap = %llu",
2041 - block_group, ext4_inode_bitmap(sb, desc));
2043 + block_group, bitmap_blk);
2049 @@ -157,6 +167,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
2050 struct ext4_super_block * es;
2051 struct ext4_sb_info *sbi;
2053 + ext4_group_t flex_group;
2055 if (atomic_read(&inode->i_count) > 1) {
2056 printk ("ext4_free_inode: inode has count=%d\n",
2057 @@ -199,7 +210,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
2059 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
2060 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
2061 - bitmap_bh = read_inode_bitmap(sb, block_group);
2062 + bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
2066 @@ -232,6 +243,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
2068 percpu_counter_dec(&sbi->s_dirs_counter);
2070 + if (sbi->s_log_groups_per_flex) {
2071 + flex_group = ext4_flex_group(sbi, block_group);
2072 + spin_lock(sb_bgl_lock(sbi, flex_group));
2073 + sbi->s_flex_groups[flex_group].free_inodes++;
2074 + spin_unlock(sb_bgl_lock(sbi, flex_group));
2077 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
2078 err = ext4_journal_dirty_metadata(handle, bh2);
2079 @@ -286,6 +303,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
2083 +#define free_block_ratio 10
2085 +static int find_group_flex(struct super_block *sb, struct inode *parent,
2086 + ext4_group_t *best_group)
2088 + struct ext4_sb_info *sbi = EXT4_SB(sb);
2089 + struct ext4_group_desc *desc;
2090 + struct buffer_head *bh;
2091 + struct flex_groups *flex_group = sbi->s_flex_groups;
2092 + ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
2093 + ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
2094 + ext4_group_t ngroups = sbi->s_groups_count;
2095 + int flex_size = ext4_flex_bg_size(sbi);
2096 + ext4_group_t best_flex = parent_fbg_group;
2097 + int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
2098 + int flexbg_free_blocks;
2099 + int flex_freeb_ratio;
2100 + ext4_group_t n_fbg_groups;
2103 + n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
2104 + sbi->s_log_groups_per_flex;
2106 +find_close_to_parent:
2107 + flexbg_free_blocks = flex_group[best_flex].free_blocks;
2108 + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
2109 + if (flex_group[best_flex].free_inodes &&
2110 + flex_freeb_ratio > free_block_ratio)
2111 + goto found_flexbg;
2113 + if (best_flex && best_flex == parent_fbg_group) {
2115 + goto find_close_to_parent;
2118 + for (i = 0; i < n_fbg_groups; i++) {
2119 + if (i == parent_fbg_group || i == parent_fbg_group - 1)
2122 + flexbg_free_blocks = flex_group[i].free_blocks;
2123 + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
2125 + if (flex_freeb_ratio > free_block_ratio &&
2126 + flex_group[i].free_inodes) {
2128 + goto found_flexbg;
2131 + if (flex_group[best_flex].free_inodes == 0 ||
2132 + (flex_group[i].free_blocks >
2133 + flex_group[best_flex].free_blocks &&
2134 + flex_group[i].free_inodes))
2138 + if (!flex_group[best_flex].free_inodes ||
2139 + !flex_group[best_flex].free_blocks)
2143 + for (i = best_flex * flex_size; i < ngroups &&
2144 + i < (best_flex + 1) * flex_size; i++) {
2145 + desc = ext4_get_group_desc(sb, i, &bh);
2146 + if (le16_to_cpu(desc->bg_free_inodes_count)) {
2158 * Orlov's allocator for directories.
2160 @@ -501,6 +592,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
2164 + ext4_group_t flex_group;
2166 /* Cannot create files in a deleted directory */
2167 if (!dir || !dir->i_nlink)
2168 @@ -514,6 +606,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
2173 + if (sbi->s_log_groups_per_flex) {
2174 + ret2 = find_group_flex(sb, dir, &group);
2178 if (S_ISDIR(mode)) {
2179 if (test_opt (sb, OLDALLOC))
2180 ret2 = find_group_dir(sb, dir, &group);
2181 @@ -522,6 +620,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
2183 ret2 = find_group_other(sb, dir, &group);
2189 @@ -534,7 +633,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
2193 - bitmap_bh = read_inode_bitmap(sb, group);
2194 + bitmap_bh = ext4_read_inode_bitmap(sb, group);
2198 @@ -600,7 +699,7 @@ got:
2199 /* We may have to initialize the block bitmap if it isn't already */
2200 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
2201 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2202 - struct buffer_head *block_bh = read_block_bitmap(sb, group);
2203 + struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
2205 BUFFER_TRACE(block_bh, "get block bitmap access");
2206 err = ext4_journal_get_write_access(handle, block_bh);
2207 @@ -639,7 +738,7 @@ got:
2209 /* When marking the block group with
2210 * ~EXT4_BG_INODE_UNINIT we don't want to depend
2211 - * on the value of bg_itable_unsed even though
2212 + * on the value of bg_itable_unused even though
2213 * mke2fs could have initialized the same for us.
2214 * Instead we calculated the value below
2216 @@ -676,6 +775,13 @@ got:
2217 percpu_counter_inc(&sbi->s_dirs_counter);
2220 + if (sbi->s_log_groups_per_flex) {
2221 + flex_group = ext4_flex_group(sbi, group);
2222 + spin_lock(sb_bgl_lock(sbi, flex_group));
2223 + sbi->s_flex_groups[flex_group].free_inodes--;
2224 + spin_unlock(sb_bgl_lock(sbi, flex_group));
2227 inode->i_uid = current->fsuid;
2228 if (test_opt (sb, GRPID))
2229 inode->i_gid = dir->i_gid;
2230 @@ -740,14 +846,10 @@ got:
2231 goto fail_free_drop;
2233 if (test_opt(sb, EXTENTS)) {
2234 - /* set extent flag only for diretory, file and normal symlink*/
2235 + /* set extent flag only for directory, file and normal symlink*/
2236 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
2237 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
2238 ext4_ext_tree_init(handle, inode);
2239 - err = ext4_update_incompat_feature(handle, sb,
2240 - EXT4_FEATURE_INCOMPAT_EXTENTS);
2242 - goto fail_free_drop;
2246 @@ -799,7 +901,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
2248 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
2249 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
2250 - bitmap_bh = read_inode_bitmap(sb, block_group);
2251 + bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
2253 ext4_warning(sb, __func__,
2254 "inode bitmap error for orphan %lu", ino);
2255 @@ -817,6 +919,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
2260 + * If the orphan has i_nlink > 0 then it should be able to be
2261 + * truncated, otherwise it won't be removed from the orphan list
2262 + * during processing and an infinite loop will result.
2264 + if (inode->i_nlink && !ext4_can_truncate(inode))
2267 if (NEXT_ORPHAN(inode) > max_ino)
2270 @@ -838,6 +948,7 @@ bad_orphan:
2271 printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
2272 NEXT_ORPHAN(inode));
2273 printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
2274 + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
2275 /* Avoid freeing blocks if we got a bad deleted inode */
2276 if (inode->i_nlink == 0)
2277 inode->i_blocks = 0;
2278 @@ -868,7 +979,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
2280 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
2282 - bitmap_bh = read_inode_bitmap(sb, i);
2283 + bitmap_bh = ext4_read_inode_bitmap(sb, i);
2287 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
2288 index 8d97077..3c0195a 100644
2289 --- a/fs/ext4/inode.c
2290 +++ b/fs/ext4/inode.c
2292 #include <linux/string.h>
2293 #include <linux/buffer_head.h>
2294 #include <linux/writeback.h>
2295 +#include <linux/pagevec.h>
2296 #include <linux/mpage.h>
2297 #include <linux/uio.h>
2298 #include <linux/bio.h>
2299 #include "ext4_jbd2.h"
2302 +#include "ext4_extents.h"
2304 +#define MPAGE_DA_EXTENT_TAIL 0x01
2306 +static inline int ext4_begin_ordered_truncate(struct inode *inode,
2309 + return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
2313 +static void ext4_invalidatepage(struct page *page, unsigned long offset);
2316 * Test whether an inode is a fast symlink.
2317 @@ -180,14 +193,18 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
2318 void ext4_delete_inode (struct inode * inode)
2323 + if (ext4_should_order_data(inode))
2324 + ext4_begin_ordered_truncate(inode, 0);
2325 truncate_inode_pages(&inode->i_data, 0);
2327 if (is_bad_inode(inode))
2330 - handle = start_transaction(inode);
2331 + handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
2332 if (IS_ERR(handle)) {
2333 + ext4_std_error(inode->i_sb, PTR_ERR(handle));
2335 * If we're going to skip the normal cleanup, we still need to
2336 * make sure that the in-core orphan linked list is properly
2337 @@ -200,8 +217,34 @@ void ext4_delete_inode (struct inode * inode)
2341 + err = ext4_mark_inode_dirty(handle, inode);
2343 + ext4_warning(inode->i_sb, __func__,
2344 + "couldn't mark inode dirty (err %d)", err);
2347 if (inode->i_blocks)
2348 ext4_truncate(inode);
2351 + * ext4_ext_truncate() doesn't reserve any slop when it
2352 + * restarts journal transactions; therefore there may not be
2353 + * enough credits left in the handle to remove the inode from
2354 + * the orphan list and set the dtime field.
2356 + if (handle->h_buffer_credits < 3) {
2357 + err = ext4_journal_extend(handle, 3);
2359 + err = ext4_journal_restart(handle, 3);
2361 + ext4_warning(inode->i_sb, __func__,
2362 + "couldn't extend journal (err %d)", err);
2364 + ext4_journal_stop(handle);
2370 * Kill off the orphan record which ext4_truncate created.
2371 * AKPM: I think this can be inside the above `if'.
2372 @@ -508,11 +551,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
2375 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
2376 - ext4_fsblk_t goal, int indirect_blks, int blks,
2377 - ext4_fsblk_t new_blocks[4], int *err)
2378 + ext4_lblk_t iblock, ext4_fsblk_t goal,
2379 + int indirect_blks, int blks,
2380 + ext4_fsblk_t new_blocks[4], int *err)
2383 - unsigned long count = 0;
2384 + unsigned long count = 0, blk_allocated = 0;
2386 ext4_fsblk_t current_block = 0;
2388 @@ -525,12 +569,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
2389 * the first direct block of this branch. That's the
2390 * minimum number of blocks need to allocate(required)
2392 - target = blks + indirect_blks;
2395 + /* first we try to allocate the indirect blocks */
2396 + target = indirect_blks;
2397 + while (target > 0) {
2399 /* allocating blocks for indirect blocks and direct blocks */
2400 - current_block = ext4_new_blocks(handle,inode,goal,&count,err);
2401 + current_block = ext4_new_meta_blocks(handle, inode,
2402 + goal, &count, err);
2406 @@ -540,16 +585,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
2407 new_blocks[index++] = current_block++;
2414 + * save the new block number
2415 + * for the first direct block
2417 + new_blocks[index] = current_block;
2418 + printk(KERN_INFO "%s returned more blocks than "
2419 + "requested\n", __func__);
2425 - /* save the new block number for the first direct block */
2426 - new_blocks[index] = current_block;
2428 + target = blks - count ;
2429 + blk_allocated = count;
2432 + /* Now allocate data blocks */
2434 + /* allocating blocks for data blocks */
2435 + current_block = ext4_new_blocks(handle, inode, iblock,
2436 + goal, &count, err);
2437 + if (*err && (target == blks)) {
2439 + * if the allocation failed and we didn't allocate
2440 + * any blocks before
2445 + if (target == blks) {
2447 + * save the new block number
2448 + * for the first direct block
2450 + new_blocks[index] = current_block;
2452 + blk_allocated += count;
2455 /* total number of blocks allocated for direct blocks */
2457 + ret = blk_allocated;
2461 @@ -584,8 +661,9 @@ failed_out:
2462 * as described above and return 0.
2464 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
2465 - int indirect_blks, int *blks, ext4_fsblk_t goal,
2466 - ext4_lblk_t *offsets, Indirect *branch)
2467 + ext4_lblk_t iblock, int indirect_blks,
2468 + int *blks, ext4_fsblk_t goal,
2469 + ext4_lblk_t *offsets, Indirect *branch)
2471 int blocksize = inode->i_sb->s_blocksize;
2473 @@ -595,7 +673,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
2474 ext4_fsblk_t new_blocks[4];
2475 ext4_fsblk_t current_block;
2477 - num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
2478 + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
2479 *blks, new_blocks, &err);
2482 @@ -799,6 +877,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
2483 struct ext4_inode_info *ei = EXT4_I(inode);
2485 ext4_fsblk_t first_block = 0;
2489 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
2490 @@ -855,8 +934,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
2492 * Block out ext4_truncate while we alter the tree
2494 - err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
2495 - offsets + (partial - chain), partial);
2496 + err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
2498 + offsets + (partial - chain), partial);
2501 * The ext4_splice_branch call will free and forget any buffers
2502 @@ -873,8 +953,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
2503 * protect it if you're about to implement concurrent
2504 * ext4_get_block() -bzzz
2506 - if (!err && extend_disksize && inode->i_size > ei->i_disksize)
2507 - ei->i_disksize = inode->i_size;
2508 + if (!err && extend_disksize) {
2509 + disksize = ((loff_t) iblock + count) << inode->i_blkbits;
2510 + if (disksize > i_size_read(inode))
2511 + disksize = i_size_read(inode);
2512 + if (disksize > ei->i_disksize)
2513 + ei->i_disksize = disksize;
2518 @@ -897,23 +982,74 @@ out:
2522 -/* Maximum number of blocks we map for direct IO at once. */
2523 -#define DIO_MAX_BLOCKS 4096
2525 - * Number of credits we need for writing DIO_MAX_BLOCKS:
2526 - * We need sb + group descriptor + bitmap + inode -> 4
2527 - * For B blocks with A block pointers per block we need:
2528 - * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
2529 - * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
2530 + * Calculate the number of metadata blocks need to reserve
2531 + * to allocate @blocks for non extent file based file
2533 -#define DIO_CREDITS 25
2534 +static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
2536 + int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
2537 + int ind_blks, dind_blks, tind_blks;
2539 + /* number of new indirect blocks needed */
2540 + ind_blks = (blocks + icap - 1) / icap;
2542 + dind_blks = (ind_blks + icap - 1) / icap;
2546 + return ind_blks + dind_blks + tind_blks;
2550 + * Calculate the number of metadata blocks need to reserve
2551 + * to allocate given number of blocks
2553 +static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
2558 + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
2559 + return ext4_ext_calc_metadata_amount(inode, blocks);
2561 + return ext4_indirect_calc_metadata_amount(inode, blocks);
2564 +static void ext4_da_update_reserve_space(struct inode *inode, int used)
2566 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2567 + int total, mdb, mdb_free;
2569 + spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2570 + /* recalculate the number of metablocks still need to be reserved */
2571 + total = EXT4_I(inode)->i_reserved_data_blocks - used;
2572 + mdb = ext4_calc_metadata_amount(inode, total);
2574 + /* figure out how many metablocks to release */
2575 + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
2576 + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
2578 + /* Account for allocated meta_blocks */
2579 + mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
2581 + /* update fs free blocks counter for truncate case */
2582 + percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free);
2584 + /* update per-inode reservations */
2585 + BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
2586 + EXT4_I(inode)->i_reserved_data_blocks -= used;
2588 + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
2589 + EXT4_I(inode)->i_reserved_meta_blocks = mdb;
2590 + EXT4_I(inode)->i_allocated_meta_blocks = 0;
2591 + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2595 + * The ext4_get_blocks_wrap() function try to look up the requested blocks,
2596 + * and returns if the blocks are already mapped.
2599 - * ext4_ext4 get_block() wrapper function
2600 - * It will do a look up first, and returns if the blocks already mapped.
2601 * Otherwise it takes the write lock of the i_data_sem and allocate blocks
2602 * and store the allocated blocks in the result buffer head and mark it
2604 @@ -934,7 +1070,7 @@ out:
2606 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
2607 unsigned long max_blocks, struct buffer_head *bh,
2608 - int create, int extend_disksize)
2609 + int create, int extend_disksize, int flag)
2613 @@ -975,6 +1111,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
2614 * with create == 1 flag.
2616 down_write((&EXT4_I(inode)->i_data_sem));
2619 + * if the caller is from delayed allocation writeout path
2620 + * we have already reserved fs blocks for allocation
2621 + * let the underlying get_block() function know to
2622 + * avoid double accounting
2625 + EXT4_I(inode)->i_delalloc_reserved_flag = 1;
2627 * We need to check for EXT4 here because migrate
2628 * could have changed the inode type in between
2629 @@ -996,23 +1141,39 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
2635 + EXT4_I(inode)->i_delalloc_reserved_flag = 0;
2637 + * Update reserved blocks/metadata blocks
2638 + * after successful block allocation
2639 + * which were deferred till now
2641 + if ((retval > 0) && buffer_delay(bh))
2642 + ext4_da_update_reserve_space(inode, retval);
2645 up_write((&EXT4_I(inode)->i_data_sem));
2649 +/* Maximum number of blocks we map for direct IO at once. */
2650 +#define DIO_MAX_BLOCKS 4096
2652 static int ext4_get_block(struct inode *inode, sector_t iblock,
2653 struct buffer_head *bh_result, int create)
2655 handle_t *handle = ext4_journal_current_handle();
2656 int ret = 0, started = 0;
2657 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2660 if (create && !handle) {
2661 /* Direct IO write... */
2662 if (max_blocks > DIO_MAX_BLOCKS)
2663 max_blocks = DIO_MAX_BLOCKS;
2664 - handle = ext4_journal_start(inode, DIO_CREDITS +
2665 - 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
2666 + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
2667 + handle = ext4_journal_start(inode, dio_credits);
2668 if (IS_ERR(handle)) {
2669 ret = PTR_ERR(handle);
2671 @@ -1021,7 +1182,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
2674 ret = ext4_get_blocks_wrap(handle, inode, iblock,
2675 - max_blocks, bh_result, create, 0);
2676 + max_blocks, bh_result, create, 0, 0);
2678 bh_result->b_size = (ret << inode->i_blkbits);
2680 @@ -1047,7 +1208,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
2681 dummy.b_blocknr = -1000;
2682 buffer_trace_init(&dummy.b_history);
2683 err = ext4_get_blocks_wrap(handle, inode, block, 1,
2684 - &dummy, create, 1);
2685 + &dummy, create, 1, 0);
2687 * ext4_get_blocks_handle() returns number of blocks
2688 * mapped. 0 in case of a HOLE.
2689 @@ -1203,19 +1364,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
2693 - page = __grab_cache_page(mapping, index);
2698 handle = ext4_journal_start(inode, needed_blocks);
2699 if (IS_ERR(handle)) {
2700 - unlock_page(page);
2701 - page_cache_release(page);
2702 ret = PTR_ERR(handle);
2706 + page = __grab_cache_page(mapping, index);
2708 + ext4_journal_stop(handle);
2714 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
2717 @@ -1225,8 +1387,8 @@ retry:
2721 - ext4_journal_stop(handle);
2723 + ext4_journal_stop(handle);
2724 page_cache_release(page);
2727 @@ -1236,15 +1398,6 @@ out:
2731 -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
2733 - int err = jbd2_journal_dirty_data(handle, bh);
2735 - ext4_journal_abort_handle(__func__, __func__,
2740 /* For write_end() in data=journal mode */
2741 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
2743 @@ -1255,29 +1408,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
2747 - * Generic write_end handler for ordered and writeback ext4 journal modes.
2748 - * We can't use generic_write_end, because that unlocks the page and we need to
2749 - * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
2750 - * after block_write_end.
2752 -static int ext4_generic_write_end(struct file *file,
2753 - struct address_space *mapping,
2754 - loff_t pos, unsigned len, unsigned copied,
2755 - struct page *page, void *fsdata)
2757 - struct inode *inode = file->f_mapping->host;
2759 - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2761 - if (pos+copied > inode->i_size) {
2762 - i_size_write(inode, pos+copied);
2763 - mark_inode_dirty(inode);
2770 * We need to pick up the new inode size which generic_commit_write gave us
2771 * `file' can be NULL - eg, when called from page_symlink().
2773 @@ -1290,15 +1420,10 @@ static int ext4_ordered_write_end(struct file *file,
2774 struct page *page, void *fsdata)
2776 handle_t *handle = ext4_journal_current_handle();
2777 - struct inode *inode = file->f_mapping->host;
2778 - unsigned from, to;
2779 + struct inode *inode = mapping->host;
2782 - from = pos & (PAGE_CACHE_SIZE - 1);
2785 - ret = walk_page_buffers(handle, page_buffers(page),
2786 - from, to, NULL, ext4_journal_dirty_data);
2787 + ret = ext4_jbd2_file_inode(handle, inode);
2791 @@ -1311,7 +1436,7 @@ static int ext4_ordered_write_end(struct file *file,
2792 new_i_size = pos + copied;
2793 if (new_i_size > EXT4_I(inode)->i_disksize)
2794 EXT4_I(inode)->i_disksize = new_i_size;
2795 - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
2796 + ret2 = generic_write_end(file, mapping, pos, len, copied,
2800 @@ -1320,8 +1445,6 @@ static int ext4_ordered_write_end(struct file *file,
2801 ret2 = ext4_journal_stop(handle);
2804 - unlock_page(page);
2805 - page_cache_release(page);
2807 return ret ? ret : copied;
2809 @@ -1332,7 +1455,7 @@ static int ext4_writeback_write_end(struct file *file,
2810 struct page *page, void *fsdata)
2812 handle_t *handle = ext4_journal_current_handle();
2813 - struct inode *inode = file->f_mapping->host;
2814 + struct inode *inode = mapping->host;
2818 @@ -1340,7 +1463,7 @@ static int ext4_writeback_write_end(struct file *file,
2819 if (new_i_size > EXT4_I(inode)->i_disksize)
2820 EXT4_I(inode)->i_disksize = new_i_size;
2822 - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
2823 + ret2 = generic_write_end(file, mapping, pos, len, copied,
2827 @@ -1349,8 +1472,6 @@ static int ext4_writeback_write_end(struct file *file,
2828 ret2 = ext4_journal_stop(handle);
2831 - unlock_page(page);
2832 - page_cache_release(page);
2834 return ret ? ret : copied;
2836 @@ -1389,15 +1510,1028 @@ static int ext4_journalled_write_end(struct file *file,
2840 + unlock_page(page);
2841 ret2 = ext4_journal_stop(handle);
2844 - unlock_page(page);
2845 page_cache_release(page);
2847 return ret ? ret : copied;
2850 +static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
2852 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2853 + unsigned long md_needed, mdblocks, total = 0;
2856 + * recalculate the amount of metadata blocks to reserve
2857 + * in order to allocate nrblocks
2858 + * worst case is one extent per block
2860 + spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2861 + total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
2862 + mdblocks = ext4_calc_metadata_amount(inode, total);
2863 + BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
2865 + md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
2866 + total = md_needed + nrblocks;
2868 + if (ext4_has_free_blocks(sbi, total) < total) {
2869 + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2872 + /* reduce fs free blocks counter */
2873 + percpu_counter_sub(&sbi->s_freeblocks_counter, total);
2875 + EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
2876 + EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
2878 + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2879 + return 0; /* success */
2882 +static void ext4_da_release_space(struct inode *inode, int to_free)
2884 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2885 + int total, mdb, mdb_free, release;
2888 + return; /* Nothing to release, exit */
2890 + spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2892 + if (!EXT4_I(inode)->i_reserved_data_blocks) {
2894 + * if there is no reserved blocks, but we try to free some
2895 + * then the counter is messed up somewhere.
2896 + * but since this function is called from invalidate
2897 + * page, it's harmless to return without any action
2899 + printk(KERN_INFO "ext4 delalloc try to release %d reserved "
2900 + "blocks for inode %lu, but there is no reserved "
2901 + "data blocks\n", to_free, inode->i_ino);
2902 + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2906 + /* recalculate the number of metablocks still need to be reserved */
2907 + total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
2908 + mdb = ext4_calc_metadata_amount(inode, total);
2910 + /* figure out how many metablocks to release */
2911 + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
2912 + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
2914 + release = to_free + mdb_free;
2916 + /* update fs free blocks counter for truncate case */
2917 + percpu_counter_add(&sbi->s_freeblocks_counter, release);
2919 + /* update per-inode reservations */
2920 + BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
2921 + EXT4_I(inode)->i_reserved_data_blocks -= to_free;
2923 + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
2924 + EXT4_I(inode)->i_reserved_meta_blocks = mdb;
2925 + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2928 +static void ext4_da_page_release_reservation(struct page *page,
2929 + unsigned long offset)
2931 + int to_release = 0;
2932 + struct buffer_head *head, *bh;
2933 + unsigned int curr_off = 0;
2935 + head = page_buffers(page);
2938 + unsigned int next_off = curr_off + bh->b_size;
2940 + if ((offset <= curr_off) && (buffer_delay(bh))) {
2942 + clear_buffer_delay(bh);
2944 + curr_off = next_off;
2945 + } while ((bh = bh->b_this_page) != head);
2946 + ext4_da_release_space(page->mapping->host, to_release);
2950 + * Delayed allocation stuff
2953 +struct mpage_da_data {
2954 + struct inode *inode;
2955 + struct buffer_head lbh; /* extent of blocks */
2956 + unsigned long first_page, next_page; /* extent of pages */
2957 + get_block_t *get_block;
2958 + struct writeback_control *wbc;
2960 + long pages_written;
2964 + * mpage_da_submit_io - walks through extent of pages and try to write
2965 + * them with writepage() call back
2967 + * @mpd->inode: inode
2968 + * @mpd->first_page: first page of the extent
2969 + * @mpd->next_page: page after the last page of the extent
2970 + * @mpd->get_block: the filesystem's block mapper function
2972 + * By the time mpage_da_submit_io() is called we expect all blocks
2973 + * to be allocated. this may be wrong if allocation failed.
2975 + * As pages are already locked by write_cache_pages(), we can't use it
2977 +static int mpage_da_submit_io(struct mpage_da_data *mpd)
2979 + struct address_space *mapping = mpd->inode->i_mapping;
2980 + int ret = 0, err, nr_pages, i;
2981 + unsigned long index, end;
2982 + struct pagevec pvec;
2984 + BUG_ON(mpd->next_page <= mpd->first_page);
2985 + pagevec_init(&pvec, 0);
2986 + index = mpd->first_page;
2987 + end = mpd->next_page - 1;
2989 + while (index <= end) {
2990 + /* XXX: optimize tail */
2991 + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2992 + if (nr_pages == 0)
2994 + for (i = 0; i < nr_pages; i++) {
2995 + struct page *page = pvec.pages[i];
2997 + index = page->index;
3002 + err = mapping->a_ops->writepage(page, mpd->wbc);
3004 + mpd->pages_written++;
3006 + * In error case, we have to continue because
3007 + * remaining pages are still locked
3008 + * XXX: unlock and re-dirty them?
3013 + pagevec_release(&pvec);
3019 + * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
3021 + * @mpd->inode - inode to walk through
3022 + * @exbh->b_blocknr - first block on a disk
3023 + * @exbh->b_size - amount of space in bytes
3024 + * @logical - first logical block to start assignment with
3026 + * the function goes through all passed space and put actual disk
3027 + * block numbers into buffer heads, dropping BH_Delay
3029 +static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
3030 + struct buffer_head *exbh)
3032 + struct inode *inode = mpd->inode;
3033 + struct address_space *mapping = inode->i_mapping;
3034 + int blocks = exbh->b_size >> inode->i_blkbits;
3035 + sector_t pblock = exbh->b_blocknr, cur_logical;
3036 + struct buffer_head *head, *bh;
3037 + pgoff_t index, end;
3038 + struct pagevec pvec;
3041 + index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
3042 + end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
3043 + cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
3045 + pagevec_init(&pvec, 0);
3047 + while (index <= end) {
3048 + /* XXX: optimize tail */
3049 + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
3050 + if (nr_pages == 0)
3052 + for (i = 0; i < nr_pages; i++) {
3053 + struct page *page = pvec.pages[i];
3055 + index = page->index;
3060 + BUG_ON(!PageLocked(page));
3061 + BUG_ON(PageWriteback(page));
3062 + BUG_ON(!page_has_buffers(page));
3064 + bh = page_buffers(page);
3067 + /* skip blocks out of the range */
3069 + if (cur_logical >= logical)
3072 + } while ((bh = bh->b_this_page) != head);
3075 + if (cur_logical >= logical + blocks)
3077 + if (buffer_delay(bh)) {
3078 + bh->b_blocknr = pblock;
3079 + clear_buffer_delay(bh);
3080 + bh->b_bdev = inode->i_sb->s_bdev;
3081 + } else if (buffer_unwritten(bh)) {
3082 + bh->b_blocknr = pblock;
3083 + clear_buffer_unwritten(bh);
3084 + set_buffer_mapped(bh);
3085 + set_buffer_new(bh);
3086 + bh->b_bdev = inode->i_sb->s_bdev;
3087 + } else if (buffer_mapped(bh))
3088 + BUG_ON(bh->b_blocknr != pblock);
3092 + } while ((bh = bh->b_this_page) != head);
3094 + pagevec_release(&pvec);
3100 + * __unmap_underlying_blocks - just a helper function to unmap
3101 + * set of blocks described by @bh
3103 +static inline void __unmap_underlying_blocks(struct inode *inode,
3104 + struct buffer_head *bh)
3106 + struct block_device *bdev = inode->i_sb->s_bdev;
3109 + blocks = bh->b_size >> inode->i_blkbits;
3110 + for (i = 0; i < blocks; i++)
3111 + unmap_underlying_metadata(bdev, bh->b_blocknr + i);
3115 + * mpage_da_map_blocks - go through given space
3117 + * @mpd->lbh - bh describing space
3118 + * @mpd->get_block - the filesystem's block mapper function
3120 + * The function skips space we know is already mapped to disk blocks.
3123 +static void mpage_da_map_blocks(struct mpage_da_data *mpd)
3126 + struct buffer_head *lbh = &mpd->lbh;
3127 + sector_t next = lbh->b_blocknr;
3128 + struct buffer_head new;
3131 + * We consider only non-mapped and non-allocated blocks
3133 + if (buffer_mapped(lbh) && !buffer_delay(lbh))
3136 + new.b_state = lbh->b_state;
3137 + new.b_blocknr = 0;
3138 + new.b_size = lbh->b_size;
3141 + * If we didn't accumulate anything
3142 + * to write simply return
3146 + err = mpd->get_block(mpd->inode, next, &new, 1);
3149 + BUG_ON(new.b_size == 0);
3151 + if (buffer_new(&new))
3152 + __unmap_underlying_blocks(mpd->inode, &new);
3155 + * If blocks are delayed marked, we need to
3156 + * put actual blocknr and drop delayed bit
3158 + if (buffer_delay(lbh) || buffer_unwritten(lbh))
3159 + mpage_put_bnr_to_bhs(mpd, next, &new);
3164 +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
3165 + (1 << BH_Delay) | (1 << BH_Unwritten))
3168 + * mpage_add_bh_to_extent - try to add one more block to extent of blocks
3170 + * @mpd->lbh - extent of blocks
3171 + * @logical - logical number of the block in the file
3172 + * @bh - bh of the block (used to access block's state)
3174 + * the function is used to collect contiguous blocks in the same state
3176 +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
3177 + sector_t logical, struct buffer_head *bh)
3180 + size_t b_size = bh->b_size;
3181 + struct buffer_head *lbh = &mpd->lbh;
3182 + int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
3184 + /* check if the reserved journal credits might overflow */
3185 + if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
3186 + if (nrblocks >= EXT4_MAX_TRANS_DATA) {
3188 + * With non-extent format we are limited by the journal
3189 + * credit available. Total credit needed to insert
3190 + * nrblocks contiguous blocks is dependent on the
3191 + * nrblocks. So limit nrblocks.
3194 + } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
3195 + EXT4_MAX_TRANS_DATA) {
3197 + * Adding the new buffer_head would make it cross the
3198 + * allowed limit for which we have journal credit
3199 + * reserved. So limit the new bh->b_size
3201 + b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
3202 + mpd->inode->i_blkbits;
3203 + /* we will do mpage_da_submit_io in the next loop */
3207 + * First block in the extent
3209 + if (lbh->b_size == 0) {
3210 + lbh->b_blocknr = logical;
3211 + lbh->b_size = b_size;
3212 + lbh->b_state = bh->b_state & BH_FLAGS;
3216 + next = lbh->b_blocknr + nrblocks;
3218 + * Can we merge the block to our big extent?
3220 + if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
3221 + lbh->b_size += b_size;
3227 + * We couldn't merge the block to our extent, so we
3228 + * need to flush current extent and start new one
3230 + mpage_da_map_blocks(mpd);
3231 + mpage_da_submit_io(mpd);
3237 + * __mpage_da_writepage - finds extent of pages and blocks
3239 + * @page: page to consider
3240 + * @wbc: not used, we just follow rules
3243 + * The function finds extents of pages and scans them for all blocks.
3245 +static int __mpage_da_writepage(struct page *page,
3246 + struct writeback_control *wbc, void *data)
3248 + struct mpage_da_data *mpd = data;
3249 + struct inode *inode = mpd->inode;
3250 + struct buffer_head *bh, *head, fake;
3253 + if (mpd->io_done) {
3255 + * Rest of the page in the page_vec
3256 + * redirty them and skip them. We will
3257 + * try to write them again after
3258 + * starting a new transaction
3260 + redirty_page_for_writepage(wbc, page);
3261 + unlock_page(page);
3262 + return MPAGE_DA_EXTENT_TAIL;
3265 + * Can we merge this page to current extent?
3267 + if (mpd->next_page != page->index) {
3269 + * Nope, we can't. So, we map non-allocated blocks
3270 + * and start IO on them using writepage()
3272 + if (mpd->next_page != mpd->first_page) {
3273 + mpage_da_map_blocks(mpd);
3274 + mpage_da_submit_io(mpd);
3276 + * skip rest of the page in the page_vec
3279 + redirty_page_for_writepage(wbc, page);
3280 + unlock_page(page);
3281 + return MPAGE_DA_EXTENT_TAIL;
3285 + * Start next extent of pages ...
3287 + mpd->first_page = page->index;
3292 + mpd->lbh.b_size = 0;
3293 + mpd->lbh.b_state = 0;
3294 + mpd->lbh.b_blocknr = 0;
3297 + mpd->next_page = page->index + 1;
3298 + logical = (sector_t) page->index <<
3299 + (PAGE_CACHE_SHIFT - inode->i_blkbits);
3301 + if (!page_has_buffers(page)) {
3303 + * There are no attached buffer heads yet (mmap?)
3304 + * we treat the page as full of dirty blocks
3307 + bh->b_size = PAGE_CACHE_SIZE;
3309 + set_buffer_dirty(bh);
3310 + set_buffer_uptodate(bh);
3311 + mpage_add_bh_to_extent(mpd, logical, bh);
3313 + return MPAGE_DA_EXTENT_TAIL;
3316 + * Page with regular buffer heads, just add all dirty ones
3318 + head = page_buffers(page);
3321 + BUG_ON(buffer_locked(bh));
3322 + if (buffer_dirty(bh) &&
3323 + (!buffer_mapped(bh) || buffer_delay(bh))) {
3324 + mpage_add_bh_to_extent(mpd, logical, bh);
3326 + return MPAGE_DA_EXTENT_TAIL;
3329 + } while ((bh = bh->b_this_page) != head);
3336 + * mpage_da_writepages - walk the list of dirty pages of the given
3337 + * address space, allocates non-allocated blocks, maps newly-allocated
3338 + * blocks to existing bhs and issues IO on them
3340 + * @mapping: address space structure to write
3341 + * @wbc: subtract the number of written pages from *@wbc->nr_to_write
3342 + * @get_block: the filesystem's block mapper function.
3344 + * This is a library function, which implements the writepages()
3345 + * address_space_operation.
3347 +static int mpage_da_writepages(struct address_space *mapping,
3348 + struct writeback_control *wbc,
3349 + get_block_t get_block)
3351 + struct mpage_da_data mpd;
3356 + return generic_writepages(mapping, wbc);
3359 + mpd.inode = mapping->host;
3360 + mpd.lbh.b_size = 0;
3361 + mpd.lbh.b_state = 0;
3362 + mpd.lbh.b_blocknr = 0;
3363 + mpd.first_page = 0;
3364 + mpd.next_page = 0;
3365 + mpd.get_block = get_block;
3367 + mpd.pages_written = 0;
3369 + to_write = wbc->nr_to_write;
3371 + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
3374 + * Handle last extent of pages
3376 + if (!mpd.io_done && mpd.next_page != mpd.first_page) {
3377 + mpage_da_map_blocks(&mpd);
3378 + mpage_da_submit_io(&mpd);
3381 + wbc->nr_to_write = to_write - mpd.pages_written;
3386 + * this is a special callback for ->write_begin() only
3387 + * its intention is to return a mapped block or reserve space
3389 +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
3390 + struct buffer_head *bh_result, int create)
3394 + BUG_ON(create == 0);
3395 + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
3398 + * first, we need to know whether the block is allocated already
3399 + * preallocated blocks are unmapped but should be treated
3400 + * the same as allocated blocks.
3402 + ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0);
3403 + if ((ret == 0) && !buffer_delay(bh_result)) {
3404 + /* the block isn't (pre)allocated yet, let's reserve space */
3406 + * XXX: __block_prepare_write() unmaps passed block,
3409 + ret = ext4_da_reserve_space(inode, 1);
3411 + /* not enough space to reserve */
3414 + map_bh(bh_result, inode->i_sb, 0);
3415 + set_buffer_new(bh_result);
3416 + set_buffer_delay(bh_result);
3417 + } else if (ret > 0) {
3418 + bh_result->b_size = (ret << inode->i_blkbits);
3424 +#define EXT4_DELALLOC_RSVED 1
3425 +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
3426 + struct buffer_head *bh_result, int create)
3429 + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3430 + loff_t disksize = EXT4_I(inode)->i_disksize;
3431 + handle_t *handle = NULL;
3433 + handle = ext4_journal_current_handle();
3435 + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
3436 + bh_result, 0, 0, 0);
3439 + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
3440 + bh_result, create, 0, EXT4_DELALLOC_RSVED);
3444 + bh_result->b_size = (ret << inode->i_blkbits);
3447 + * Update on-disk size along with block allocation
3448 + * we don't use 'extend_disksize' as size may change
3449 + * within already allocated block -bzzz
3451 + disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
3452 + if (disksize > i_size_read(inode))
3453 + disksize = i_size_read(inode);
3454 + if (disksize > EXT4_I(inode)->i_disksize) {
3456 + * XXX: replace with spinlock if seen contended -bzzz
3458 + down_write(&EXT4_I(inode)->i_data_sem);
3459 + if (disksize > EXT4_I(inode)->i_disksize)
3460 + EXT4_I(inode)->i_disksize = disksize;
3461 + up_write(&EXT4_I(inode)->i_data_sem);
3463 + if (EXT4_I(inode)->i_disksize == disksize) {
3464 + ret = ext4_mark_inode_dirty(handle, inode);
3473 +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
3476 + * unmapped buffer is possible for holes.
3477 + * delay buffer is possible with delayed allocation
3479 + return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
3482 +static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
3483 + struct buffer_head *bh_result, int create)
3486 + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3489 + * we don't want to do block allocation in writepage
3490 + * so call get_block_wrap with create = 0
3492 + ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
3493 + bh_result, 0, 0, 0);
3495 + bh_result->b_size = (ret << inode->i_blkbits);
3502 + * get called via ext4_da_writepages after taking page lock (have journal handle)
3503 + * get called via journal_submit_inode_data_buffers (no journal handle)
3504 + * get called via shrink_page_list via pdflush (no journal handle)
3505 + * or grab_page_cache when doing write_begin (have journal handle)
3507 +static int ext4_da_writepage(struct page *page,
3508 + struct writeback_control *wbc)
3512 + unsigned long len;
3513 + struct buffer_head *page_bufs;
3514 + struct inode *inode = page->mapping->host;
3516 + size = i_size_read(inode);
3517 + if (page->index == size >> PAGE_CACHE_SHIFT)
3518 + len = size & ~PAGE_CACHE_MASK;
3520 + len = PAGE_CACHE_SIZE;
3522 + if (page_has_buffers(page)) {
3523 + page_bufs = page_buffers(page);
3524 + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
3525 + ext4_bh_unmapped_or_delay)) {
3527 + * We don't want to do block allocation
3528 + * So redirty the page and return
3529 + * We may reach here when we do a journal commit
3530 + * via journal_submit_inode_data_buffers.
3531 + * If we don't have mapping block we just ignore
3532 + * them. We can also reach here via shrink_page_list
3534 + redirty_page_for_writepage(wbc, page);
3535 + unlock_page(page);
3540 + * The test for page_has_buffers() is subtle:
3541 + * We know the page is dirty but it lost buffers. That means
3542 + * that at some moment in time after write_begin()/write_end()
3543 + * has been called all buffers have been clean and thus they
3544 + * must have been written at least once. So they are all
3545 + * mapped and we can happily proceed with mapping them
3546 + * and writing the page.
3548 + * Try to initialize the buffer_heads and check whether
3549 + * all are mapped and non delay. We don't want to
3550 + * do block allocation here.
3552 + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
3553 + ext4_normal_get_block_write);
3555 + page_bufs = page_buffers(page);
3556 + /* check whether all are mapped and non delay */
3557 + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
3558 + ext4_bh_unmapped_or_delay)) {
3559 + redirty_page_for_writepage(wbc, page);
3560 + unlock_page(page);
3565 + * We can't do block allocation here
3566 + * so just redirty the page and unlock
3569 + redirty_page_for_writepage(wbc, page);
3570 + unlock_page(page);
3575 + if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
3576 + ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
3578 + ret = block_write_full_page(page,
3579 + ext4_normal_get_block_write,
3586 + * This is called via ext4_da_writepages() to
3587 + * calculate the total number of credits to reserve to fit
3588 + * a single extent allocation into a single transaction,
3589 + * ext4_da_writepages() will loop calling this before
3590 + * the block allocation.
3593 +static int ext4_da_writepages_trans_blocks(struct inode *inode)
3595 + int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
3598 + * With non-extent format the journal credit needed to
3599 + * insert nrblocks contiguous block is dependent on
3600 + * number of contiguous block. So we will limit
3601 + * number of contiguous block to a sane value
3603 + if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
3604 + (max_blocks > EXT4_MAX_TRANS_DATA))
3605 + max_blocks = EXT4_MAX_TRANS_DATA;
3607 + return ext4_chunk_trans_blocks(inode, max_blocks);
3610 +static int ext4_da_writepages(struct address_space *mapping,
3611 + struct writeback_control *wbc)
3613 + handle_t *handle = NULL;
3614 + loff_t range_start = 0;
3615 + struct inode *inode = mapping->host;
3616 + int needed_blocks, ret = 0, nr_to_writebump = 0;
3617 + long to_write, pages_skipped = 0;
3618 + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
3621 + * No pages to write? This is mainly a kludge to avoid starting
3622 + * a transaction for special inodes like journal inode on last iput()
3623 + * because that could violate lock ordering on umount
3625 + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
3628 + * Make sure nr_to_write is >= sbi->s_mb_stream_request
3629 + * This makes sure small file blocks are allocated in a
3630 + * single attempt. This ensures that small files
3631 + * get less fragmented.
3633 + if (wbc->nr_to_write < sbi->s_mb_stream_request) {
3634 + nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
3635 + wbc->nr_to_write = sbi->s_mb_stream_request;
3638 + if (!wbc->range_cyclic)
3640 + * If range_cyclic is not set force range_cont
3641 + * and save the old writeback_index
3643 + wbc->range_cont = 1;
3645 + range_start = wbc->range_start;
3646 + pages_skipped = wbc->pages_skipped;
3649 + to_write = wbc->nr_to_write;
3650 + while (!ret && to_write > 0) {
3653 + * we insert one extent at a time. So we need
3654 + * credit needed for single extent allocation.
3655 + * journalled mode is currently not supported
3658 + BUG_ON(ext4_should_journal_data(inode));
3659 + needed_blocks = ext4_da_writepages_trans_blocks(inode);
3661 + /* start a new transaction*/
3662 + handle = ext4_journal_start(inode, needed_blocks);
3663 + if (IS_ERR(handle)) {
3664 + ret = PTR_ERR(handle);
3665 + printk(KERN_EMERG "%s: jbd2_start: "
3666 + "%ld pages, ino %lu; err %d\n", __func__,
3667 + wbc->nr_to_write, inode->i_ino, ret);
3669 + goto out_writepages;
3671 + if (ext4_should_order_data(inode)) {
3673 + * With ordered mode we need to add
3674 + * the inode to the journal handle
3675 + * when we do block allocation.
3677 + ret = ext4_jbd2_file_inode(handle, inode);
3679 + ext4_journal_stop(handle);
3680 + goto out_writepages;
3684 + to_write -= wbc->nr_to_write;
3685 + ret = mpage_da_writepages(mapping, wbc,
3686 + ext4_da_get_block_write);
3687 + ext4_journal_stop(handle);
3688 + if (ret == MPAGE_DA_EXTENT_TAIL) {
3690 + * got one extent now try with
3691 + * rest of the pages
3693 + to_write += wbc->nr_to_write;
3695 + } else if (wbc->nr_to_write) {
3697 + * There is no more writeout needed
3698 + * or we requested a nonblocking writeout
3699 + * and we found the device congested
3701 + to_write += wbc->nr_to_write;
3704 + wbc->nr_to_write = to_write;
3707 + if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
3708 + /* We skipped pages in this loop */
3709 + wbc->range_start = range_start;
3710 + wbc->nr_to_write = to_write +
3711 + wbc->pages_skipped - pages_skipped;
3712 + wbc->pages_skipped = pages_skipped;
3713 + goto restart_loop;
3717 + wbc->nr_to_write = to_write - nr_to_writebump;
3718 + wbc->range_start = range_start;
3722 +static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3723 + loff_t pos, unsigned len, unsigned flags,
3724 + struct page **pagep, void **fsdata)
3726 + int ret, retries = 0;
3727 + struct page *page;
3729 + unsigned from, to;
3730 + struct inode *inode = mapping->host;
3733 + index = pos >> PAGE_CACHE_SHIFT;
3734 + from = pos & (PAGE_CACHE_SIZE - 1);
3739 + * With delayed allocation, we don't log the i_disksize update
3740 + * if there is delayed block allocation. But we still need
3741 + * to journal the i_disksize update if writes go to the end
3742 + * of file which has an already mapped buffer.
3744 + handle = ext4_journal_start(inode, 1);
3745 + if (IS_ERR(handle)) {
3746 + ret = PTR_ERR(handle);
3750 + page = __grab_cache_page(mapping, index);
3752 + ext4_journal_stop(handle);
3758 + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
3759 + ext4_da_get_block_prep);
3761 + unlock_page(page);
3762 + ext4_journal_stop(handle);
3763 + page_cache_release(page);
3766 + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3773 + * Check if we should update i_disksize
3774 + * when writing to the end of file but not requiring block allocation
3776 +static int ext4_da_should_update_i_disksize(struct page *page,
3777 + unsigned long offset)
3779 + struct buffer_head *bh;
3780 + struct inode *inode = page->mapping->host;
3784 + bh = page_buffers(page);
3785 + idx = offset >> inode->i_blkbits;
3787 + for (i=0; i < idx; i++)
3788 + bh = bh->b_this_page;
3790 + if (!buffer_mapped(bh) || (buffer_delay(bh)))
3795 +static int ext4_da_write_end(struct file *file,
3796 + struct address_space *mapping,
3797 + loff_t pos, unsigned len, unsigned copied,
3798 + struct page *page, void *fsdata)
3800 + struct inode *inode = mapping->host;
3801 + int ret = 0, ret2;
3802 + handle_t *handle = ext4_journal_current_handle();
3803 + loff_t new_i_size;
3804 + unsigned long start, end;
3806 + start = pos & (PAGE_CACHE_SIZE - 1);
3807 + end = start + copied -1;
3810 + * generic_write_end() will run mark_inode_dirty() if i_size
3811 + * changes. So let's piggyback the i_disksize mark_inode_dirty
3815 + new_i_size = pos + copied;
3816 + if (new_i_size > EXT4_I(inode)->i_disksize) {
3817 + if (ext4_da_should_update_i_disksize(page, end)) {
3818 + down_write(&EXT4_I(inode)->i_data_sem);
3819 + if (new_i_size > EXT4_I(inode)->i_disksize) {
3821 + * Updating i_disksize when extending file
3822 + * without needing block allocation
3824 + if (ext4_should_order_data(inode))
3825 + ret = ext4_jbd2_file_inode(handle,
3828 + EXT4_I(inode)->i_disksize = new_i_size;
3830 + up_write(&EXT4_I(inode)->i_data_sem);
3833 + ret2 = generic_write_end(file, mapping, pos, len, copied,
3838 + ret2 = ext4_journal_stop(handle);
3842 + return ret ? ret : copied;
3845 +static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
3848 + * Drop reserved blocks
3850 + BUG_ON(!PageLocked(page));
3851 + if (!page_has_buffers(page))
3854 + ext4_da_page_release_reservation(page, offset);
3857 + ext4_invalidatepage(page, offset);
3864 * bmap() is special. It gets used by applications such as lilo and by
3865 * the swapper to find the on-disk block of a specific piece of data.
3866 @@ -1418,6 +2552,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3870 + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
3871 + test_opt(inode->i_sb, DELALLOC)) {
3873 + * With delalloc we want to sync the file
3874 + * so that we can make sure we allocate
3877 + filemap_write_and_wait(mapping);
3880 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
3882 * This is a REALLY heavyweight approach, but the use of
3883 @@ -1462,21 +2606,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
3887 -static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
3889 - if (buffer_mapped(bh))
3890 - return ext4_journal_dirty_data(handle, bh);
3895 - * Note that we always start a transaction even if we're not journalling
3896 - * data. This is to preserve ordering: any hole instantiation within
3897 - * __block_write_full_page -> ext4_get_block() should be journalled
3898 - * along with the data so we don't crash and then get metadata which
3899 - * refers to old data.
3900 + * Note that we don't need to start a transaction unless we're journaling data
3901 + * because we should have holes filled from ext4_page_mkwrite(). We even don't
3902 + * need to file the inode to the transaction's list in ordered mode because if
3903 + * we are writing back data added by write(), the inode is already there and if
3904 + * we are writing back data modified via mmap(), no one guarantees in which
3905 + * transaction the data will hit the disk. In case we are journaling data, we
3906 + * cannot start transaction directly because transaction start ranks above page
3907 + * lock so we have to do some magic.
3909 - * In all journalling modes block_write_full_page() will start the I/O.
3910 + * In all journaling modes block_write_full_page() will start the I/O.
3914 @@ -1518,105 +2658,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
3915 * disastrous. Any write() or metadata operation will sync the fs for
3918 - * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
3919 - * we don't need to open a transaction here.
3921 -static int ext4_ordered_writepage(struct page *page,
3922 +static int __ext4_normal_writepage(struct page *page,
3923 struct writeback_control *wbc)
3925 struct inode *inode = page->mapping->host;
3926 - struct buffer_head *page_bufs;
3927 - handle_t *handle = NULL;
3931 - J_ASSERT(PageLocked(page));
3934 - * We give up here if we're reentered, because it might be for a
3935 - * different filesystem.
3937 - if (ext4_journal_current_handle())
3940 - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
3941 + if (test_opt(inode->i_sb, NOBH))
3942 + return nobh_writepage(page,
3943 + ext4_normal_get_block_write, wbc);
3945 + return block_write_full_page(page,
3946 + ext4_normal_get_block_write,
3950 - if (IS_ERR(handle)) {
3951 - ret = PTR_ERR(handle);
3954 +static int ext4_normal_writepage(struct page *page,
3955 + struct writeback_control *wbc)
3957 + struct inode *inode = page->mapping->host;
3958 + loff_t size = i_size_read(inode);
3961 - if (!page_has_buffers(page)) {
3962 - create_empty_buffers(page, inode->i_sb->s_blocksize,
3963 - (1 << BH_Dirty)|(1 << BH_Uptodate));
3964 + J_ASSERT(PageLocked(page));
3965 + if (page->index == size >> PAGE_CACHE_SHIFT)
3966 + len = size & ~PAGE_CACHE_MASK;
3968 + len = PAGE_CACHE_SIZE;
3970 + if (page_has_buffers(page)) {
3971 + /* if page has buffers it should all be mapped
3972 + * and allocated. If there are no buffers attached
3973 + * to the page we know the page is dirty but it lost
3974 + * buffers. That means that at some moment in time
3975 + * after write_begin() / write_end() has been called
3976 + * all buffers have been clean and thus they must have been
3977 + * written at least once. So they are all mapped and we can
3978 + * happily proceed with mapping them and writing the page.
3980 + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
3981 + ext4_bh_unmapped_or_delay));
3983 - page_bufs = page_buffers(page);
3984 - walk_page_buffers(handle, page_bufs, 0,
3985 - PAGE_CACHE_SIZE, NULL, bget_one);
3987 - ret = block_write_full_page(page, ext4_get_block, wbc);
3990 - * The page can become unlocked at any point now, and
3991 - * truncate can then come in and change things. So we
3992 - * can't touch *page from now on. But *page_bufs is
3993 - * safe due to elevated refcount.
3997 - * And attach them to the current transaction. But only if
3998 - * block_write_full_page() succeeded. Otherwise they are unmapped,
3999 - * and generally junk.
4002 - err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
4003 - NULL, jbd2_journal_dirty_data_fn);
4007 - walk_page_buffers(handle, page_bufs, 0,
4008 - PAGE_CACHE_SIZE, NULL, bput_one);
4009 - err = ext4_journal_stop(handle);
4013 + if (!ext4_journal_current_handle())
4014 + return __ext4_normal_writepage(page, wbc);
4017 redirty_page_for_writepage(wbc, page);
4023 -static int ext4_writeback_writepage(struct page *page,
4024 +static int __ext4_journalled_writepage(struct page *page,
4025 struct writeback_control *wbc)
4027 - struct inode *inode = page->mapping->host;
4028 + struct address_space *mapping = page->mapping;
4029 + struct inode *inode = mapping->host;
4030 + struct buffer_head *page_bufs;
4031 handle_t *handle = NULL;
4035 - if (ext4_journal_current_handle())
4037 + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
4038 + ext4_normal_get_block_write);
4042 + page_bufs = page_buffers(page);
4043 + walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
4045 + /* As soon as we unlock the page, it can go away, but we have
4046 + * references to buffers so we are safe */
4047 + unlock_page(page);
4049 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
4050 if (IS_ERR(handle)) {
4051 ret = PTR_ERR(handle);
4056 - if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
4057 - ret = nobh_writepage(page, ext4_get_block, wbc);
4059 - ret = block_write_full_page(page, ext4_get_block, wbc);
4060 + ret = walk_page_buffers(handle, page_bufs, 0,
4061 + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
4063 + err = walk_page_buffers(handle, page_bufs, 0,
4064 + PAGE_CACHE_SIZE, NULL, write_end_fn);
4067 err = ext4_journal_stop(handle);
4073 - redirty_page_for_writepage(wbc, page);
4074 + walk_page_buffers(handle, page_bufs, 0,
4075 + PAGE_CACHE_SIZE, NULL, bput_one);
4076 + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
4085 @@ -1624,59 +2762,53 @@ static int ext4_journalled_writepage(struct page *page,
4086 struct writeback_control *wbc)
4088 struct inode *inode = page->mapping->host;
4089 - handle_t *handle = NULL;
4092 + loff_t size = i_size_read(inode);
4095 - if (ext4_journal_current_handle())
4097 + J_ASSERT(PageLocked(page));
4098 + if (page->index == size >> PAGE_CACHE_SHIFT)
4099 + len = size & ~PAGE_CACHE_MASK;
4101 + len = PAGE_CACHE_SIZE;
4103 + if (page_has_buffers(page)) {
4104 + /* if page has buffers it should all be mapped
4105 + * and allocated. If there are no buffers attached
4106 + * to the page we know the page is dirty but it lost
4107 + * buffers. That means that at some moment in time
4108 + * after write_begin() / write_end() has been called
4109 + * all buffers have been clean and thus they must have been
4110 + * written at least once. So they are all mapped and we can
4111 + * happily proceed with mapping them and writing the page.
4113 + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
4114 + ext4_bh_unmapped_or_delay));
4117 - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
4118 - if (IS_ERR(handle)) {
4119 - ret = PTR_ERR(handle);
4120 + if (ext4_journal_current_handle())
4124 - if (!page_has_buffers(page) || PageChecked(page)) {
4125 + if (PageChecked(page)) {
4127 * It's mmapped pagecache. Add buffers and journal it. There
4128 * doesn't seem much point in redirtying the page here.
4130 ClearPageChecked(page);
4131 - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
4134 - ext4_journal_stop(handle);
4137 - ret = walk_page_buffers(handle, page_buffers(page), 0,
4138 - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
4140 - err = walk_page_buffers(handle, page_buffers(page), 0,
4141 - PAGE_CACHE_SIZE, NULL, write_end_fn);
4144 - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
4145 - unlock_page(page);
4146 + return __ext4_journalled_writepage(page, wbc);
4149 * It may be a page full of checkpoint-mode buffers. We don't
4150 * really know unless we go poke around in the buffer_heads.
4151 * But block_write_full_page will do the right thing.
4153 - ret = block_write_full_page(page, ext4_get_block, wbc);
4154 + return block_write_full_page(page,
4155 + ext4_normal_get_block_write,
4158 - err = ext4_journal_stop(handle);
4165 redirty_page_for_writepage(wbc, page);
4172 static int ext4_readpage(struct file *file, struct page *page)
4173 @@ -1819,7 +2951,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
4174 static const struct address_space_operations ext4_ordered_aops = {
4175 .readpage = ext4_readpage,
4176 .readpages = ext4_readpages,
4177 - .writepage = ext4_ordered_writepage,
4178 + .writepage = ext4_normal_writepage,
4179 .sync_page = block_sync_page,
4180 .write_begin = ext4_write_begin,
4181 .write_end = ext4_ordered_write_end,
4182 @@ -1833,7 +2965,7 @@ static const struct address_space_operations ext4_ordered_aops = {
4183 static const struct address_space_operations ext4_writeback_aops = {
4184 .readpage = ext4_readpage,
4185 .readpages = ext4_readpages,
4186 - .writepage = ext4_writeback_writepage,
4187 + .writepage = ext4_normal_writepage,
4188 .sync_page = block_sync_page,
4189 .write_begin = ext4_write_begin,
4190 .write_end = ext4_writeback_write_end,
4191 @@ -1857,10 +2989,31 @@ static const struct address_space_operations ext4_journalled_aops = {
4192 .releasepage = ext4_releasepage,
4195 +static const struct address_space_operations ext4_da_aops = {
4196 + .readpage = ext4_readpage,
4197 + .readpages = ext4_readpages,
4198 + .writepage = ext4_da_writepage,
4199 + .writepages = ext4_da_writepages,
4200 + .sync_page = block_sync_page,
4201 + .write_begin = ext4_da_write_begin,
4202 + .write_end = ext4_da_write_end,
4203 + .bmap = ext4_bmap,
4204 + .invalidatepage = ext4_da_invalidatepage,
4205 + .releasepage = ext4_releasepage,
4206 + .direct_IO = ext4_direct_IO,
4207 + .migratepage = buffer_migrate_page,
4210 void ext4_set_aops(struct inode *inode)
4212 - if (ext4_should_order_data(inode))
4213 + if (ext4_should_order_data(inode) &&
4214 + test_opt(inode->i_sb, DELALLOC))
4215 + inode->i_mapping->a_ops = &ext4_da_aops;
4216 + else if (ext4_should_order_data(inode))
4217 inode->i_mapping->a_ops = &ext4_ordered_aops;
4218 + else if (ext4_should_writeback_data(inode) &&
4219 + test_opt(inode->i_sb, DELALLOC))
4220 + inode->i_mapping->a_ops = &ext4_da_aops;
4221 else if (ext4_should_writeback_data(inode))
4222 inode->i_mapping->a_ops = &ext4_writeback_aops;
4224 @@ -1873,7 +3026,7 @@ void ext4_set_aops(struct inode *inode)
4225 * This required during truncate. We need to physically zero the tail end
4226 * of that block so it doesn't yield old data if the file is later grown.
4228 -int ext4_block_truncate_page(handle_t *handle, struct page *page,
4229 +int ext4_block_truncate_page(handle_t *handle,
4230 struct address_space *mapping, loff_t from)
4232 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
4233 @@ -1882,8 +3035,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
4235 struct inode *inode = mapping->host;
4236 struct buffer_head *bh;
4237 + struct page *page;
4240 + page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
4244 blocksize = inode->i_sb->s_blocksize;
4245 length = blocksize - (offset & (blocksize - 1));
4246 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
4247 @@ -1956,7 +3114,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
4248 err = ext4_journal_dirty_metadata(handle, bh);
4250 if (ext4_should_order_data(inode))
4251 - err = ext4_journal_dirty_data(handle, bh);
4252 + err = ext4_jbd2_file_inode(handle, inode);
4253 mark_buffer_dirty(bh);
4256 @@ -2179,7 +3337,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4259 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
4260 - ext4_journal_dirty_metadata(handle, this_bh);
4263 + * The buffer head should have an attached journal head at this
4264 + * point. However, if the data is corrupted and an indirect
4265 + * block pointed to itself, it would have been detached when
4266 + * the block was cleared. Check for this instead of OOPSing.
4268 + if (bh2jh(this_bh))
4269 + ext4_journal_dirty_metadata(handle, this_bh);
4271 + ext4_error(inode->i_sb, __func__,
4272 + "circular indirect block detected, "
4273 + "inode=%lu, block=%llu",
4275 + (unsigned long long) this_bh->b_blocknr);
4279 @@ -2305,6 +3477,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4283 +int ext4_can_truncate(struct inode *inode)
4285 + if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4287 + if (S_ISREG(inode->i_mode))
4289 + if (S_ISDIR(inode->i_mode))
4291 + if (S_ISLNK(inode->i_mode))
4292 + return !ext4_inode_is_fast_symlink(inode);
4299 @@ -2347,51 +3532,25 @@ void ext4_truncate(struct inode *inode)
4301 ext4_lblk_t last_block;
4302 unsigned blocksize = inode->i_sb->s_blocksize;
4303 - struct page *page;
4305 - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4306 - S_ISLNK(inode->i_mode)))
4308 - if (ext4_inode_is_fast_symlink(inode))
4310 - if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4311 + if (!ext4_can_truncate(inode))
4315 - * We have to lock the EOF page here, because lock_page() nests
4316 - * outside jbd2_journal_start().
4318 - if ((inode->i_size & (blocksize - 1)) == 0) {
4319 - /* Block boundary? Nothing to do */
4322 - page = grab_cache_page(mapping,
4323 - inode->i_size >> PAGE_CACHE_SHIFT);
4328 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
4329 - ext4_ext_truncate(inode, page);
4330 + ext4_ext_truncate(inode);
4334 handle = start_transaction(inode);
4335 - if (IS_ERR(handle)) {
4337 - clear_highpage(page);
4338 - flush_dcache_page(page);
4339 - unlock_page(page);
4340 - page_cache_release(page);
4342 + if (IS_ERR(handle))
4343 return; /* AKPM: return what? */
4346 last_block = (inode->i_size + blocksize-1)
4347 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4350 - ext4_block_truncate_page(handle, page, mapping, inode->i_size);
4351 + if (inode->i_size & (blocksize - 1))
4352 + if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4355 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4357 @@ -2410,6 +3569,14 @@ void ext4_truncate(struct inode *inode)
4361 + * From here we block out all ext4_get_block() callers who want to
4362 + * modify the block allocation tree.
4364 + down_write(&ei->i_data_sem);
4366 + ext4_discard_reservation(inode);
4369 * The orphan list entry will now protect us from any crash which
4370 * occurs before the truncate completes, so it is now safe to propagate
4371 * the new, shorter inode size (held for now in i_size) into the
4372 @@ -2418,12 +3585,6 @@ void ext4_truncate(struct inode *inode)
4374 ei->i_disksize = inode->i_size;
4377 - * From here we block out all ext4_get_block() callers who want to
4378 - * modify the block allocation tree.
4380 - down_write(&ei->i_data_sem);
4382 if (n == 1) { /* direct blocks */
4383 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4384 i_data + EXT4_NDIR_BLOCKS);
4385 @@ -2484,8 +3645,6 @@ do_indirects:
4389 - ext4_discard_reservation(inode);
4391 up_write(&ei->i_data_sem);
4392 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4393 ext4_mark_inode_dirty(handle, inode);
4394 @@ -2571,6 +3730,16 @@ static int __ext4_get_inode_loc(struct inode *inode,
4396 if (!buffer_uptodate(bh)) {
4400 + * If the buffer has the write error flag, we have failed
4401 + * to write out another inode in the same block. In this
4402 + * case, we don't have to read the block because we may
4403 + * read the old inode data successfully.
4405 + if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
4406 + set_buffer_uptodate(bh);
4408 if (buffer_uptodate(bh)) {
4409 /* someone brought it uptodate while we waited */
4411 @@ -3107,7 +4276,14 @@ int ext4_write_inode(struct inode *inode, int wait)
4412 * be freed, so we have a strong guarantee that no future commit will
4413 * leave these blocks visible to the user.)
4415 - * Called with inode->sem down.
4416 + * Another thing we have to assure is that if we are in ordered mode
4417 + * and inode is still attached to the committing transaction, we must
4418 + * we start writeout of all the dirty pages which are being truncated.
4419 + * This way we are sure that all the data written in the previous
4420 + * transaction are already on disk (truncate waits for pages under
4423 + * Called with inode->i_mutex down.
4425 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4427 @@ -3173,6 +4349,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4430 ext4_journal_stop(handle);
4432 + if (ext4_should_order_data(inode)) {
4433 + error = ext4_begin_ordered_truncate(inode,
4436 + /* Do as much error cleanup as possible */
4437 + handle = ext4_journal_start(inode, 3);
4438 + if (IS_ERR(handle)) {
4439 + ext4_orphan_del(NULL, inode);
4442 + ext4_orphan_del(handle, inode);
4443 + ext4_journal_stop(handle);
4449 rc = inode_setattr(inode, attr);
4450 @@ -3193,58 +4385,156 @@ err_out:
4454 +int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4455 + struct kstat *stat)
4457 + struct inode *inode;
4458 + unsigned long delalloc_blocks;
4460 + inode = dentry->d_inode;
4461 + generic_fillattr(inode, stat);
4464 + * We can't update i_blocks if the block allocation is delayed
4465 + * otherwise in the case of system crash before the real block
4466 + * allocation is done, we will have i_blocks inconsistent with
4467 + * on-disk file blocks.
4468 + * We always keep i_blocks updated together with real
4469 + * allocation. But to not confuse with user, stat
4470 + * will return the blocks that include the delayed allocation
4471 + * blocks for this file.
4473 + spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
4474 + delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
4475 + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
4477 + stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
4481 +static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
4486 + /* if nrblocks are contiguous */
4489 + * With N contiguous data blocks, it need at most
4490 + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
4491 + * 2 dindirect blocks
4492 + * 1 tindirect block
4494 + indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
4495 + return indirects + 3;
4498 + * if nrblocks are not contiguous, worse case, each block touch
4499 + * a indirect block, and each indirect block touch a double indirect
4500 + * block, plus a triple indirect block
4502 + indirects = nrblocks * 2 + 1;
4506 +static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4508 + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
4509 + return ext4_indirect_trans_blocks(inode, nrblocks, 0);
4510 + return ext4_ext_index_trans_blocks(inode, nrblocks, 0);
4513 - * How many blocks doth make a writepage()?
4515 - * With N blocks per page, it may be:
4517 - * 2 indirect block
4520 - * N+5 bitmap blocks (from the above)
4521 - * N+5 group descriptor summary blocks
4524 - * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files
4526 - * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
4528 - * With ordered or writeback data it's the same, less the N data blocks.
4530 - * If the inode's direct blocks can hold an integral number of pages then a
4531 - * page cannot straddle two indirect blocks, and we can only touch one indirect
4532 - * and dindirect block, and the "5" above becomes "3".
4534 - * This still overestimates under most circumstances. If we were to pass the
4535 - * start and end offsets in here as well we could do block_to_path() on each
4536 - * block and work out the exact number of indirects which are touched. Pah.
4537 + * Account for index blocks, block groups bitmaps and block group
4538 + * descriptor blocks if modify datablocks and index blocks
4539 + * worse case, the indexs blocks spread over different block groups
4541 + * If datablocks are discontiguous, they are possible to spread over
4542 + * different block groups too. If they are contiugous, with flexbg,
4543 + * they could still across block group boundary.
4545 + * Also account for superblock, inode, quota and xattr blocks
4547 +int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4549 + int groups, gdpblocks;
4554 + * How many index blocks need to touch to modify nrblocks?
4555 + * The "Chunk" flag indicating whether the nrblocks is
4556 + * physically contiguous on disk
4558 + * For Direct IO and fallocate, they calls get_block to allocate
4559 + * one single extent at a time, so they could set the "Chunk" flag
4561 + idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
4566 + * Now let's see how many group bitmaps and group descriptors need
4569 + groups = idxblocks;
4573 + groups += nrblocks;
4575 + gdpblocks = groups;
4576 + if (groups > EXT4_SB(inode->i_sb)->s_groups_count)
4577 + groups = EXT4_SB(inode->i_sb)->s_groups_count;
4578 + if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
4579 + gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
4581 + /* bitmaps and block group descriptor blocks */
4582 + ret += groups + gdpblocks;
4584 + /* Blocks for super block, inode, quota and xattr blocks */
4585 + ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
4591 + * Calulate the total number of credits to reserve to fit
4592 + * the modification of a single pages into a single transaction,
4593 + * which may include multiple chunks of block allocations.
4595 + * This could be called via ext4_write_begin()
4597 + * We need to consider the worse case, when
4598 + * one new block per extent.
4600 int ext4_writepage_trans_blocks(struct inode *inode)
4602 int bpp = ext4_journal_blocks_per_page(inode);
4603 - int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
4606 - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
4607 - return ext4_ext_writepage_trans_blocks(inode, bpp);
4608 + ret = ext4_meta_trans_blocks(inode, bpp, 0);
4610 + /* Account for data blocks for journalled mode */
4611 if (ext4_should_journal_data(inode))
4612 - ret = 3 * (bpp + indirects) + 2;
4614 - ret = 2 * (bpp + indirects) + 2;
4616 -#ifdef CONFIG_QUOTA
4617 - /* We know that structure was already allocated during DQUOT_INIT so
4618 - * we will be updating only the data blocks + inodes */
4619 - ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
4627 + * Calculate the journal credits for a chunk of data modification.
4629 + * This is called from DIO, fallocate or whoever calling
4630 + * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks.
4632 + * journal buffers for data blocks are not included here, as DIO
4633 + * and fallocate do no need to journal data buffers.
4635 +int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
4637 + return ext4_meta_trans_blocks(inode, nrblocks, 1);
4641 * The caller must have previously called ext4_reserve_inode_write().
4642 * Give this, we know that the caller already has write access to iloc->bh.
4644 @@ -3506,3 +4796,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4649 +static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
4651 + return !buffer_mapped(bh);
4654 +int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4657 + unsigned long len;
4658 + int ret = -EINVAL;
4659 + struct file *file = vma->vm_file;
4660 + struct inode *inode = file->f_path.dentry->d_inode;
4661 + struct address_space *mapping = inode->i_mapping;
4664 + * Get i_alloc_sem to stop truncates messing with the inode. We cannot
4665 + * get i_mutex because we are already holding mmap_sem.
4667 + down_read(&inode->i_alloc_sem);
4668 + size = i_size_read(inode);
4669 + if (page->mapping != mapping || size <= page_offset(page)
4670 + || !PageUptodate(page)) {
4671 + /* page got truncated from under us? */
4675 + if (PageMappedToDisk(page))
4678 + if (page->index == size >> PAGE_CACHE_SHIFT)
4679 + len = size & ~PAGE_CACHE_MASK;
4681 + len = PAGE_CACHE_SIZE;
4683 + if (page_has_buffers(page)) {
4684 + /* return if we have all the buffers mapped */
4685 + if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
4686 + ext4_bh_unmapped))
4690 + * OK, we need to fill the hole... Do write_begin write_end
4691 + * to do block allocation/reservation.We are not holding
4692 + * inode.i__mutex here. That allow * parallel write_begin,
4693 + * write_end call. lock_page prevent this from happening
4694 + * on the same page though
4696 + ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
4697 + len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
4700 + ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
4701 + len, len, page, NULL);
4706 + up_read(&inode->i_alloc_sem);
4709 diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
4710 index c9900aa..e0e3a5e 100644
4711 --- a/fs/ext4/mballoc.c
4712 +++ b/fs/ext4/mballoc.c
4713 @@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
4715 static inline int mb_find_next_zero_bit(void *addr, int max, int start)
4718 + int fix = 0, ret, tmpmax;
4719 addr = mb_correct_addr_and_bit(&fix, addr);
4721 + tmpmax = max + fix;
4724 - return ext4_find_next_zero_bit(addr, max, start) - fix;
4725 + ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
4731 static inline int mb_find_next_bit(void *addr, int max, int start)
4734 + int fix = 0, ret, tmpmax;
4735 addr = mb_correct_addr_and_bit(&fix, addr);
4737 + tmpmax = max + fix;
4740 - return ext4_find_next_bit(addr, max, start) - fix;
4741 + ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
4747 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
4748 @@ -781,13 +787,16 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
4749 if (bh_uptodate_or_lock(bh[i]))
4752 + spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
4753 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
4754 ext4_init_block_bitmap(sb, bh[i],
4755 first_group + i, desc);
4756 set_buffer_uptodate(bh[i]);
4757 unlock_buffer(bh[i]);
4758 + spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
4761 + spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
4763 bh[i]->b_end_io = end_buffer_read_sync;
4764 submit_bh(READ, bh[i]);
4765 @@ -803,6 +812,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
4766 if (!buffer_uptodate(bh[i]))
4770 first_block = page->index * blocks_per_page;
4771 for (i = 0; i < blocks_per_page; i++) {
4773 @@ -883,6 +893,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
4779 mb_debug("load group %lu\n", group);
4781 @@ -914,15 +925,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
4783 BUG_ON(page->mapping != inode->i_mapping);
4784 if (!PageUptodate(page)) {
4785 - ext4_mb_init_cache(page, NULL);
4786 + ret = ext4_mb_init_cache(page, NULL);
4788 + unlock_page(page);
4791 mb_cmp_bitmaps(e4b, page_address(page) +
4792 (poff * sb->s_blocksize));
4797 - if (page == NULL || !PageUptodate(page))
4798 + if (page == NULL || !PageUptodate(page)) {
4802 e4b->bd_bitmap_page = page;
4803 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
4804 mark_page_accessed(page);
4805 @@ -938,14 +955,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
4806 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
4808 BUG_ON(page->mapping != inode->i_mapping);
4809 - if (!PageUptodate(page))
4810 - ext4_mb_init_cache(page, e4b->bd_bitmap);
4812 + if (!PageUptodate(page)) {
4813 + ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
4815 + unlock_page(page);
4822 - if (page == NULL || !PageUptodate(page))
4823 + if (page == NULL || !PageUptodate(page)) {
4827 e4b->bd_buddy_page = page;
4828 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
4829 mark_page_accessed(page);
4830 @@ -962,7 +985,7 @@ err:
4831 page_cache_release(e4b->bd_buddy_page);
4832 e4b->bd_buddy = NULL;
4833 e4b->bd_bitmap = NULL;
4838 static void ext4_mb_release_desc(struct ext4_buddy *e4b)
4839 @@ -1031,7 +1054,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
4843 -static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
4844 +static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
4845 int first, int count)
4848 @@ -1071,11 +1094,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
4851 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4853 + ext4_unlock_group(sb, e4b->bd_group);
4854 ext4_error(sb, __func__, "double-free of inode"
4855 " %lu's block %llu(bit %u in group %lu)\n",
4856 inode ? inode->i_ino : 0, blocknr, block,
4858 + ext4_lock_group(sb, e4b->bd_group);
4860 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
4861 e4b->bd_info->bb_counters[order]++;
4862 @@ -1113,8 +1137,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
4865 mb_check_buddy(e4b);
4870 static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
4871 @@ -1730,10 +1752,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
4872 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
4873 spin_unlock(&sbi->s_md_lock);
4876 - /* searching for the right group start from the goal value specified */
4877 - group = ac->ac_g_ex.fe_group;
4879 /* Let's just scan groups to find more-less suitable blocks */
4880 cr = ac->ac_2order ? 0 : 1;
4882 @@ -1743,6 +1761,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
4884 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
4885 ac->ac_criteria = cr;
4887 + * searching for the right group start
4888 + * from the goal value specified
4890 + group = ac->ac_g_ex.fe_group;
4892 for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
4893 struct ext4_group_info *grp;
4894 struct ext4_group_desc *desc;
4895 @@ -1963,6 +1987,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
4899 + if (unlikely(sbi->s_mb_history == NULL))
4901 s = kmalloc(sizeof(*s), GFP_KERNEL);
4904 @@ -2165,9 +2191,7 @@ static void ext4_mb_history_init(struct super_block *sb)
4905 sbi->s_mb_history_cur = 0;
4906 spin_lock_init(&sbi->s_mb_history_lock);
4907 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
4908 - sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
4909 - if (likely(sbi->s_mb_history != NULL))
4910 - memset(sbi->s_mb_history, 0, i);
4911 + sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
4912 /* if we can't allocate history, then we simple won't use it */
4915 @@ -2215,21 +2239,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
4916 #define ext4_mb_history_init(sb)
4920 +/* Create and initialize ext4_group_info data for the given group. */
4921 +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
4922 + struct ext4_group_desc *desc)
4926 + struct ext4_sb_info *sbi = EXT4_SB(sb);
4927 + struct ext4_group_info **meta_group_info;
4930 + * First check if this group is the first of a reserved block.
4931 + * If it's true, we have to allocate a new table of pointers
4932 + * to ext4_group_info structures
4934 + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
4935 + metalen = sizeof(*meta_group_info) <<
4936 + EXT4_DESC_PER_BLOCK_BITS(sb);
4937 + meta_group_info = kmalloc(metalen, GFP_KERNEL);
4938 + if (meta_group_info == NULL) {
4939 + printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
4941 + goto exit_meta_group_info;
4943 + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
4948 + * calculate needed size. if change bb_counters size,
4949 + * don't forget about ext4_mb_generate_buddy()
4951 + len = offsetof(typeof(**meta_group_info),
4952 + bb_counters[sb->s_blocksize_bits + 2]);
4955 + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
4956 + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
4958 + meta_group_info[i] = kzalloc(len, GFP_KERNEL);
4959 + if (meta_group_info[i] == NULL) {
4960 + printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
4961 + goto exit_group_info;
4963 + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
4964 + &(meta_group_info[i]->bb_state));
4967 + * initialize bb_free to be able to skip
4968 + * empty groups without initialization
4970 + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
4971 + meta_group_info[i]->bb_free =
4972 + ext4_free_blocks_after_init(sb, group, desc);
4974 + meta_group_info[i]->bb_free =
4975 + le16_to_cpu(desc->bg_free_blocks_count);
4978 + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
4980 +#ifdef DOUBLE_CHECK
4982 + struct buffer_head *bh;
4983 + meta_group_info[i]->bb_bitmap =
4984 + kmalloc(sb->s_blocksize, GFP_KERNEL);
4985 + BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
4986 + bh = ext4_read_block_bitmap(sb, group);
4987 + BUG_ON(bh == NULL);
4988 + memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
4997 + /* If a meta_group_info table has been allocated, release it now */
4998 + if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
4999 + kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
5000 +exit_meta_group_info:
5002 +} /* ext4_mb_add_groupinfo */
5005 + * Add a group to the existing groups.
5006 + * This function is used for online resize
5008 +int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
5009 + struct ext4_group_desc *desc)
5011 + struct ext4_sb_info *sbi = EXT4_SB(sb);
5012 + struct inode *inode = sbi->s_buddy_cache;
5013 + int blocks_per_page;
5016 + struct page *page;
5019 + /* Add group based on group descriptor*/
5020 + err = ext4_mb_add_groupinfo(sb, group, desc);
5025 + * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
5026 + * datas) are set not up to date so that they will be re-initilaized
5027 + * during the next call to ext4_mb_load_buddy
5030 + /* Set buddy page as not up to date */
5031 + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
5032 + block = group * 2;
5033 + pnum = block / blocks_per_page;
5034 + page = find_get_page(inode->i_mapping, pnum);
5035 + if (page != NULL) {
5036 + ClearPageUptodate(page);
5037 + page_cache_release(page);
5040 + /* Set bitmap page as not up to date */
5042 + pnum = block / blocks_per_page;
5043 + page = find_get_page(inode->i_mapping, pnum);
5044 + if (page != NULL) {
5045 + ClearPageUptodate(page);
5046 + page_cache_release(page);
5053 + * Update an existing group.
5054 + * This function is used for online resize
5056 +void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
5058 + grp->bb_free += add;
5061 static int ext4_mb_init_backend(struct super_block *sb)
5064 - int j, len, metalen;
5066 struct ext4_sb_info *sbi = EXT4_SB(sb);
5067 - int num_meta_group_infos =
5068 - (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
5069 - EXT4_DESC_PER_BLOCK_BITS(sb);
5070 + struct ext4_super_block *es = sbi->s_es;
5071 + int num_meta_group_infos;
5072 + int num_meta_group_infos_max;
5074 struct ext4_group_info **meta_group_info;
5075 + struct ext4_group_desc *desc;
5077 + /* This is the number of blocks used by GDT */
5078 + num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
5079 + 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
5082 + * This is the total number of blocks used by GDT including
5083 + * the number of reserved blocks for GDT.
5084 + * The s_group_info array is allocated with this value
5085 + * to allow a clean online resize without a complex
5086 + * manipulation of pointer.
5087 + * The drawback is the unused memory when no resize
5088 + * occurs but it's very low in terms of pages
5089 + * (see comments below)
5090 + * Need to handle this properly when META_BG resizing is allowed
5092 + num_meta_group_infos_max = num_meta_group_infos +
5093 + le16_to_cpu(es->s_reserved_gdt_blocks);
5096 + * array_size is the size of s_group_info array. We round it
5097 + * to the next power of two because this approximation is done
5098 + * internally by kmalloc so we can have some more memory
5099 + * for free here (e.g. may be used for META_BG resize).
5102 + while (array_size < sizeof(*sbi->s_group_info) *
5103 + num_meta_group_infos_max)
5104 + array_size = array_size << 1;
5105 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
5106 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
5107 * So a two level scheme suffices for now. */
5108 - sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
5109 - num_meta_group_infos, GFP_KERNEL);
5110 + sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
5111 if (sbi->s_group_info == NULL) {
5112 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
5114 @@ -2256,63 +2451,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
5115 sbi->s_group_info[i] = meta_group_info;
5119 - * calculate needed size. if change bb_counters size,
5120 - * don't forget about ext4_mb_generate_buddy()
5122 - len = sizeof(struct ext4_group_info);
5123 - len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
5124 for (i = 0; i < sbi->s_groups_count; i++) {
5125 - struct ext4_group_desc *desc;
5128 - sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
5129 - j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
5131 - meta_group_info[j] = kzalloc(len, GFP_KERNEL);
5132 - if (meta_group_info[j] == NULL) {
5133 - printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
5134 - goto err_freebuddy;
5136 desc = ext4_get_group_desc(sb, i, NULL);
5139 "EXT4-fs: can't read descriptor %lu\n", i);
5143 - memset(meta_group_info[j], 0, len);
5144 - set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
5145 - &(meta_group_info[j]->bb_state));
5148 - * initialize bb_free to be able to skip
5149 - * empty groups without initialization
5151 - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
5152 - meta_group_info[j]->bb_free =
5153 - ext4_free_blocks_after_init(sb, i, desc);
5155 - meta_group_info[j]->bb_free =
5156 - le16_to_cpu(desc->bg_free_blocks_count);
5159 - INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
5161 -#ifdef DOUBLE_CHECK
5163 - struct buffer_head *bh;
5164 - meta_group_info[j]->bb_bitmap =
5165 - kmalloc(sb->s_blocksize, GFP_KERNEL);
5166 - BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
5167 - bh = read_block_bitmap(sb, i);
5168 - BUG_ON(bh == NULL);
5169 - memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
5175 + if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
5176 + goto err_freebuddy;
5180 @@ -2333,9 +2480,10 @@ err_freesgi:
5181 int ext4_mb_init(struct super_block *sb, int needs_recovery)
5183 struct ext4_sb_info *sbi = EXT4_SB(sb);
5190 if (!test_opt(sb, MBALLOC))
5192 @@ -2370,12 +2518,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
5193 } while (i <= sb->s_blocksize_bits + 1);
5195 /* init file for buddy data */
5196 - i = ext4_mb_init_backend(sb);
5198 + ret = ext4_mb_init_backend(sb);
5200 clear_opt(sbi->s_mount_opt, MBALLOC);
5201 kfree(sbi->s_mb_offsets);
5202 kfree(sbi->s_mb_maxs);
5207 spin_lock_init(&sbi->s_md_lock);
5208 @@ -2392,7 +2540,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
5209 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
5210 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
5212 - i = sizeof(struct ext4_locality_group) * NR_CPUS;
5213 + i = sizeof(struct ext4_locality_group) * nr_cpu_ids;
5214 sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
5215 if (sbi->s_locality_groups == NULL) {
5216 clear_opt(sbi->s_mount_opt, MBALLOC);
5217 @@ -2400,11 +2548,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
5218 kfree(sbi->s_mb_maxs);
5221 - for (i = 0; i < NR_CPUS; i++) {
5222 + for (i = 0; i < nr_cpu_ids; i++) {
5223 struct ext4_locality_group *lg;
5224 lg = &sbi->s_locality_groups[i];
5225 mutex_init(&lg->lg_mutex);
5226 - INIT_LIST_HEAD(&lg->lg_prealloc_list);
5227 + for (j = 0; j < PREALLOC_TB_SIZE; j++)
5228 + INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
5229 spin_lock_init(&lg->lg_prealloc_lock);
5232 @@ -2548,8 +2697,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
5233 ext4_lock_group(sb, md->group);
5234 for (i = 0; i < md->num; i++) {
5235 mb_debug(" %u", md->blocks[i]);
5236 - err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
5238 + mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
5241 ext4_unlock_group(sb, md->group);
5242 @@ -2575,25 +2723,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
5246 -#define MB_PROC_VALUE_READ(name) \
5247 -static int ext4_mb_read_##name(char *page, char **start, \
5248 - off_t off, int count, int *eof, void *data) \
5249 +#define MB_PROC_FOPS(name) \
5250 +static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
5252 - struct ext4_sb_info *sbi = data; \
5257 - len = sprintf(page, "%ld\n", sbi->s_mb_##name); \
5262 -#define MB_PROC_VALUE_WRITE(name) \
5263 -static int ext4_mb_write_##name(struct file *file, \
5264 - const char __user *buf, unsigned long cnt, void *data) \
5265 + struct ext4_sb_info *sbi = m->private; \
5267 + seq_printf(m, "%ld\n", sbi->s_mb_##name); \
5271 +static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
5273 - struct ext4_sb_info *sbi = data; \
5274 + return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
5277 +static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
5278 + const char __user *buf, size_t cnt, loff_t *ppos) \
5280 + struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
5283 if (cnt >= sizeof(str)) \
5284 @@ -2605,31 +2752,32 @@ static int ext4_mb_write_##name(struct file *file, \
5286 sbi->s_mb_##name = value; \
5291 +static const struct file_operations ext4_mb_##name##_proc_fops = { \
5292 + .owner = THIS_MODULE, \
5293 + .open = ext4_mb_##name##_proc_open, \
5294 + .read = seq_read, \
5295 + .llseek = seq_lseek, \
5296 + .release = single_release, \
5297 + .write = ext4_mb_##name##_proc_write, \
5300 -MB_PROC_VALUE_READ(stats);
5301 -MB_PROC_VALUE_WRITE(stats);
5302 -MB_PROC_VALUE_READ(max_to_scan);
5303 -MB_PROC_VALUE_WRITE(max_to_scan);
5304 -MB_PROC_VALUE_READ(min_to_scan);
5305 -MB_PROC_VALUE_WRITE(min_to_scan);
5306 -MB_PROC_VALUE_READ(order2_reqs);
5307 -MB_PROC_VALUE_WRITE(order2_reqs);
5308 -MB_PROC_VALUE_READ(stream_request);
5309 -MB_PROC_VALUE_WRITE(stream_request);
5310 -MB_PROC_VALUE_READ(group_prealloc);
5311 -MB_PROC_VALUE_WRITE(group_prealloc);
5312 +MB_PROC_FOPS(stats);
5313 +MB_PROC_FOPS(max_to_scan);
5314 +MB_PROC_FOPS(min_to_scan);
5315 +MB_PROC_FOPS(order2_reqs);
5316 +MB_PROC_FOPS(stream_request);
5317 +MB_PROC_FOPS(group_prealloc);
5319 #define MB_PROC_HANDLER(name, var) \
5321 - proc = create_proc_entry(name, mode, sbi->s_mb_proc); \
5322 + proc = proc_create_data(name, mode, sbi->s_mb_proc, \
5323 + &ext4_mb_##var##_proc_fops, sbi); \
5324 if (proc == NULL) { \
5325 printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
5328 - proc->data = sbi; \
5329 - proc->read_proc = ext4_mb_read_##var ; \
5330 - proc->write_proc = ext4_mb_write_##var; \
5333 static int ext4_mb_init_per_dev_proc(struct super_block *sb)
5334 @@ -2639,6 +2787,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
5335 struct proc_dir_entry *proc;
5338 + if (proc_root_ext4 == NULL) {
5339 + sbi->s_mb_proc = NULL;
5342 bdevname(sb->s_bdev, devname);
5343 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
5345 @@ -2747,7 +2899,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
5349 - bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
5350 + bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
5354 @@ -2816,7 +2968,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
5355 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
5356 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
5357 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
5358 - percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
5361 + * free blocks account has already be reduced/reserved
5362 + * at write_begin() time for delayed allocation
5363 + * do not double accounting
5365 + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
5366 + percpu_counter_sub(&sbi->s_freeblocks_counter,
5367 + ac->ac_b_ex.fe_len);
5369 + if (sbi->s_log_groups_per_flex) {
5370 + ext4_group_t flex_group = ext4_flex_group(sbi,
5371 + ac->ac_b_ex.fe_group);
5372 + spin_lock(sb_bgl_lock(sbi, flex_group));
5373 + sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
5374 + spin_unlock(sb_bgl_lock(sbi, flex_group));
5377 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
5379 @@ -3096,6 +3264,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
5380 struct ext4_prealloc_space *pa)
5382 unsigned int len = ac->ac_o_ex.fe_len;
5384 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
5385 &ac->ac_b_ex.fe_group,
5386 &ac->ac_b_ex.fe_start);
5387 @@ -3113,14 +3282,45 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
5391 + * Return the prealloc space that have minimal distance
5392 + * from the goal block. @cpa is the prealloc
5393 + * space that is having currently known minimal distance
5394 + * from the goal block.
5396 +static struct ext4_prealloc_space *
5397 +ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
5398 + struct ext4_prealloc_space *pa,
5399 + struct ext4_prealloc_space *cpa)
5401 + ext4_fsblk_t cur_distance, new_distance;
5403 + if (cpa == NULL) {
5404 + atomic_inc(&pa->pa_count);
5407 + cur_distance = abs(goal_block - cpa->pa_pstart);
5408 + new_distance = abs(goal_block - pa->pa_pstart);
5410 + if (cur_distance < new_distance)
5413 + /* drop the previous reference */
5414 + atomic_dec(&cpa->pa_count);
5415 + atomic_inc(&pa->pa_count);
5420 * search goal blocks in preallocated space
5422 static noinline_for_stack int
5423 ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
5426 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
5427 struct ext4_locality_group *lg;
5428 - struct ext4_prealloc_space *pa;
5429 + struct ext4_prealloc_space *pa, *cpa = NULL;
5430 + ext4_fsblk_t goal_block;
5432 /* only data can be preallocated */
5433 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
5434 @@ -3158,22 +3358,38 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
5438 + order = fls(ac->ac_o_ex.fe_len) - 1;
5439 + if (order > PREALLOC_TB_SIZE - 1)
5440 + /* The max size of hash table is PREALLOC_TB_SIZE */
5441 + order = PREALLOC_TB_SIZE - 1;
5443 + goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
5444 + ac->ac_g_ex.fe_start +
5445 + le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
5447 + * search for the prealloc space that has
5448 + * minimal distance from the goal block.
5450 + for (i = order; i < PREALLOC_TB_SIZE; i++) {
5452 + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
5454 + spin_lock(&pa->pa_lock);
5455 + if (pa->pa_deleted == 0 &&
5456 + pa->pa_free >= ac->ac_o_ex.fe_len) {
5459 - list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) {
5460 - spin_lock(&pa->pa_lock);
5461 - if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
5462 - atomic_inc(&pa->pa_count);
5463 - ext4_mb_use_group_pa(ac, pa);
5464 + cpa = ext4_mb_check_group_pa(goal_block,
5467 spin_unlock(&pa->pa_lock);
5468 - ac->ac_criteria = 20;
5469 - rcu_read_unlock();
5472 - spin_unlock(&pa->pa_lock);
5473 + rcu_read_unlock();
5476 + ext4_mb_use_group_pa(ac, cpa);
5477 + ac->ac_criteria = 20;
5480 - rcu_read_unlock();
5485 @@ -3396,6 +3612,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
5486 pa->pa_free = pa->pa_len;
5487 atomic_set(&pa->pa_count, 1);
5488 spin_lock_init(&pa->pa_lock);
5489 + INIT_LIST_HEAD(&pa->pa_inode_list);
5493 @@ -3416,10 +3633,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
5494 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
5495 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
5497 - spin_lock(pa->pa_obj_lock);
5498 - list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
5499 - spin_unlock(pa->pa_obj_lock);
5502 + * We will later add the new pa to the right bucket
5503 + * after updating the pa_free in ext4_mb_release_context
5508 @@ -3473,8 +3690,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
5511 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
5514 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
5515 le32_to_cpu(sbi->s_es->s_first_data_block);
5516 mb_debug(" free preallocated %u/%u in group %u\n",
5517 @@ -3569,22 +3784,25 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
5518 if (list_empty(&grp->bb_prealloc_list))
5521 - bitmap_bh = read_block_bitmap(sb, group);
5522 + bitmap_bh = ext4_read_block_bitmap(sb, group);
5523 if (bitmap_bh == NULL) {
5524 - /* error handling here */
5525 - ext4_mb_release_desc(&e4b);
5526 - BUG_ON(bitmap_bh == NULL);
5527 + ext4_error(sb, __func__, "Error in reading block "
5528 + "bitmap for %lu\n", group);
5532 err = ext4_mb_load_buddy(sb, group, &e4b);
5533 - BUG_ON(err != 0); /* error handling here */
5535 + ext4_error(sb, __func__, "Error in loading buddy "
5536 + "information for %lu\n", group);
5537 + put_bh(bitmap_bh);
5542 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
5544 - grp = ext4_get_group_info(sb, group);
5545 INIT_LIST_HEAD(&list);
5547 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
5549 ext4_lock_group(sb, group);
5550 @@ -3741,13 +3959,18 @@ repeat:
5551 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
5553 err = ext4_mb_load_buddy(sb, group, &e4b);
5554 - BUG_ON(err != 0); /* error handling here */
5556 + ext4_error(sb, __func__, "Error in loading buddy "
5557 + "information for %lu\n", group);
5561 - bitmap_bh = read_block_bitmap(sb, group);
5562 + bitmap_bh = ext4_read_block_bitmap(sb, group);
5563 if (bitmap_bh == NULL) {
5564 - /* error handling here */
5565 + ext4_error(sb, __func__, "Error in reading block "
5566 + "bitmap for %lu\n", group);
5567 ext4_mb_release_desc(&e4b);
5568 - BUG_ON(bitmap_bh == NULL);
5572 ext4_lock_group(sb, group);
5573 @@ -3950,22 +4173,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
5577 +static noinline_for_stack void
5578 +ext4_mb_discard_lg_preallocations(struct super_block *sb,
5579 + struct ext4_locality_group *lg,
5580 + int order, int total_entries)
5582 + ext4_group_t group = 0;
5583 + struct ext4_buddy e4b;
5584 + struct list_head discard_list;
5585 + struct ext4_prealloc_space *pa, *tmp;
5586 + struct ext4_allocation_context *ac;
5588 + mb_debug("discard locality group preallocation\n");
5590 + INIT_LIST_HEAD(&discard_list);
5591 + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
5593 + spin_lock(&lg->lg_prealloc_lock);
5594 + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
5596 + spin_lock(&pa->pa_lock);
5597 + if (atomic_read(&pa->pa_count)) {
5599 + * This is the pa that we just used
5600 + * for block allocation. So don't
5603 + spin_unlock(&pa->pa_lock);
5606 + if (pa->pa_deleted) {
5607 + spin_unlock(&pa->pa_lock);
5610 + /* only lg prealloc space */
5611 + BUG_ON(!pa->pa_linear);
5613 + /* seems this one can be freed ... */
5614 + pa->pa_deleted = 1;
5615 + spin_unlock(&pa->pa_lock);
5617 + list_del_rcu(&pa->pa_inode_list);
5618 + list_add(&pa->u.pa_tmp_list, &discard_list);
5621 + if (total_entries <= 5) {
5623 + * we want to keep only 5 entries
5624 + * allowing it to grow to 8. This
5625 + * make sure we don't call discard
5626 + * soon for this list.
5631 + spin_unlock(&lg->lg_prealloc_lock);
5633 + list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
5635 + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
5636 + if (ext4_mb_load_buddy(sb, group, &e4b)) {
5637 + ext4_error(sb, __func__, "Error in loading buddy "
5638 + "information for %lu\n", group);
5641 + ext4_lock_group(sb, group);
5642 + list_del(&pa->pa_group_list);
5643 + ext4_mb_release_group_pa(&e4b, pa, ac);
5644 + ext4_unlock_group(sb, group);
5646 + ext4_mb_release_desc(&e4b);
5647 + list_del(&pa->u.pa_tmp_list);
5648 + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
5651 + kmem_cache_free(ext4_ac_cachep, ac);
5655 + * We have incremented pa_count. So it cannot be freed at this
5656 + * point. Also we hold lg_mutex. So no parallel allocation is
5657 + * possible from this lg. That means pa_free cannot be updated.
5659 + * A parallel ext4_mb_discard_group_preallocations is possible.
5660 + * which can cause the lg_prealloc_list to be updated.
5663 +static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
5665 + int order, added = 0, lg_prealloc_count = 1;
5666 + struct super_block *sb = ac->ac_sb;
5667 + struct ext4_locality_group *lg = ac->ac_lg;
5668 + struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
5670 + order = fls(pa->pa_free) - 1;
5671 + if (order > PREALLOC_TB_SIZE - 1)
5672 + /* The max size of hash table is PREALLOC_TB_SIZE */
5673 + order = PREALLOC_TB_SIZE - 1;
5674 + /* Add the prealloc space to lg */
5676 + list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
5678 + spin_lock(&tmp_pa->pa_lock);
5679 + if (tmp_pa->pa_deleted) {
5680 + spin_unlock(&pa->pa_lock);
5683 + if (!added && pa->pa_free < tmp_pa->pa_free) {
5684 + /* Add to the tail of the previous entry */
5685 + list_add_tail_rcu(&pa->pa_inode_list,
5686 + &tmp_pa->pa_inode_list);
5689 + * we want to count the total
5690 + * number of entries in the list
5693 + spin_unlock(&tmp_pa->pa_lock);
5694 + lg_prealloc_count++;
5697 + list_add_tail_rcu(&pa->pa_inode_list,
5698 + &lg->lg_prealloc_list[order]);
5699 + rcu_read_unlock();
5701 + /* Now trim the list to be not more than 8 elements */
5702 + if (lg_prealloc_count > 8) {
5703 + ext4_mb_discard_lg_preallocations(sb, lg,
5704 + order, lg_prealloc_count);
5711 * release all resource we used in allocation
5713 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
5716 - if (ac->ac_pa->pa_linear) {
5717 + struct ext4_prealloc_space *pa = ac->ac_pa;
5719 + if (pa->pa_linear) {
5720 /* see comment in ext4_mb_use_group_pa() */
5721 - spin_lock(&ac->ac_pa->pa_lock);
5722 - ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
5723 - ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len;
5724 - ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len;
5725 - ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len;
5726 - spin_unlock(&ac->ac_pa->pa_lock);
5727 + spin_lock(&pa->pa_lock);
5728 + pa->pa_pstart += ac->ac_b_ex.fe_len;
5729 + pa->pa_lstart += ac->ac_b_ex.fe_len;
5730 + pa->pa_free -= ac->ac_b_ex.fe_len;
5731 + pa->pa_len -= ac->ac_b_ex.fe_len;
5732 + spin_unlock(&pa->pa_lock);
5734 + * We want to add the pa to the right bucket.
5735 + * Remove it from the list and while adding
5736 + * make sure the list to which we are adding
5737 + * doesn't grow big.
5739 + if (likely(pa->pa_free)) {
5740 + spin_lock(pa->pa_obj_lock);
5741 + list_del_rcu(&pa->pa_inode_list);
5742 + spin_unlock(pa->pa_obj_lock);
5743 + ext4_mb_add_n_trim(ac);
5746 - ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa);
5747 + ext4_mb_put_pa(ac, ac->ac_sb, pa);
5749 if (ac->ac_bitmap_page)
5750 page_cache_release(ac->ac_bitmap_page);
5751 @@ -4011,10 +4380,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
5754 if (!test_opt(sb, MBALLOC)) {
5755 - block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
5756 + block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
5760 + if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
5762 + * With delalloc we already reserved the blocks
5764 + ar->len = ext4_has_free_blocks(sbi, ar->len);
5767 + if (ar->len == 0) {
5772 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
5773 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
5774 @@ -4026,10 +4406,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
5778 + if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
5779 + ar->flags |= EXT4_MB_DELALLOC_RESERVED;
5781 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
5789 ext4_mb_poll_new_transaction(sb, handle);
5790 @@ -4037,12 +4421,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
5791 *errp = ext4_mb_initialize_context(ac, ar);
5798 ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
5799 if (!ext4_mb_use_preallocated(ac)) {
5801 ac->ac_op = EXT4_MB_HISTORY_ALLOC;
5802 ext4_mb_normalize_request(ac, ar);
5804 @@ -4085,11 +4468,12 @@ repeat:
5806 ext4_mb_release_context(ac);
5810 + kmem_cache_free(ext4_ac_cachep, ac);
5812 if (ar->len < inquota)
5813 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
5815 - kmem_cache_free(ext4_ac_cachep, ac);
5818 static void ext4_mb_poll_new_transaction(struct super_block *sb,
5819 @@ -4242,12 +4626,16 @@ do_more:
5820 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
5823 - bitmap_bh = read_block_bitmap(sb, block_group);
5825 + bitmap_bh = ext4_read_block_bitmap(sb, block_group);
5830 gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
5837 if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
5838 in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
5839 @@ -4309,10 +4697,9 @@ do_more:
5840 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
5842 ext4_lock_group(sb, block_group);
5843 - err = mb_free_blocks(inode, &e4b, bit, count);
5844 + mb_free_blocks(inode, &e4b, bit, count);
5845 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
5846 ext4_unlock_group(sb, block_group);
5850 spin_lock(sb_bgl_lock(sbi, block_group));
5851 @@ -4321,6 +4708,13 @@ do_more:
5852 spin_unlock(sb_bgl_lock(sbi, block_group));
5853 percpu_counter_add(&sbi->s_freeblocks_counter, count);
5855 + if (sbi->s_log_groups_per_flex) {
5856 + ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
5857 + spin_lock(sb_bgl_lock(sbi, flex_group));
5858 + sbi->s_flex_groups[flex_group].free_blocks += count;
5859 + spin_unlock(sb_bgl_lock(sbi, flex_group));
5862 ext4_mb_release_desc(&e4b);
5865 diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
5866 index bfe6add..c7c9906 100644
5867 --- a/fs/ext4/mballoc.h
5868 +++ b/fs/ext4/mballoc.h
5869 @@ -164,11 +164,17 @@ struct ext4_free_extent {
5871 * we try to group all related changes together
5872 * so that writeback can flush/allocate them together as well
5873 + * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
5874 + * (512). We store prealloc space into the hash based on the pa_free blocks
5875 + * order value, i.e. fls(pa_free)-1;
5877 +#define PREALLOC_TB_SIZE 10
5878 struct ext4_locality_group {
5880 - struct mutex lg_mutex; /* to serialize allocates */
5881 - struct list_head lg_prealloc_list;/* list of preallocations */
5882 + /* to serialize allocates */
5883 + struct mutex lg_mutex;
5884 + /* list of preallocations */
5885 + struct list_head lg_prealloc_list[PREALLOC_TB_SIZE];
5886 spinlock_t lg_prealloc_lock;
5889 diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
5890 index b9e077b..46fc0b5 100644
5891 --- a/fs/ext4/migrate.c
5892 +++ b/fs/ext4/migrate.c
5893 @@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
5894 * credit. But below we try to not accumalate too much
5895 * of them by restarting the journal.
5897 - needed = ext4_ext_calc_credits_for_insert(inode, path);
5898 + needed = ext4_ext_calc_credits_for_single_extent(inode,
5899 + lb->last_block - lb->first_block + 1, path);
5902 * Make sure the credit we accumalated is not really high
5903 diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
5904 index ab16bea..387ad98 100644
5905 --- a/fs/ext4/namei.c
5906 +++ b/fs/ext4/namei.c
5907 @@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
5908 struct inode *inode);
5911 + * p is at least 6 bytes before the end of page
5913 +static inline struct ext4_dir_entry_2 *
5914 +ext4_next_entry(struct ext4_dir_entry_2 *p)
5916 + return (struct ext4_dir_entry_2 *)((char *)p +
5917 + ext4_rec_len_from_disk(p->rec_len));
5921 * Future: use high four bits of block for coalesce-on-delete flags
5922 * Mask them off for now.
5924 @@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
5926 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
5927 EXT4_DIR_REC_LEN(2) - infosize;
5928 - return 0? 20: entry_space / sizeof(struct dx_entry);
5929 + return entry_space / sizeof(struct dx_entry);
5932 static inline unsigned dx_node_limit (struct inode *dir)
5934 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
5935 - return 0? 22: entry_space / sizeof(struct dx_entry);
5936 + return entry_space / sizeof(struct dx_entry);
5940 @@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
5944 - * p is at least 6 bytes before the end of page
5946 -static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
5948 - return (struct ext4_dir_entry_2 *)((char *)p +
5949 - ext4_rec_len_from_disk(p->rec_len));
5953 * This function fills a red-black tree with information from a
5954 * directory block. It returns the number directory entries loaded
5955 * into the tree. If there is an error it is returned in err.
5956 @@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
5957 de = (struct ext4_dir_entry_2 *) bh->b_data;
5958 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
5959 EXT4_DIR_REC_LEN(0));
5960 - for (; de < top; de = ext4_next_entry(de))
5961 - if (ext4_match (namelen, name, de)) {
5962 - if (!ext4_check_dir_entry("ext4_find_entry",
5964 - (block<<EXT4_BLOCK_SIZE_BITS(sb))
5965 - +((char *)de - bh->b_data))) {
5967 + for (; de < top; de = ext4_next_entry(de)) {
5968 + int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
5969 + + ((char *) de - bh->b_data);
5971 + if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
5973 *err = ERR_BAD_DX_DIR;
5977 - dx_release (frames);
5980 + if (ext4_match(namelen, name, de)) {
5982 + dx_release(frames);
5987 /* Check to see if we should continue to search */
5988 diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
5989 index 9ff7b1c..b3d3560 100644
5990 --- a/fs/ext4/resize.c
5991 +++ b/fs/ext4/resize.c
5992 @@ -73,7 +73,7 @@ static int verify_group_input(struct super_block *sb,
5993 "Inode bitmap not in group (block %llu)",
5994 (unsigned long long)input->inode_bitmap);
5995 else if (outside(input->inode_table, start, end) ||
5996 - outside(itend - 1, start, end))
5997 + outside(itend - 1, start, end))
5998 ext4_warning(sb, __func__,
5999 "Inode table not in group (blocks %llu-%llu)",
6000 (unsigned long long)input->inode_table, itend - 1);
6001 @@ -104,7 +104,7 @@ static int verify_group_input(struct super_block *sb,
6002 (unsigned long long)input->inode_bitmap,
6003 start, metaend - 1);
6004 else if (inside(input->inode_table, start, metaend) ||
6005 - inside(itend - 1, start, metaend))
6006 + inside(itend - 1, start, metaend))
6007 ext4_warning(sb, __func__,
6008 "Inode table (%llu-%llu) overlaps"
6009 "GDT table (%llu-%llu)",
6010 @@ -158,9 +158,9 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
6012 if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
6014 - if ((err = ext4_journal_get_write_access(handle, bh)))
6015 + if ((err = ext4_journal_get_write_access(handle, bh)))
6022 @@ -416,11 +416,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
6023 "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
6027 - * If we are not using the primary superblock/GDT copy don't resize,
6028 - * because the user tools have no way of handling this. Probably a
6029 - * bad time to do it anyways.
6032 + * If we are not using the primary superblock/GDT copy don't resize,
6033 + * because the user tools have no way of handling this. Probably a
6034 + * bad time to do it anyways.
6036 if (EXT4_SB(sb)->s_sbh->b_blocknr !=
6037 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
6038 ext4_warning(sb, __func__,
6039 @@ -507,14 +507,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
6043 - //ext4_journal_release_buffer(handle, iloc.bh);
6044 + /* ext4_journal_release_buffer(handle, iloc.bh); */
6047 - //ext4_journal_release_buffer(handle, dind);
6048 + /* ext4_journal_release_buffer(handle, dind); */
6050 - //ext4_journal_release_buffer(handle, *primary);
6051 + /* ext4_journal_release_buffer(handle, *primary); */
6053 - //ext4_journal_release_buffer(handle, *primary);
6054 + /* ext4_journal_release_buffer(handle, *primary); */
6058 @@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
6060 if (reserved_gdb || gdb_off == 0) {
6061 if (!EXT4_HAS_COMPAT_FEATURE(sb,
6062 - EXT4_FEATURE_COMPAT_RESIZE_INODE)){
6063 + EXT4_FEATURE_COMPAT_RESIZE_INODE)
6064 + || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
6065 ext4_warning(sb, __func__,
6066 "No reserved GDT blocks, can't resize");
6068 @@ -818,12 +819,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
6069 if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
6073 - * We will only either add reserved group blocks to a backup group
6074 - * or remove reserved blocks for the first group in a new group block.
6075 - * Doing both would be mean more complex code, and sane people don't
6076 - * use non-sparse filesystems anymore. This is already checked above.
6079 + * We will only either add reserved group blocks to a backup group
6080 + * or remove reserved blocks for the first group in a new group block.
6081 + * Doing both would mean more complex code, and sane people don't
6082 + * use non-sparse filesystems anymore. This is already checked above.
6085 primary = sbi->s_group_desc[gdb_num];
6086 if ((err = ext4_journal_get_write_access(handle, primary)))
6087 @@ -835,24 +836,24 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
6088 } else if ((err = add_new_gdb(handle, inode, input, &primary)))
6092 - * OK, now we've set up the new group. Time to make it active.
6094 - * Current kernels don't lock all allocations via lock_super(),
6095 - * so we have to be safe wrt. concurrent accesses the group
6096 - * data. So we need to be careful to set all of the relevant
6097 - * group descriptor data etc. *before* we enable the group.
6099 - * The key field here is sbi->s_groups_count: as long as
6100 - * that retains its old value, nobody is going to access the new
6103 - * So first we update all the descriptor metadata for the new
6104 - * group; then we update the total disk blocks count; then we
6105 - * update the groups count to enable the group; then finally we
6106 - * update the free space counts so that the system can start
6107 - * using the new disk blocks.
6110 + * OK, now we've set up the new group. Time to make it active.
6112 + * Current kernels don't lock all allocations via lock_super(),
6113 + * so we have to be safe wrt. concurrent accesses the group
6114 + * data. So we need to be careful to set all of the relevant
6115 + * group descriptor data etc. *before* we enable the group.
6117 + * The key field here is sbi->s_groups_count: as long as
6118 + * that retains its old value, nobody is going to access the new
6121 + * So first we update all the descriptor metadata for the new
6122 + * group; then we update the total disk blocks count; then we
6123 + * update the groups count to enable the group; then finally we
6124 + * update the free space counts so that the system can start
6125 + * using the new disk blocks.
6128 /* Update group descriptor block for new group */
6129 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
6130 @@ -866,6 +867,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
6131 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
6134 + * We can allocate memory for mb_alloc based on the new group
6137 + if (test_opt(sb, MBALLOC)) {
6138 + err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
6140 + goto exit_journal;
6143 * Make the new blocks and inodes valid next. We do this before
6144 * increasing the group count so that once the group is enabled,
6145 * all of its blocks and inodes are already valid.
6146 @@ -937,7 +947,8 @@ exit_put:
6148 } /* ext4_group_add */
6150 -/* Extend the filesystem to the new number of blocks specified. This entry
6152 + * Extend the filesystem to the new number of blocks specified. This entry
6153 * point is only used to extend the current filesystem to the end of the last
6154 * existing group. It can be accessed via ioctl, or by "remount,resize=<size>"
6155 * for emergencies (because it has no dependencies on reserved blocks).
6156 @@ -957,6 +968,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
6159 unsigned long freed_blocks;
6160 + ext4_group_t group;
6161 + struct ext4_group_info *grp;
6163 /* We don't need to worry about locking wrt other resizers just
6164 * yet: we're going to revalidate es->s_blocks_count after
6165 @@ -988,7 +1001,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
6168 /* Handle the remaining blocks in the last group only. */
6169 - ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
6170 + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
6173 ext4_warning(sb, __func__,
6174 @@ -1013,7 +1026,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
6175 o_blocks_count + add, add);
6177 /* See if the device is actually as big as what was requested */
6178 - bh = sb_bread(sb, o_blocks_count + add -1);
6179 + bh = sb_bread(sb, o_blocks_count + add - 1);
6181 ext4_warning(sb, __func__,
6182 "can't read last block, resize aborted");
6183 @@ -1060,6 +1073,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
6184 o_blocks_count + add);
6185 if ((err = ext4_journal_stop(handle)))
6189 + * Mark mballoc pages as not up to date so that they will be updated
6190 + * next time they are loaded by ext4_mb_load_buddy.
6192 + if (test_opt(sb, MBALLOC)) {
6193 + struct ext4_sb_info *sbi = EXT4_SB(sb);
6194 + struct inode *inode = sbi->s_buddy_cache;
6195 + int blocks_per_page;
6198 + struct page *page;
6200 + /* Set buddy page as not up to date */
6201 + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
6202 + block = group * 2;
6203 + pnum = block / blocks_per_page;
6204 + page = find_get_page(inode->i_mapping, pnum);
6205 + if (page != NULL) {
6206 + ClearPageUptodate(page);
6207 + page_cache_release(page);
6210 + /* Set bitmap page as not up to date */
6212 + pnum = block / blocks_per_page;
6213 + page = find_get_page(inode->i_mapping, pnum);
6214 + if (page != NULL) {
6215 + ClearPageUptodate(page);
6216 + page_cache_release(page);
6219 + /* Get the info on the last group */
6220 + grp = ext4_get_group_info(sb, group);
6222 + /* Update free blocks in group info */
6223 + ext4_mb_update_group_info(grp, add);
6226 if (test_opt(sb, DEBUG))
6227 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
6228 ext4_blocks_count(es));
6229 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
6230 index 02bf243..ed80f9f 100644
6231 --- a/fs/ext4/super.c
6232 +++ b/fs/ext4/super.c
6233 @@ -49,20 +49,19 @@ static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
6234 unsigned long journal_devnum);
6235 static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
6237 -static void ext4_commit_super (struct super_block * sb,
6238 - struct ext4_super_block * es,
6240 -static void ext4_mark_recovery_complete(struct super_block * sb,
6241 - struct ext4_super_block * es);
6242 -static void ext4_clear_journal_err(struct super_block * sb,
6243 - struct ext4_super_block * es);
6244 +static void ext4_commit_super(struct super_block *sb,
6245 + struct ext4_super_block *es, int sync);
6246 +static void ext4_mark_recovery_complete(struct super_block *sb,
6247 + struct ext4_super_block *es);
6248 +static void ext4_clear_journal_err(struct super_block *sb,
6249 + struct ext4_super_block *es);
6250 static int ext4_sync_fs(struct super_block *sb, int wait);
6251 -static const char *ext4_decode_error(struct super_block * sb, int errno,
6252 +static const char *ext4_decode_error(struct super_block *sb, int errno,
6254 -static int ext4_remount (struct super_block * sb, int * flags, char * data);
6255 -static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf);
6256 +static int ext4_remount(struct super_block *sb, int *flags, char *data);
6257 +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
6258 static void ext4_unlockfs(struct super_block *sb);
6259 -static void ext4_write_super (struct super_block * sb);
6260 +static void ext4_write_super(struct super_block *sb);
6261 static void ext4_write_super_lockfs(struct super_block *sb);
6264 @@ -211,15 +210,15 @@ static void ext4_handle_error(struct super_block *sb)
6265 if (sb->s_flags & MS_RDONLY)
6268 - if (!test_opt (sb, ERRORS_CONT)) {
6269 + if (!test_opt(sb, ERRORS_CONT)) {
6270 journal_t *journal = EXT4_SB(sb)->s_journal;
6272 EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
6274 jbd2_journal_abort(journal, -EIO);
6276 - if (test_opt (sb, ERRORS_RO)) {
6277 - printk (KERN_CRIT "Remounting filesystem read-only\n");
6278 + if (test_opt(sb, ERRORS_RO)) {
6279 + printk(KERN_CRIT "Remounting filesystem read-only\n");
6280 sb->s_flags |= MS_RDONLY;
6282 ext4_commit_super(sb, es, 1);
6283 @@ -228,13 +227,13 @@ static void ext4_handle_error(struct super_block *sb)
6287 -void ext4_error (struct super_block * sb, const char * function,
6288 - const char * fmt, ...)
6289 +void ext4_error(struct super_block *sb, const char *function,
6290 + const char *fmt, ...)
6294 va_start(args, fmt);
6295 - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
6296 + printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
6300 @@ -242,7 +241,7 @@ void ext4_error (struct super_block * sb, const char * function,
6301 ext4_handle_error(sb);
6304 -static const char *ext4_decode_error(struct super_block * sb, int errno,
6305 +static const char *ext4_decode_error(struct super_block *sb, int errno,
6308 char *errstr = NULL;
6309 @@ -278,8 +277,7 @@ static const char *ext4_decode_error(struct super_block * sb, int errno,
6310 /* __ext4_std_error decodes expected errors from journaling functions
6311 * automatically and invokes the appropriate error response. */
6313 -void __ext4_std_error (struct super_block * sb, const char * function,
6315 +void __ext4_std_error(struct super_block *sb, const char *function, int errno)
6319 @@ -292,8 +290,8 @@ void __ext4_std_error (struct super_block * sb, const char * function,
6322 errstr = ext4_decode_error(sb, errno, nbuf);
6323 - printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
6324 - sb->s_id, function, errstr);
6325 + printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
6326 + sb->s_id, function, errstr);
6328 ext4_handle_error(sb);
6330 @@ -308,15 +306,15 @@ void __ext4_std_error (struct super_block * sb, const char * function,
6331 * case we take the easy way out and panic immediately.
6334 -void ext4_abort (struct super_block * sb, const char * function,
6335 - const char * fmt, ...)
6336 +void ext4_abort(struct super_block *sb, const char *function,
6337 + const char *fmt, ...)
6341 - printk (KERN_CRIT "ext4_abort called.\n");
6342 + printk(KERN_CRIT "ext4_abort called.\n");
6344 va_start(args, fmt);
6345 - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
6346 + printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
6350 @@ -334,8 +332,8 @@ void ext4_abort (struct super_block * sb, const char * function,
6351 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
6354 -void ext4_warning (struct super_block * sb, const char * function,
6355 - const char * fmt, ...)
6356 +void ext4_warning(struct super_block *sb, const char *function,
6357 + const char *fmt, ...)
6361 @@ -496,7 +494,7 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
6365 -static void ext4_put_super (struct super_block * sb)
6366 +static void ext4_put_super(struct super_block *sb)
6368 struct ext4_sb_info *sbi = EXT4_SB(sb);
6369 struct ext4_super_block *es = sbi->s_es;
6370 @@ -506,6 +504,7 @@ static void ext4_put_super (struct super_block * sb)
6371 ext4_ext_release(sb);
6372 ext4_xattr_put_super(sb);
6373 jbd2_journal_destroy(sbi->s_journal);
6374 + sbi->s_journal = NULL;
6375 if (!(sb->s_flags & MS_RDONLY)) {
6376 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
6377 es->s_state = cpu_to_le16(sbi->s_mount_state);
6378 @@ -517,6 +516,7 @@ static void ext4_put_super (struct super_block * sb)
6379 for (i = 0; i < sbi->s_gdb_count; i++)
6380 brelse(sbi->s_group_desc[i]);
6381 kfree(sbi->s_group_desc);
6382 + kfree(sbi->s_flex_groups);
6383 percpu_counter_destroy(&sbi->s_freeblocks_counter);
6384 percpu_counter_destroy(&sbi->s_freeinodes_counter);
6385 percpu_counter_destroy(&sbi->s_dirs_counter);
6386 @@ -568,9 +568,16 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
6388 ei->i_block_alloc_info = NULL;
6389 ei->vfs_inode.i_version = 1;
6390 + ei->vfs_inode.i_data.writeback_index = 0;
6391 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
6392 INIT_LIST_HEAD(&ei->i_prealloc_list);
6393 spin_lock_init(&ei->i_prealloc_lock);
6394 + jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
6395 + ei->i_reserved_data_blocks = 0;
6396 + ei->i_reserved_meta_blocks = 0;
6397 + ei->i_allocated_meta_blocks = 0;
6398 + ei->i_delalloc_reserved_flag = 0;
6399 + spin_lock_init(&(ei->i_block_reservation_lock));
6400 return &ei->vfs_inode;
6403 @@ -635,9 +642,12 @@ static void ext4_clear_inode(struct inode *inode)
6404 EXT4_I(inode)->i_block_alloc_info = NULL;
6407 + jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
6408 + &EXT4_I(inode)->jinode);
6411 -static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
6412 +static inline void ext4_show_quota_options(struct seq_file *seq,
6413 + struct super_block *sb)
6415 #if defined(CONFIG_QUOTA)
6416 struct ext4_sb_info *sbi = EXT4_SB(sb);
6417 @@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
6418 unsigned long def_mount_opts;
6419 struct super_block *sb = vfs->mnt_sb;
6420 struct ext4_sb_info *sbi = EXT4_SB(sb);
6421 - journal_t *journal = sbi->s_journal;
6422 struct ext4_super_block *es = sbi->s_es;
6424 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
6425 @@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
6426 seq_puts(seq, ",nomballoc");
6427 if (test_opt(sb, I_VERSION))
6428 seq_puts(seq, ",i_version");
6429 + if (!test_opt(sb, DELALLOC))
6430 + seq_puts(seq, ",nodelalloc");
6434 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
6435 @@ -810,8 +822,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
6439 -#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
6440 -#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
6441 +#define QTYPE2NAME(t) ((t) == USRQUOTA?"user":"group")
6442 +#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
6444 static int ext4_dquot_initialize(struct inode *inode, int type);
6445 static int ext4_dquot_drop(struct inode *inode);
6446 @@ -894,7 +906,7 @@ enum {
6447 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
6448 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
6449 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
6450 - Opt_mballoc, Opt_nomballoc, Opt_stripe,
6451 + Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
6454 static match_table_t tokens = {
6455 @@ -953,6 +965,8 @@ static match_table_t tokens = {
6456 {Opt_nomballoc, "nomballoc"},
6457 {Opt_stripe, "stripe=%u"},
6458 {Opt_resize, "resize"},
6459 + {Opt_delalloc, "delalloc"},
6460 + {Opt_nodelalloc, "nodelalloc"},
6464 @@ -977,12 +991,12 @@ static ext4_fsblk_t get_sb_block(void **data)
6468 -static int parse_options (char *options, struct super_block *sb,
6469 - unsigned int *inum, unsigned long *journal_devnum,
6470 - ext4_fsblk_t *n_blocks_count, int is_remount)
6471 +static int parse_options(char *options, struct super_block *sb,
6472 + unsigned int *inum, unsigned long *journal_devnum,
6473 + ext4_fsblk_t *n_blocks_count, int is_remount)
6475 struct ext4_sb_info *sbi = EXT4_SB(sb);
6478 substring_t args[MAX_OPT_ARGS];
6481 @@ -990,11 +1004,12 @@ static int parse_options (char *options, struct super_block *sb,
6485 + ext4_fsblk_t last_block;
6490 - while ((p = strsep (&options, ",")) != NULL) {
6491 + while ((p = strsep(&options, ",")) != NULL) {
6495 @@ -1002,16 +1017,16 @@ static int parse_options (char *options, struct super_block *sb,
6496 token = match_token(p, tokens, args);
6499 - clear_opt (sbi->s_mount_opt, MINIX_DF);
6500 + clear_opt(sbi->s_mount_opt, MINIX_DF);
6503 - set_opt (sbi->s_mount_opt, MINIX_DF);
6504 + set_opt(sbi->s_mount_opt, MINIX_DF);
6507 - set_opt (sbi->s_mount_opt, GRPID);
6508 + set_opt(sbi->s_mount_opt, GRPID);
6511 - clear_opt (sbi->s_mount_opt, GRPID);
6512 + clear_opt(sbi->s_mount_opt, GRPID);
6515 if (match_int(&args[0], &option))
6516 @@ -1028,41 +1043,41 @@ static int parse_options (char *options, struct super_block *sb,
6517 /* *sb_block = match_int(&args[0]); */
6520 - clear_opt (sbi->s_mount_opt, ERRORS_CONT);
6521 - clear_opt (sbi->s_mount_opt, ERRORS_RO);
6522 - set_opt (sbi->s_mount_opt, ERRORS_PANIC);
6523 + clear_opt(sbi->s_mount_opt, ERRORS_CONT);
6524 + clear_opt(sbi->s_mount_opt, ERRORS_RO);
6525 + set_opt(sbi->s_mount_opt, ERRORS_PANIC);
6528 - clear_opt (sbi->s_mount_opt, ERRORS_CONT);
6529 - clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
6530 - set_opt (sbi->s_mount_opt, ERRORS_RO);
6531 + clear_opt(sbi->s_mount_opt, ERRORS_CONT);
6532 + clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
6533 + set_opt(sbi->s_mount_opt, ERRORS_RO);
6536 - clear_opt (sbi->s_mount_opt, ERRORS_RO);
6537 - clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
6538 - set_opt (sbi->s_mount_opt, ERRORS_CONT);
6539 + clear_opt(sbi->s_mount_opt, ERRORS_RO);
6540 + clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
6541 + set_opt(sbi->s_mount_opt, ERRORS_CONT);
6544 - set_opt (sbi->s_mount_opt, NO_UID32);
6545 + set_opt(sbi->s_mount_opt, NO_UID32);
6548 - clear_opt (sbi->s_mount_opt, CHECK);
6549 + clear_opt(sbi->s_mount_opt, CHECK);
6552 - set_opt (sbi->s_mount_opt, DEBUG);
6553 + set_opt(sbi->s_mount_opt, DEBUG);
6556 - set_opt (sbi->s_mount_opt, OLDALLOC);
6557 + set_opt(sbi->s_mount_opt, OLDALLOC);
6560 - clear_opt (sbi->s_mount_opt, OLDALLOC);
6561 + clear_opt(sbi->s_mount_opt, OLDALLOC);
6563 #ifdef CONFIG_EXT4DEV_FS_XATTR
6564 case Opt_user_xattr:
6565 - set_opt (sbi->s_mount_opt, XATTR_USER);
6566 + set_opt(sbi->s_mount_opt, XATTR_USER);
6568 case Opt_nouser_xattr:
6569 - clear_opt (sbi->s_mount_opt, XATTR_USER);
6570 + clear_opt(sbi->s_mount_opt, XATTR_USER);
6573 case Opt_user_xattr:
6574 @@ -1100,7 +1115,7 @@ static int parse_options (char *options, struct super_block *sb,
6575 "journal on remount\n");
6578 - set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
6579 + set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
6581 case Opt_journal_inum:
6583 @@ -1130,7 +1145,7 @@ static int parse_options (char *options, struct super_block *sb,
6584 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
6587 - set_opt (sbi->s_mount_opt, NOLOAD);
6588 + set_opt(sbi->s_mount_opt, NOLOAD);
6591 if (match_int(&args[0], &option))
6592 @@ -1309,15 +1324,39 @@ set_qf_format:
6593 clear_opt(sbi->s_mount_opt, NOBH);
6596 - set_opt (sbi->s_mount_opt, EXTENTS);
6597 + if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
6598 + EXT4_FEATURE_INCOMPAT_EXTENTS)) {
6599 + ext4_warning(sb, __func__,
6600 + "extents feature not enabled "
6601 + "on this filesystem, use tune2fs\n");
6604 + set_opt(sbi->s_mount_opt, EXTENTS);
6607 - clear_opt (sbi->s_mount_opt, EXTENTS);
6609 + * When e2fsprogs support resizing an already existing
6610 + * ext3 file system to greater than 2**32 we need to
6611 + * add support to block allocator to handle growing
6612 + * already existing block mapped inode so that blocks
6613 + * allocated for them fall within 2**32
6615 + last_block = ext4_blocks_count(sbi->s_es) - 1;
6616 + if (last_block > 0xffffffffULL) {
6617 + printk(KERN_ERR "EXT4-fs: Filesystem too "
6618 + "large to mount with "
6619 + "-o noextents options\n");
6622 + clear_opt(sbi->s_mount_opt, EXTENTS);
6625 set_opt(sbi->s_mount_opt, I_VERSION);
6626 sb->s_flags |= MS_I_VERSION;
6628 + case Opt_nodelalloc:
6629 + clear_opt(sbi->s_mount_opt, DELALLOC);
6632 set_opt(sbi->s_mount_opt, MBALLOC);
6634 @@ -1331,10 +1370,13 @@ set_qf_format:
6636 sbi->s_stripe = option;
6638 + case Opt_delalloc:
6639 + set_opt(sbi->s_mount_opt, DELALLOC);
6643 - "EXT4-fs: Unrecognized mount option \"%s\" "
6644 - "or missing value\n", p);
6646 + "EXT4-fs: Unrecognized mount option \"%s\" "
6647 + "or missing value\n", p);
6651 @@ -1381,31 +1423,31 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
6654 if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
6655 - printk (KERN_ERR "EXT4-fs warning: revision level too high, "
6656 - "forcing read-only mode\n");
6657 + printk(KERN_ERR "EXT4-fs warning: revision level too high, "
6658 + "forcing read-only mode\n");
6663 if (!(sbi->s_mount_state & EXT4_VALID_FS))
6664 - printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
6665 - "running e2fsck is recommended\n");
6666 + printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
6667 + "running e2fsck is recommended\n");
6668 else if ((sbi->s_mount_state & EXT4_ERROR_FS))
6669 - printk (KERN_WARNING
6670 - "EXT4-fs warning: mounting fs with errors, "
6671 - "running e2fsck is recommended\n");
6672 + printk(KERN_WARNING
6673 + "EXT4-fs warning: mounting fs with errors, "
6674 + "running e2fsck is recommended\n");
6675 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
6676 le16_to_cpu(es->s_mnt_count) >=
6677 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
6678 - printk (KERN_WARNING
6679 - "EXT4-fs warning: maximal mount count reached, "
6680 - "running e2fsck is recommended\n");
6681 + printk(KERN_WARNING
6682 + "EXT4-fs warning: maximal mount count reached, "
6683 + "running e2fsck is recommended\n");
6684 else if (le32_to_cpu(es->s_checkinterval) &&
6685 (le32_to_cpu(es->s_lastcheck) +
6686 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
6687 - printk (KERN_WARNING
6688 - "EXT4-fs warning: checktime reached, "
6689 - "running e2fsck is recommended\n");
6690 + printk(KERN_WARNING
6691 + "EXT4-fs warning: checktime reached, "
6692 + "running e2fsck is recommended\n");
6694 /* @@@ We _will_ want to clear the valid bit if we find
6695 * inconsistencies, to force a fsck at reboot. But for
6696 @@ -1443,6 +1485,53 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
6700 +static int ext4_fill_flex_info(struct super_block *sb)
6702 + struct ext4_sb_info *sbi = EXT4_SB(sb);
6703 + struct ext4_group_desc *gdp = NULL;
6704 + struct buffer_head *bh;
6705 + ext4_group_t flex_group_count;
6706 + ext4_group_t flex_group;
6707 + int groups_per_flex = 0;
6708 + __u64 block_bitmap = 0;
6711 + if (!sbi->s_es->s_log_groups_per_flex) {
6712 + sbi->s_log_groups_per_flex = 0;
6716 + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
6717 + groups_per_flex = 1 << sbi->s_log_groups_per_flex;
6719 + flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
6721 + sbi->s_flex_groups = kzalloc(flex_group_count *
6722 + sizeof(struct flex_groups), GFP_KERNEL);
6723 + if (sbi->s_flex_groups == NULL) {
6724 + printk(KERN_ERR "EXT4-fs: not enough memory for "
6725 + "%lu flex groups\n", flex_group_count);
6729 + gdp = ext4_get_group_desc(sb, 1, &bh);
6730 + block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
6732 + for (i = 0; i < sbi->s_groups_count; i++) {
6733 + gdp = ext4_get_group_desc(sb, i, &bh);
6735 + flex_group = ext4_flex_group(sbi, i);
6736 + sbi->s_flex_groups[flex_group].free_inodes +=
6737 + le16_to_cpu(gdp->bg_free_inodes_count);
6738 + sbi->s_flex_groups[flex_group].free_blocks +=
6739 + le16_to_cpu(gdp->bg_free_blocks_count);
6747 __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
6748 struct ext4_group_desc *gdp)
6750 @@ -1507,16 +1596,14 @@ static int ext4_check_descriptors(struct super_block *sb)
6751 (EXT4_BLOCKS_PER_GROUP(sb) - 1);
6753 block_bitmap = ext4_block_bitmap(sb, gdp);
6754 - if (block_bitmap < first_block || block_bitmap > last_block)
6756 + if (block_bitmap < first_block || block_bitmap > last_block) {
6757 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
6758 "Block bitmap for group %lu not in group "
6759 "(block %llu)!", i, block_bitmap);
6762 inode_bitmap = ext4_inode_bitmap(sb, gdp);
6763 - if (inode_bitmap < first_block || inode_bitmap > last_block)
6765 + if (inode_bitmap < first_block || inode_bitmap > last_block) {
6766 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
6767 "Inode bitmap for group %lu not in group "
6768 "(block %llu)!", i, inode_bitmap);
6769 @@ -1524,26 +1611,28 @@ static int ext4_check_descriptors(struct super_block *sb)
6771 inode_table = ext4_inode_table(sb, gdp);
6772 if (inode_table < first_block ||
6773 - inode_table + sbi->s_itb_per_group - 1 > last_block)
6775 + inode_table + sbi->s_itb_per_group - 1 > last_block) {
6776 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
6777 "Inode table for group %lu not in group "
6778 "(block %llu)!", i, inode_table);
6781 + spin_lock(sb_bgl_lock(sbi, i));
6782 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
6783 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
6784 "Checksum for group %lu failed (%u!=%u)\n",
6785 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
6786 gdp)), le16_to_cpu(gdp->bg_checksum));
6788 + if (!(sb->s_flags & MS_RDONLY))
6791 + spin_unlock(sb_bgl_lock(sbi, i));
6793 first_block += EXT4_BLOCKS_PER_GROUP(sb);
6796 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
6797 - sbi->s_es->s_free_inodes_count=cpu_to_le32(ext4_count_free_inodes(sb));
6798 + sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
6802 @@ -1564,8 +1653,8 @@ static int ext4_check_descriptors(struct super_block *sb)
6803 * e2fsck was run on this filesystem, and it must have already done the orphan
6804 * inode cleanup for us, so we can safely abort without any further action.
6806 -static void ext4_orphan_cleanup (struct super_block * sb,
6807 - struct ext4_super_block * es)
6808 +static void ext4_orphan_cleanup(struct super_block *sb,
6809 + struct ext4_super_block *es)
6811 unsigned int s_flags = sb->s_flags;
6812 int nr_orphans = 0, nr_truncates = 0;
6813 @@ -1642,7 +1731,7 @@ static void ext4_orphan_cleanup (struct super_block * sb,
6814 iput(inode); /* The delete magic happens here! */
6817 -#define PLURAL(x) (x), ((x)==1) ? "" : "s"
6818 +#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
6821 printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
6822 @@ -1809,12 +1898,12 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
6826 -static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6827 - __releases(kernel_sem)
6828 - __acquires(kernel_sem)
6829 +static int ext4_fill_super(struct super_block *sb, void *data, int silent)
6830 + __releases(kernel_lock)
6831 + __acquires(kernel_lock)
6834 - struct buffer_head * bh;
6835 + struct buffer_head *bh;
6836 struct ext4_super_block *es = NULL;
6837 struct ext4_sb_info *sbi;
6839 @@ -1851,11 +1940,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6843 - if (!sb_set_blocksize(sb, blocksize)) {
6844 - printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
6849 * The ext4 superblock will not be buffer aligned for other than 1kB
6850 * block sizes. We need to calculate the offset from buffer start.
6851 @@ -1868,7 +1952,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6854 if (!(bh = sb_bread(sb, logical_sb_block))) {
6855 - printk (KERN_ERR "EXT4-fs: unable to read superblock\n");
6856 + printk(KERN_ERR "EXT4-fs: unable to read superblock\n");
6860 @@ -1919,17 +2003,30 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6863 * turn on extents feature by default in ext4 filesystem
6864 - * User -o noextents to turn it off
6865 + * only if feature flag already set by mkfs or tune2fs.
6866 + * Use -o noextents to turn it off
6868 - set_opt(sbi->s_mount_opt, EXTENTS);
6869 + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
6870 + set_opt(sbi->s_mount_opt, EXTENTS);
6872 + ext4_warning(sb, __func__,
6873 + "extents feature not enabled on this filesystem, "
6874 + "use tune2fs.\n");
6876 - * turn on mballoc feature by default in ext4 filesystem
6877 - * User -o nomballoc to turn it off
6878 + * turn on mballoc code by default in ext4 filesystem
6879 + * Use -o nomballoc to turn it off
6881 set_opt(sbi->s_mount_opt, MBALLOC);
6883 - if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
6886 + * enable delayed allocation by default
6887 + * Use -o nodelalloc to turn it off
6889 + set_opt(sbi->s_mount_opt, DELALLOC);
6892 + if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum,
6896 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
6897 @@ -2004,7 +2101,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6903 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
6904 offset = do_div(logical_sb_block, blocksize);
6905 bh = sb_bread(sb, logical_sb_block);
6906 @@ -2016,8 +2113,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6907 es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
6909 if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
6911 - "EXT4-fs: Magic mismatch, very weird !\n");
6913 + "EXT4-fs: Magic mismatch, very weird !\n");
6917 @@ -2034,9 +2131,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6918 if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
6919 (!is_power_of_2(sbi->s_inode_size)) ||
6920 (sbi->s_inode_size > blocksize)) {
6922 - "EXT4-fs: unsupported inode size: %d\n",
6923 - sbi->s_inode_size);
6925 + "EXT4-fs: unsupported inode size: %d\n",
6926 + sbi->s_inode_size);
6929 if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
6930 @@ -2068,20 +2165,20 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6931 sbi->s_mount_state = le16_to_cpu(es->s_state);
6932 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
6933 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
6934 - for (i=0; i < 4; i++)
6935 + for (i = 0; i < 4; i++)
6936 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
6937 sbi->s_def_hash_version = es->s_def_hash_version;
6939 if (sbi->s_blocks_per_group > blocksize * 8) {
6941 - "EXT4-fs: #blocks per group too big: %lu\n",
6942 - sbi->s_blocks_per_group);
6944 + "EXT4-fs: #blocks per group too big: %lu\n",
6945 + sbi->s_blocks_per_group);
6948 if (sbi->s_inodes_per_group > blocksize * 8) {
6950 - "EXT4-fs: #inodes per group too big: %lu\n",
6951 - sbi->s_inodes_per_group);
6953 + "EXT4-fs: #inodes per group too big: %lu\n",
6954 + sbi->s_inodes_per_group);
6958 @@ -2115,10 +2212,10 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6959 sbi->s_groups_count = blocks_count;
6960 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
6961 EXT4_DESC_PER_BLOCK(sb);
6962 - sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
6963 + sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
6965 if (sbi->s_group_desc == NULL) {
6966 - printk (KERN_ERR "EXT4-fs: not enough memory\n");
6967 + printk(KERN_ERR "EXT4-fs: not enough memory\n");
6971 @@ -2128,16 +2225,24 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6972 block = descriptor_loc(sb, logical_sb_block, i);
6973 sbi->s_group_desc[i] = sb_bread(sb, block);
6974 if (!sbi->s_group_desc[i]) {
6975 - printk (KERN_ERR "EXT4-fs: "
6976 - "can't read group descriptor %d\n", i);
6977 + printk(KERN_ERR "EXT4-fs: "
6978 + "can't read group descriptor %d\n", i);
6983 - if (!ext4_check_descriptors (sb)) {
6984 + if (!ext4_check_descriptors(sb)) {
6985 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
6988 + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
6989 + if (!ext4_fill_flex_info(sb)) {
6991 + "EXT4-fs: unable to initialize "
6992 + "flex_bg meta info!\n");
6993 + goto failed_mount2;
6996 sbi->s_gdb_count = db_count;
6997 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
6998 spin_lock_init(&sbi->s_next_gen_lock);
6999 @@ -2202,11 +2307,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
7000 EXT4_SB(sb)->s_journal->j_failed_commit) {
7001 printk(KERN_CRIT "EXT4-fs error (device %s): "
7002 "ext4_fill_super: Journal transaction "
7003 - "%u is corrupt\n", sb->s_id,
7004 + "%u is corrupt\n", sb->s_id,
7005 EXT4_SB(sb)->s_journal->j_failed_commit);
7006 - if (test_opt (sb, ERRORS_RO)) {
7008 - "Mounting filesystem read-only\n");
7009 + if (test_opt(sb, ERRORS_RO)) {
7011 + "Mounting filesystem read-only\n");
7012 sb->s_flags |= MS_RDONLY;
7013 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
7014 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
7015 @@ -2226,9 +2331,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
7020 - "ext4: No journal on filesystem on %s\n",
7023 + "ext4: No journal on filesystem on %s\n",
7028 @@ -2312,7 +2417,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
7032 - ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY);
7033 + ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY);
7035 /* determine the minimum size of new large inodes, if present */
7036 if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
7037 @@ -2351,12 +2456,19 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
7038 ext4_orphan_cleanup(sb, es);
7039 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
7041 - printk (KERN_INFO "EXT4-fs: recovery complete.\n");
7042 + printk(KERN_INFO "EXT4-fs: recovery complete.\n");
7043 ext4_mark_recovery_complete(sb, es);
7044 - printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
7045 - test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
7046 - test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
7048 + printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
7049 + test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
7050 + test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
7053 + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
7054 + printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
7055 + "requested data journaling mode\n");
7056 + clear_opt(sbi->s_mount_opt, DELALLOC);
7057 + } else if (test_opt(sb, DELALLOC))
7058 + printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
7061 ext4_mb_init(sb, needs_recovery);
7062 @@ -2372,6 +2484,7 @@ cantfind_ext4:
7065 jbd2_journal_destroy(sbi->s_journal);
7066 + sbi->s_journal = NULL;
7068 percpu_counter_destroy(&sbi->s_freeblocks_counter);
7069 percpu_counter_destroy(&sbi->s_freeinodes_counter);
7070 @@ -2461,14 +2574,14 @@ static journal_t *ext4_get_journal(struct super_block *sb,
7071 static journal_t *ext4_get_dev_journal(struct super_block *sb,
7074 - struct buffer_head * bh;
7075 + struct buffer_head *bh;
7079 int hblock, blocksize;
7080 ext4_fsblk_t sb_block;
7081 unsigned long offset;
7082 - struct ext4_super_block * es;
7083 + struct ext4_super_block *es;
7084 struct block_device *bdev;
7086 bdev = ext4_blkdev_get(j_dev);
7087 @@ -2583,8 +2696,8 @@ static int ext4_load_journal(struct super_block *sb,
7088 "unavailable, cannot proceed.\n");
7091 - printk (KERN_INFO "EXT4-fs: write access will "
7092 - "be enabled during recovery.\n");
7093 + printk(KERN_INFO "EXT4-fs: write access will "
7094 + "be enabled during recovery.\n");
7098 @@ -2637,8 +2750,8 @@ static int ext4_load_journal(struct super_block *sb,
7102 -static int ext4_create_journal(struct super_block * sb,
7103 - struct ext4_super_block * es,
7104 +static int ext4_create_journal(struct super_block *sb,
7105 + struct ext4_super_block *es,
7106 unsigned int journal_inum)
7109 @@ -2679,9 +2792,8 @@ static int ext4_create_journal(struct super_block * sb,
7113 -static void ext4_commit_super (struct super_block * sb,
7114 - struct ext4_super_block * es,
7116 +static void ext4_commit_super(struct super_block *sb,
7117 + struct ext4_super_block *es, int sync)
7119 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
7121 @@ -2702,8 +2814,8 @@ static void ext4_commit_super (struct super_block * sb,
7122 * remounting) the filesystem readonly, then we will end up with a
7123 * consistent fs on disk. Record that fact.
7125 -static void ext4_mark_recovery_complete(struct super_block * sb,
7126 - struct ext4_super_block * es)
7127 +static void ext4_mark_recovery_complete(struct super_block *sb,
7128 + struct ext4_super_block *es)
7130 journal_t *journal = EXT4_SB(sb)->s_journal;
7132 @@ -2725,8 +2837,8 @@ static void ext4_mark_recovery_complete(struct super_block * sb,
7133 * has recorded an error from a previous lifetime, move that error to the
7134 * main filesystem now.
7136 -static void ext4_clear_journal_err(struct super_block * sb,
7137 - struct ext4_super_block * es)
7138 +static void ext4_clear_journal_err(struct super_block *sb,
7139 + struct ext4_super_block *es)
7143 @@ -2751,7 +2863,7 @@ static void ext4_clear_journal_err(struct super_block * sb,
7145 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
7146 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
7147 - ext4_commit_super (sb, es, 1);
7148 + ext4_commit_super(sb, es, 1);
7150 jbd2_journal_clear_err(journal);
7152 @@ -2784,7 +2896,7 @@ int ext4_force_commit(struct super_block *sb)
7153 * This implicitly triggers the writebehind on sync().
7156 -static void ext4_write_super (struct super_block * sb)
7157 +static void ext4_write_super(struct super_block *sb)
7159 if (mutex_trylock(&sb->s_lock) != 0)
7161 @@ -2840,13 +2952,14 @@ static void ext4_unlockfs(struct super_block *sb)
7165 -static int ext4_remount (struct super_block * sb, int * flags, char * data)
7166 +static int ext4_remount(struct super_block *sb, int *flags, char *data)
7168 - struct ext4_super_block * es;
7169 + struct ext4_super_block *es;
7170 struct ext4_sb_info *sbi = EXT4_SB(sb);
7171 ext4_fsblk_t n_blocks_count = 0;
7172 unsigned long old_sb_flags;
7173 struct ext4_mount_options old_opts;
7178 @@ -2925,6 +3038,26 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
7182 + * Make sure the group descriptor checksums
7183 + * are sane. If they aren't, refuse to
7186 + for (g = 0; g < sbi->s_groups_count; g++) {
7187 + struct ext4_group_desc *gdp =
7188 + ext4_get_group_desc(sb, g, NULL);
7190 + if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
7192 + "EXT4-fs: ext4_remount: "
7193 + "Checksum for group %lu failed (%u!=%u)\n",
7194 + g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
7195 + le16_to_cpu(gdp->bg_checksum));
7197 + goto restore_opts;
7202 * If we have an unprocessed orphan list hanging
7203 * around from a previously readonly bdev mount,
7204 * require a full umount/remount for now.
7205 @@ -2949,7 +3082,7 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
7206 sbi->s_mount_state = le16_to_cpu(es->s_state);
7207 if ((err = ext4_group_extend(sb, es, n_blocks_count)))
7209 - if (!ext4_setup_super (sb, es, 0))
7210 + if (!ext4_setup_super(sb, es, 0))
7211 sb->s_flags &= ~MS_RDONLY;
7214 @@ -2979,7 +3112,7 @@ restore_opts:
7218 -static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
7219 +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
7221 struct super_block *sb = dentry->d_sb;
7222 struct ext4_sb_info *sbi = EXT4_SB(sb);
7223 @@ -3217,12 +3350,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
7225 /* Journaling quota? */
7226 if (EXT4_SB(sb)->s_qf_names[type]) {
7227 - /* Quotafile not of fs root? */
7228 + /* Quotafile not in fs root? */
7229 if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
7231 "EXT4-fs: Quota file not on filesystem root. "
7232 "Journaled quota will not work.\n");
7237 * When we journal data on quota file, we have to flush journal to see
7238 @@ -3325,7 +3458,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
7239 err = ext4_journal_dirty_metadata(handle, bh);
7241 /* Always do at least ordered writes for quotas */
7242 - err = ext4_journal_dirty_data(handle, bh);
7243 + err = ext4_jbd2_file_inode(handle, inode);
7244 mark_buffer_dirty(bh);
7247 diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
7248 index ff08633..8954208 100644
7249 --- a/fs/ext4/xattr.c
7250 +++ b/fs/ext4/xattr.c
7251 @@ -810,7 +810,7 @@ inserted:
7252 /* We need to allocate a new block */
7253 ext4_fsblk_t goal = ext4_group_first_block_no(sb,
7254 EXT4_I(inode)->i_block_group);
7255 - ext4_fsblk_t block = ext4_new_block(handle, inode,
7256 + ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
7260 @@ -1512,7 +1512,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
7261 char *name = entry->e_name;
7264 - for (n=0; n < entry->e_name_len; n++) {
7265 + for (n = 0; n < entry->e_name_len; n++) {
7266 hash = (hash << NAME_HASH_SHIFT) ^
7267 (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
7269 diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
7270 index fff3338..ac1a52c 100644
7271 --- a/fs/ext4/xattr_trusted.c
7272 +++ b/fs/ext4/xattr_trusted.c
7277 -#define XATTR_TRUSTED_PREFIX "trusted."
7280 ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
7281 const char *name, size_t name_len)
7283 - const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
7284 + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
7285 const size_t total_len = prefix_len + name_len + 1;
7287 if (!capable(CAP_SYS_ADMIN))
7288 diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
7289 index 67be723..d91aa61 100644
7290 --- a/fs/ext4/xattr_user.c
7291 +++ b/fs/ext4/xattr_user.c
7296 -#define XATTR_USER_PREFIX "user."
7299 ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
7300 const char *name, size_t name_len)
7302 - const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
7303 + const size_t prefix_len = XATTR_USER_PREFIX_LEN;
7304 const size_t total_len = prefix_len + name_len + 1;
7306 if (!test_opt(inode->i_sb, XATTR_USER))
7307 diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
7308 index 6914598..91389c8 100644
7309 --- a/fs/jbd2/checkpoint.c
7310 +++ b/fs/jbd2/checkpoint.c
7311 @@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
7313 J_ASSERT(transaction->t_state == T_FINISHED);
7314 J_ASSERT(transaction->t_buffers == NULL);
7315 - J_ASSERT(transaction->t_sync_datalist == NULL);
7316 J_ASSERT(transaction->t_forget == NULL);
7317 J_ASSERT(transaction->t_iobuf_list == NULL);
7318 J_ASSERT(transaction->t_shadow_list == NULL);
7319 diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
7320 index a2ed72f..adf0395 100644
7321 --- a/fs/jbd2/commit.c
7322 +++ b/fs/jbd2/commit.c
7324 #include <linux/pagemap.h>
7325 #include <linux/jiffies.h>
7326 #include <linux/crc32.h>
7327 +#include <linux/writeback.h>
7328 +#include <linux/backing-dev.h>
7331 * Default IO end handler for temporary BJ_IO buffer_heads.
7332 @@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
7336 - * When an ext3-ordered file is truncated, it is possible that many pages are
7337 - * not sucessfully freed, because they are attached to a committing transaction.
7338 + * When an ext4 file is truncated, it is possible that some pages are not
7339 + * successfully freed, because they are attached to a committing transaction.
7340 * After the transaction commits, these pages are left on the LRU, with no
7341 * ->mapping, and with attached buffers. These pages are trivially reclaimable
7342 * by the VM, but their apparent absence upsets the VM accounting, and it makes
7343 @@ -80,21 +82,6 @@ nope:
7347 - * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
7348 - * held. For ranking reasons we must trylock. If we lose, schedule away and
7349 - * return 0. j_list_lock is dropped in this case.
7351 -static int inverted_lock(journal_t *journal, struct buffer_head *bh)
7353 - if (!jbd_trylock_bh_state(bh)) {
7354 - spin_unlock(&journal->j_list_lock);
7362 * Done it all: now submit the commit record. We should have
7363 * cleaned up our previous buffers by now, so if we are in abort
7364 * mode we can now just skip the rest of the journal write
7365 @@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
7366 struct buffer_head *bh;
7368 int barrier_done = 0;
7369 + struct timespec now = current_kernel_time();
7371 if (is_journal_aborted(journal))
7373 @@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
7374 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
7375 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
7376 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
7377 + tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
7378 + tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
7380 if (JBD2_HAS_COMPAT_FEATURE(journal,
7381 JBD2_FEATURE_COMPAT_CHECKSUM)) {
7382 @@ -197,159 +187,114 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
7386 - * Wait for all submitted IO to complete.
7387 + * write the filemap data using writepage() address_space_operations.
7388 + * We don't do block allocation here even for delalloc. We don't
7389 + * use writepages() because with delayed allocation we may be doing
7390 + * block allocation in writepages().
7392 -static int journal_wait_on_locked_list(journal_t *journal,
7393 - transaction_t *commit_transaction)
7394 +static int journal_submit_inode_data_buffers(struct address_space *mapping)
7397 - struct journal_head *jh;
7399 - while (commit_transaction->t_locked_list) {
7400 - struct buffer_head *bh;
7402 - jh = commit_transaction->t_locked_list->b_tprev;
7405 - if (buffer_locked(bh)) {
7406 - spin_unlock(&journal->j_list_lock);
7407 - wait_on_buffer(bh);
7408 - if (unlikely(!buffer_uptodate(bh)))
7410 - spin_lock(&journal->j_list_lock);
7412 - if (!inverted_lock(journal, bh)) {
7414 - spin_lock(&journal->j_list_lock);
7417 - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
7418 - __jbd2_journal_unfile_buffer(jh);
7419 - jbd_unlock_bh_state(bh);
7420 - jbd2_journal_remove_journal_head(bh);
7423 - jbd_unlock_bh_state(bh);
7426 - cond_resched_lock(&journal->j_list_lock);
7429 + struct writeback_control wbc = {
7430 + .sync_mode = WB_SYNC_ALL,
7431 + .nr_to_write = mapping->nrpages * 2,
7433 + .range_end = i_size_read(mapping->host),
7434 + .for_writepages = 1,
7437 + ret = generic_writepages(mapping, &wbc);
7442 -static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
7444 + * Submit all the data buffers of inode associated with the transaction to
7447 + * We are in a committing transaction. Therefore no new inode can be added to
7448 + * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
7449 + * operate on from being released while we write out pages.
7451 +static int journal_submit_data_buffers(journal_t *journal,
7452 + transaction_t *commit_transaction)
7455 + struct jbd2_inode *jinode;
7457 + struct address_space *mapping;
7459 - for (i = 0; i < bufs; i++) {
7460 - wbuf[i]->b_end_io = end_buffer_write_sync;
7461 - /* We use-up our safety reference in submit_bh() */
7462 - submit_bh(WRITE, wbuf[i]);
7463 + spin_lock(&journal->j_list_lock);
7464 + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
7465 + mapping = jinode->i_vfs_inode->i_mapping;
7466 + jinode->i_flags |= JI_COMMIT_RUNNING;
7467 + spin_unlock(&journal->j_list_lock);
7469 + * submit the inode data buffers. We use writepage
7470 + * instead of writepages. Because writepages can do
7471 + * block allocation with delalloc. We need to write
7472 + * only allocated blocks here.
7474 + err = journal_submit_inode_data_buffers(mapping);
7477 + spin_lock(&journal->j_list_lock);
7478 + J_ASSERT(jinode->i_transaction == commit_transaction);
7479 + jinode->i_flags &= ~JI_COMMIT_RUNNING;
7480 + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
7482 + spin_unlock(&journal->j_list_lock);
7487 - * Submit all the data buffers to disk
7488 + * Wait for data submitted for writeout, refile inodes to proper
7489 + * transaction if needed.
7492 -static void journal_submit_data_buffers(journal_t *journal,
7493 - transaction_t *commit_transaction)
7494 +static int journal_finish_inode_data_buffers(journal_t *journal,
7495 + transaction_t *commit_transaction)
7497 - struct journal_head *jh;
7498 - struct buffer_head *bh;
7501 - struct buffer_head **wbuf = journal->j_wbuf;
7502 + struct jbd2_inode *jinode, *next_i;
7506 - * Whenever we unlock the journal and sleep, things can get added
7507 - * onto ->t_sync_datalist, so we have to keep looping back to
7508 - * write_out_data until we *know* that the list is empty.
7510 - * Cleanup any flushed data buffers from the data list. Even in
7511 - * abort mode, we want to flush this out as soon as possible.
7515 + /* For locking, see the comment in journal_submit_data_buffers() */
7516 spin_lock(&journal->j_list_lock);
7518 - while (commit_transaction->t_sync_datalist) {
7519 - jh = commit_transaction->t_sync_datalist;
7523 - /* Get reference just to make sure buffer does not disappear
7524 - * when we are forced to drop various locks */
7526 - /* If the buffer is dirty, we need to submit IO and hence
7527 - * we need the buffer lock. We try to lock the buffer without
7528 - * blocking. If we fail, we need to drop j_list_lock and do
7529 - * blocking lock_buffer().
7531 - if (buffer_dirty(bh)) {
7532 - if (test_set_buffer_locked(bh)) {
7533 - BUFFER_TRACE(bh, "needs blocking lock");
7534 - spin_unlock(&journal->j_list_lock);
7535 - /* Write out all data to prevent deadlocks */
7536 - journal_do_submit_data(wbuf, bufs);
7539 - spin_lock(&journal->j_list_lock);
7543 - /* We have to get bh_state lock. Again out of order, sigh. */
7544 - if (!inverted_lock(journal, bh)) {
7545 - jbd_lock_bh_state(bh);
7546 - spin_lock(&journal->j_list_lock);
7548 - /* Someone already cleaned up the buffer? */
7549 - if (!buffer_jbd(bh)
7550 - || jh->b_transaction != commit_transaction
7551 - || jh->b_jlist != BJ_SyncData) {
7552 - jbd_unlock_bh_state(bh);
7554 - unlock_buffer(bh);
7555 - BUFFER_TRACE(bh, "already cleaned up");
7559 - if (locked && test_clear_buffer_dirty(bh)) {
7560 - BUFFER_TRACE(bh, "needs writeout, adding to array");
7561 - wbuf[bufs++] = bh;
7562 - __jbd2_journal_file_buffer(jh, commit_transaction,
7564 - jbd_unlock_bh_state(bh);
7565 - if (bufs == journal->j_wbufsize) {
7566 - spin_unlock(&journal->j_list_lock);
7567 - journal_do_submit_data(wbuf, bufs);
7569 - goto write_out_data;
7571 - } else if (!locked && buffer_locked(bh)) {
7572 - __jbd2_journal_file_buffer(jh, commit_transaction,
7574 - jbd_unlock_bh_state(bh);
7577 - BUFFER_TRACE(bh, "writeout complete: unfile");
7578 - __jbd2_journal_unfile_buffer(jh);
7579 - jbd_unlock_bh_state(bh);
7581 - unlock_buffer(bh);
7582 - jbd2_journal_remove_journal_head(bh);
7583 - /* Once for our safety reference, once for
7584 - * jbd2_journal_remove_journal_head() */
7587 + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
7588 + jinode->i_flags |= JI_COMMIT_RUNNING;
7589 + spin_unlock(&journal->j_list_lock);
7590 + err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
7593 + * Because AS_EIO is cleared by
7594 + * wait_on_page_writeback_range(), set it again so
7595 + * that user process can get -EIO from fsync().
7598 + &jinode->i_vfs_inode->i_mapping->flags);
7603 + spin_lock(&journal->j_list_lock);
7604 + jinode->i_flags &= ~JI_COMMIT_RUNNING;
7605 + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
7608 - if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
7609 - spin_unlock(&journal->j_list_lock);
7610 - goto write_out_data;
7611 + /* Now refile inode to proper lists */
7612 + list_for_each_entry_safe(jinode, next_i,
7613 + &commit_transaction->t_inode_list, i_list) {
7614 + list_del(&jinode->i_list);
7615 + if (jinode->i_next_transaction) {
7616 + jinode->i_transaction = jinode->i_next_transaction;
7617 + jinode->i_next_transaction = NULL;
7618 + list_add(&jinode->i_list,
7619 + &jinode->i_transaction->t_inode_list);
7621 + jinode->i_transaction = NULL;
7624 spin_unlock(&journal->j_list_lock);
7625 - journal_do_submit_data(wbuf, bufs);
7630 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
7631 @@ -524,21 +469,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
7632 * Now start flushing things to disk, in the order they appear
7633 * on the transaction lists. Data blocks go first.
7636 - journal_submit_data_buffers(journal, commit_transaction);
7639 - * Wait for all previously submitted IO to complete if commit
7640 - * record is to be written synchronously.
7642 - spin_lock(&journal->j_list_lock);
7643 - if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
7644 - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
7645 - err = journal_wait_on_locked_list(journal,
7646 - commit_transaction);
7648 - spin_unlock(&journal->j_list_lock);
7650 + err = journal_submit_data_buffers(journal, commit_transaction);
7652 jbd2_journal_abort(journal, err);
7654 @@ -547,16 +478,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
7655 jbd_debug(3, "JBD: commit phase 2\n");
7658 - * If we found any dirty or locked buffers, then we should have
7659 - * looped back up to the write_out_data label. If there weren't
7660 - * any then journal_clean_data_list should have wiped the list
7661 - * clean by now, so check that it is in fact empty.
7663 - J_ASSERT (commit_transaction->t_sync_datalist == NULL);
7665 - jbd_debug (3, "JBD: commit phase 3\n");
7668 * Way to go: we have now written out all of the data for a
7669 * transaction! Now comes the tricky part: we need to write out
7670 * metadata. Loop over the transaction's entire buffer list:
7671 @@ -574,6 +495,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
7672 J_ASSERT(commit_transaction->t_nr_buffers <=
7673 commit_transaction->t_outstanding_credits);
7678 while (commit_transaction->t_buffers) {
7679 @@ -748,13 +670,23 @@ start_journal_io:
7682 __jbd2_journal_abort_hard(journal);
7685 - spin_lock(&journal->j_list_lock);
7686 - err = journal_wait_on_locked_list(journal,
7687 - commit_transaction);
7688 - spin_unlock(&journal->j_list_lock);
7690 - __jbd2_journal_abort_hard(journal);
7692 + * This is the right place to wait for data buffers both for ASYNC
7693 + * and !ASYNC commit. If commit is ASYNC, we need to wait only after
7694 + * the commit block went to disk (which happens above). If commit is
7695 + * SYNC, we need to wait for data buffers before we start writing
7696 + * commit block, which happens below in such setting.
7698 + err = journal_finish_inode_data_buffers(journal, commit_transaction);
7700 + char b[BDEVNAME_SIZE];
7702 + printk(KERN_WARNING
7703 + "JBD2: Detected IO errors while flushing file data "
7704 + "on %s\n", bdevname(journal->j_fs_dev, b));
7708 /* Lo and behold: we have just managed to send a transaction to
7709 @@ -768,7 +700,7 @@ start_journal_io:
7710 so we incur less scheduling load.
7713 - jbd_debug(3, "JBD: commit phase 4\n");
7714 + jbd_debug(3, "JBD: commit phase 3\n");
7717 * akpm: these are BJ_IO, and j_list_lock is not needed.
7718 @@ -827,7 +759,7 @@ wait_for_iobuf:
7720 J_ASSERT (commit_transaction->t_shadow_list == NULL);
7722 - jbd_debug(3, "JBD: commit phase 5\n");
7723 + jbd_debug(3, "JBD: commit phase 4\n");
7725 /* Here we wait for the revoke record and descriptor record buffers */
7727 @@ -854,7 +786,7 @@ wait_for_iobuf:
7728 /* AKPM: bforget here */
7731 - jbd_debug(3, "JBD: commit phase 6\n");
7732 + jbd_debug(3, "JBD: commit phase 5\n");
7734 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
7735 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
7736 @@ -874,9 +806,9 @@ wait_for_iobuf:
7737 transaction can be removed from any checkpoint list it was on
7740 - jbd_debug(3, "JBD: commit phase 7\n");
7741 + jbd_debug(3, "JBD: commit phase 6\n");
7743 - J_ASSERT(commit_transaction->t_sync_datalist == NULL);
7744 + J_ASSERT(list_empty(&commit_transaction->t_inode_list));
7745 J_ASSERT(commit_transaction->t_buffers == NULL);
7746 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
7747 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
7748 @@ -997,7 +929,7 @@ restart_loop:
7750 /* Done with this transaction! */
7752 - jbd_debug(3, "JBD: commit phase 8\n");
7753 + jbd_debug(3, "JBD: commit phase 7\n");
7755 J_ASSERT(commit_transaction->t_state == T_COMMIT);
7757 diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
7758 index 2e24567..8207a01 100644
7759 --- a/fs/jbd2/journal.c
7760 +++ b/fs/jbd2/journal.c
7761 @@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
7762 EXPORT_SYMBOL(jbd2_journal_get_write_access);
7763 EXPORT_SYMBOL(jbd2_journal_get_create_access);
7764 EXPORT_SYMBOL(jbd2_journal_get_undo_access);
7765 -EXPORT_SYMBOL(jbd2_journal_dirty_data);
7766 EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
7767 EXPORT_SYMBOL(jbd2_journal_release_buffer);
7768 EXPORT_SYMBOL(jbd2_journal_forget);
7769 @@ -69,7 +68,6 @@ EXPORT_SYMBOL(jbd2_journal_set_features);
7770 EXPORT_SYMBOL(jbd2_journal_create);
7771 EXPORT_SYMBOL(jbd2_journal_load);
7772 EXPORT_SYMBOL(jbd2_journal_destroy);
7773 -EXPORT_SYMBOL(jbd2_journal_update_superblock);
7774 EXPORT_SYMBOL(jbd2_journal_abort);
7775 EXPORT_SYMBOL(jbd2_journal_errno);
7776 EXPORT_SYMBOL(jbd2_journal_ack_err);
7777 @@ -82,6 +80,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
7778 EXPORT_SYMBOL(jbd2_journal_invalidatepage);
7779 EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
7780 EXPORT_SYMBOL(jbd2_journal_force_commit);
7781 +EXPORT_SYMBOL(jbd2_journal_file_inode);
7782 +EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
7783 +EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
7784 +EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
7786 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
7787 static void __journal_abort_soft (journal_t *journal, int errno);
7788 @@ -2195,6 +2197,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
7792 + * Initialize jbd inode head
7794 +void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
7796 + jinode->i_transaction = NULL;
7797 + jinode->i_next_transaction = NULL;
7798 + jinode->i_vfs_inode = inode;
7799 + jinode->i_flags = 0;
7800 + INIT_LIST_HEAD(&jinode->i_list);
7804 + * Function to be called before we start removing inode from memory (i.e.,
7805 + * clear_inode() is a fine place to be called from). It removes inode from
7806 + * transaction's lists.
7808 +void jbd2_journal_release_jbd_inode(journal_t *journal,
7809 + struct jbd2_inode *jinode)
7816 + spin_lock(&journal->j_list_lock);
7817 + /* Is commit writing out inode - we have to wait */
7818 + if (jinode->i_flags & JI_COMMIT_RUNNING) {
7819 + wait_queue_head_t *wq;
7820 + DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
7821 + wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
7822 + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
7823 + spin_unlock(&journal->j_list_lock);
7825 + finish_wait(wq, &wait.wait);
7829 + /* Do we need to wait for data writeback? */
7830 + if (journal->j_committing_transaction == jinode->i_transaction)
7832 + if (jinode->i_transaction) {
7833 + list_del(&jinode->i_list);
7834 + jinode->i_transaction = NULL;
7836 + spin_unlock(&journal->j_list_lock);
7842 #ifdef CONFIG_JBD2_DEBUG
7843 diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
7844 index d6e006e..4f7cadb 100644
7845 --- a/fs/jbd2/transaction.c
7846 +++ b/fs/jbd2/transaction.c
7847 @@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
7848 * new transaction and we can't block without protecting against other
7849 * processes trying to touch the journal while it is in transition.
7851 - * Called under j_state_lock
7854 static transaction_t *
7855 @@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
7856 transaction->t_tid = journal->j_transaction_sequence++;
7857 transaction->t_expires = jiffies + journal->j_commit_interval;
7858 spin_lock_init(&transaction->t_handle_lock);
7859 + INIT_LIST_HEAD(&transaction->t_inode_list);
7861 /* Set up the commit timer for the new transaction. */
7862 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
7863 @@ -943,183 +943,6 @@ out:
7867 - * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
7868 - * needs to be flushed before we can commit the
7869 - * current transaction.
7870 - * @handle: transaction
7871 - * @bh: bufferhead to mark
7873 - * The buffer is placed on the transaction's data list and is marked as
7874 - * belonging to the transaction.
7876 - * Returns error number or 0 on success.
7878 - * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
7881 -int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
7883 - journal_t *journal = handle->h_transaction->t_journal;
7884 - int need_brelse = 0;
7885 - struct journal_head *jh;
7887 - if (is_handle_aborted(handle))
7890 - jh = jbd2_journal_add_journal_head(bh);
7891 - JBUFFER_TRACE(jh, "entry");
7894 - * The buffer could *already* be dirty. Writeout can start
7897 - jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
7900 - * What if the buffer is already part of a running transaction?
7902 - * There are two cases:
7903 - * 1) It is part of the current running transaction. Refile it,
7904 - * just in case we have allocated it as metadata, deallocated
7905 - * it, then reallocated it as data.
7906 - * 2) It is part of the previous, still-committing transaction.
7907 - * If all we want to do is to guarantee that the buffer will be
7908 - * written to disk before this new transaction commits, then
7909 - * being sure that the *previous* transaction has this same
7910 - * property is sufficient for us! Just leave it on its old
7913 - * In case (2), the buffer must not already exist as metadata
7914 - * --- that would violate write ordering (a transaction is free
7915 - * to write its data at any point, even before the previous
7916 - * committing transaction has committed). The caller must
7917 - * never, ever allow this to happen: there's nothing we can do
7918 - * about it in this layer.
7920 - jbd_lock_bh_state(bh);
7921 - spin_lock(&journal->j_list_lock);
7923 - /* Now that we have bh_state locked, are we really still mapped? */
7924 - if (!buffer_mapped(bh)) {
7925 - JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
7929 - if (jh->b_transaction) {
7930 - JBUFFER_TRACE(jh, "has transaction");
7931 - if (jh->b_transaction != handle->h_transaction) {
7932 - JBUFFER_TRACE(jh, "belongs to older transaction");
7933 - J_ASSERT_JH(jh, jh->b_transaction ==
7934 - journal->j_committing_transaction);
7936 - /* @@@ IS THIS TRUE ? */
7938 - * Not any more. Scenario: someone does a write()
7939 - * in data=journal mode. The buffer's transaction has
7940 - * moved into commit. Then someone does another
7941 - * write() to the file. We do the frozen data copyout
7942 - * and set b_next_transaction to point to j_running_t.
7943 - * And while we're in that state, someone does a
7944 - * writepage() in an attempt to pageout the same area
7945 - * of the file via a shared mapping. At present that
7946 - * calls jbd2_journal_dirty_data(), and we get right here.
7947 - * It may be too late to journal the data. Simply
7948 - * falling through to the next test will suffice: the
7949 - * data will be dirty and wil be checkpointed. The
7950 - * ordering comments in the next comment block still
7953 - //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
7956 - * If we're journalling data, and this buffer was
7957 - * subject to a write(), it could be metadata, forget
7958 - * or shadow against the committing transaction. Now,
7959 - * someone has dirtied the same darn page via a mapping
7960 - * and it is being writepage()'d.
7961 - * We *could* just steal the page from commit, with some
7962 - * fancy locking there. Instead, we just skip it -
7963 - * don't tie the page's buffers to the new transaction
7965 - * Implication: if we crash before the writepage() data
7966 - * is written into the filesystem, recovery will replay
7967 - * the write() data.
7969 - if (jh->b_jlist != BJ_None &&
7970 - jh->b_jlist != BJ_SyncData &&
7971 - jh->b_jlist != BJ_Locked) {
7972 - JBUFFER_TRACE(jh, "Not stealing");
7977 - * This buffer may be undergoing writeout in commit. We
7978 - * can't return from here and let the caller dirty it
7979 - * again because that can cause the write-out loop in
7980 - * commit to never terminate.
7982 - if (buffer_dirty(bh)) {
7984 - spin_unlock(&journal->j_list_lock);
7985 - jbd_unlock_bh_state(bh);
7987 - sync_dirty_buffer(bh);
7988 - jbd_lock_bh_state(bh);
7989 - spin_lock(&journal->j_list_lock);
7990 - /* Since we dropped the lock... */
7991 - if (!buffer_mapped(bh)) {
7992 - JBUFFER_TRACE(jh, "buffer got unmapped");
7995 - /* The buffer may become locked again at any
7996 - time if it is redirtied */
7999 - /* journal_clean_data_list() may have got there first */
8000 - if (jh->b_transaction != NULL) {
8001 - JBUFFER_TRACE(jh, "unfile from commit");
8002 - __jbd2_journal_temp_unlink_buffer(jh);
8003 - /* It still points to the committing
8004 - * transaction; move it to this one so
8005 - * that the refile assert checks are
8007 - jh->b_transaction = handle->h_transaction;
8009 - /* The buffer will be refiled below */
8013 - * Special case --- the buffer might actually have been
8014 - * allocated and then immediately deallocated in the previous,
8015 - * committing transaction, so might still be left on that
8016 - * transaction's metadata lists.
8018 - if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
8019 - JBUFFER_TRACE(jh, "not on correct data list: unfile");
8020 - J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
8021 - __jbd2_journal_temp_unlink_buffer(jh);
8022 - jh->b_transaction = handle->h_transaction;
8023 - JBUFFER_TRACE(jh, "file as data");
8024 - __jbd2_journal_file_buffer(jh, handle->h_transaction,
8028 - JBUFFER_TRACE(jh, "not on a transaction");
8029 - __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
8032 - spin_unlock(&journal->j_list_lock);
8033 - jbd_unlock_bh_state(bh);
8034 - if (need_brelse) {
8035 - BUFFER_TRACE(bh, "brelse");
8038 - JBUFFER_TRACE(jh, "exit");
8039 - jbd2_journal_put_journal_head(jh);
8044 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
8045 * @handle: transaction to add buffer to.
8046 * @bh: buffer to mark
8047 @@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
8048 * Remove a buffer from the appropriate transaction list.
8050 * Note that this function can *change* the value of
8051 - * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
8052 - * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
8053 - * is holding onto a copy of one of thee pointers, it could go bad.
8054 - * Generally the caller needs to re-read the pointer from the transaction_t.
8055 + * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
8056 + * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
8057 + * of these pointers, it could go bad. Generally the caller needs to re-read
8058 + * the pointer from the transaction_t.
8060 * Called under j_list_lock. The journal may not be locked.
8062 @@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
8063 switch (jh->b_jlist) {
8067 - list = &transaction->t_sync_datalist;
8070 transaction->t_nr_buffers--;
8071 J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
8072 @@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
8074 list = &transaction->t_reserved_list;
8077 - list = &transaction->t_locked_list;
8081 __blist_del_buffer(list, jh);
8082 @@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
8085 spin_lock(&journal->j_list_lock);
8086 - if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
8087 - if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
8088 - /* A written-back ordered data buffer */
8089 - JBUFFER_TRACE(jh, "release data");
8090 - __jbd2_journal_unfile_buffer(jh);
8091 - jbd2_journal_remove_journal_head(bh);
8094 - } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
8095 + if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
8096 /* written-back checkpointed metadata buffer */
8097 if (jh->b_jlist == BJ_None) {
8098 JBUFFER_TRACE(jh, "remove from checkpoint list");
8099 @@ -1656,12 +1465,43 @@ out:
8104 + * jbd2_journal_try_to_free_buffers() could race with
8105 + * jbd2_journal_commit_transaction(). The latter might still hold the
8106 + * reference count to the buffers when inspecting them on
8107 + * t_syncdata_list or t_locked_list.
8109 + * jbd2_journal_try_to_free_buffers() will call this function to
8110 + * wait for the current transaction to finish syncing data buffers, before
8111 + * try to free that buffer.
8113 + * Called with journal->j_state_lock held.
8115 +static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
8117 + transaction_t *transaction;
8120 + spin_lock(&journal->j_state_lock);
8121 + transaction = journal->j_committing_transaction;
8123 + if (!transaction) {
8124 + spin_unlock(&journal->j_state_lock);
8128 + tid = transaction->t_tid;
8129 + spin_unlock(&journal->j_state_lock);
8130 + jbd2_log_wait_commit(journal, tid);
8134 * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
8135 * @journal: journal for operation
8136 * @page: to try and free
8137 - * @unused_gfp_mask: unused
8138 + * @gfp_mask: we use the mask to detect how hard should we try to release
8139 + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
8140 + * release the buffers.
8143 * For all the buffers on this page,
8144 @@ -1690,9 +1530,11 @@ out:
8145 * journal_try_to_free_buffer() is changing its state. But that
8146 * cannot happen because we never reallocate freed data as metadata
8147 * while the data is part of a transaction. Yes?
8149 + * Return 0 on failure, 1 on success
8151 int jbd2_journal_try_to_free_buffers(journal_t *journal,
8152 - struct page *page, gfp_t unused_gfp_mask)
8153 + struct page *page, gfp_t gfp_mask)
8155 struct buffer_head *head;
8156 struct buffer_head *bh;
8157 @@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
8159 * We take our own ref against the journal_head here to avoid
8160 * having to add tons of locking around each instance of
8161 - * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
8162 + * jbd2_journal_remove_journal_head() and
8163 + * jbd2_journal_put_journal_head().
8165 jh = jbd2_journal_grab_journal_head(bh);
8167 @@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
8170 } while ((bh = bh->b_this_page) != head);
8172 ret = try_to_free_buffers(page);
8175 + * There are a number of places where jbd2_journal_try_to_free_buffers()
8176 + * could race with jbd2_journal_commit_transaction(), the latter still
8177 + * holds the reference to the buffers to free while processing them.
8178 + * try_to_free_buffers() failed to free those buffers. Some of the
8179 + * callers of releasepage() request page buffers to be dropped, otherwise
8180 + * treat the fail-to-free as errors (such as generic_file_direct_IO())
8182 + * So, if the caller of try_to_release_page() wants the synchronous
8183 + * behaviour (i.e. make sure buffers are dropped upon return),
8184 + * let's wait for the current transaction to finish flush of
8185 + * dirty data buffers, then try to free those buffers again,
8186 + * with the journal locked.
8188 + if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
8189 + jbd2_journal_wait_for_transaction_sync_data(journal);
8190 + ret = try_to_free_buffers(page);
8196 @@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
8197 if (!buffer_jbd(bh))
8198 goto zap_buffer_unlocked;
8200 + /* OK, we have data buffer in journaled mode */
8201 spin_lock(&journal->j_state_lock);
8202 jbd_lock_bh_state(bh);
8203 spin_lock(&journal->j_list_lock);
8204 @@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
8206 } else if (transaction == journal->j_committing_transaction) {
8207 JBUFFER_TRACE(jh, "on committing transaction");
8208 - if (jh->b_jlist == BJ_Locked) {
8210 - * The buffer is on the committing transaction's locked
8211 - * list. We have the buffer locked, so I/O has
8212 - * completed. So we can nail the buffer now.
8214 - may_free = __dispose_buffer(jh, transaction);
8218 * If it is committing, we simply cannot touch it. We
8219 * can remove it's next_transaction pointer from the
8220 @@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
8221 J_ASSERT_JH(jh, !jh->b_committed_data);
8222 J_ASSERT_JH(jh, !jh->b_frozen_data);
8225 - list = &transaction->t_sync_datalist;
8228 transaction->t_nr_buffers++;
8229 list = &transaction->t_buffers;
8230 @@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
8232 list = &transaction->t_reserved_list;
8235 - list = &transaction->t_locked_list;
8239 __blist_add_buffer(list, jh);
8240 @@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
8241 spin_unlock(&journal->j_list_lock);
8246 + * File inode in the inode list of the handle's transaction
8248 +int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
8250 + transaction_t *transaction = handle->h_transaction;
8251 + journal_t *journal = transaction->t_journal;
8253 + if (is_handle_aborted(handle))
8256 + jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
8257 + transaction->t_tid);
8260 + * First check whether inode isn't already on the transaction's
8261 + * lists without taking the lock. Note that this check is safe
8262 + * without the lock as we cannot race with somebody removing inode
8263 + * from the transaction. The reason is that we remove inode from the
8264 + * transaction only in journal_release_jbd_inode() and when we commit
8265 + * the transaction. We are guarded from the first case by holding
8266 + * a reference to the inode. We are safe against the second case
8267 + * because if jinode->i_transaction == transaction, commit code
8268 + * cannot touch the transaction because we hold reference to it,
8269 + * and if jinode->i_next_transaction == transaction, commit code
8270 + * will only file the inode where we want it.
8272 + if (jinode->i_transaction == transaction ||
8273 + jinode->i_next_transaction == transaction)
8276 + spin_lock(&journal->j_list_lock);
8278 + if (jinode->i_transaction == transaction ||
8279 + jinode->i_next_transaction == transaction)
8282 + /* On some different transaction's list - should be
8283 + * the committing one */
8284 + if (jinode->i_transaction) {
8285 + J_ASSERT(jinode->i_next_transaction == NULL);
8286 + J_ASSERT(jinode->i_transaction ==
8287 + journal->j_committing_transaction);
8288 + jinode->i_next_transaction = transaction;
8291 + /* Not on any transaction list... */
8292 + J_ASSERT(!jinode->i_next_transaction);
8293 + jinode->i_transaction = transaction;
8294 + list_add(&jinode->i_list, &transaction->t_inode_list);
8296 + spin_unlock(&journal->j_list_lock);
8302 + * This function must be called when inode is journaled in ordered mode
8303 + * before truncation happens. It starts writeout of truncated part in
8304 + * case it is in the committing transaction so that we stand to ordered
8305 + * mode consistency guarantees.
8307 +int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
8310 + journal_t *journal;
8311 + transaction_t *commit_trans;
8314 + if (!inode->i_transaction && !inode->i_next_transaction)
8316 + journal = inode->i_transaction->t_journal;
8317 + spin_lock(&journal->j_state_lock);
8318 + commit_trans = journal->j_committing_transaction;
8319 + spin_unlock(&journal->j_state_lock);
8320 + if (inode->i_transaction == commit_trans) {
8321 + ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
8322 + new_size, LLONG_MAX);
8324 + jbd2_journal_abort(journal, ret);
8329 diff --git a/fs/mpage.c b/fs/mpage.c
8330 index 235e4d3..dbcc7af 100644
8333 @@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
8337 -static struct bio *mpage_bio_submit(int rw, struct bio *bio)
8338 +struct bio *mpage_bio_submit(int rw, struct bio *bio)
8340 bio->bi_end_io = mpage_end_io_read;
8342 @@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
8343 submit_bio(rw, bio);
8346 +EXPORT_SYMBOL(mpage_bio_submit);
8349 mpage_alloc(struct block_device *bdev,
8350 @@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage);
8351 * written, so it can intelligently allocate a suitably-sized BIO. For now,
8352 * just allocate full-size (16-page) BIOs.
8354 -struct mpage_data {
8356 - sector_t last_block_in_bio;
8357 - get_block_t *get_block;
8358 - unsigned use_writepage;
8361 -static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
8363 +int __mpage_writepage(struct page *page, struct writeback_control *wbc,
8366 struct mpage_data *mpd = data;
8367 struct bio *bio = mpd->bio;
8368 @@ -651,6 +646,7 @@ out:
8372 +EXPORT_SYMBOL(__mpage_writepage);
8375 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
8376 diff --git a/include/linux/fs.h b/include/linux/fs.h
8377 index d8e2762..97f992a 100644
8378 --- a/include/linux/fs.h
8379 +++ b/include/linux/fs.h
8380 @@ -1740,6 +1740,8 @@ extern int wait_on_page_writeback_range(struct address_space *mapping,
8381 pgoff_t start, pgoff_t end);
8382 extern int __filemap_fdatawrite_range(struct address_space *mapping,
8383 loff_t start, loff_t end, int sync_mode);
8384 +extern int filemap_fdatawrite_range(struct address_space *mapping,
8385 + loff_t start, loff_t end);
8387 extern long do_fsync(struct file *file, int datasync);
8388 extern void sync_supers(void);
8389 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
8390 index d147f0f..3dd2090 100644
8391 --- a/include/linux/jbd2.h
8392 +++ b/include/linux/jbd2.h
8393 @@ -168,6 +168,8 @@ struct commit_header {
8394 unsigned char h_chksum_size;
8395 unsigned char h_padding[2];
8396 __be32 h_chksum[JBD2_CHECKSUM_BYTES];
8397 + __be64 h_commit_sec;
8398 + __be32 h_commit_nsec;
8402 @@ -379,6 +381,38 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
8403 bit_spin_unlock(BH_JournalHead, &bh->b_state);
8406 +/* Flags in jbd_inode->i_flags */
8407 +#define __JI_COMMIT_RUNNING 0
8408 +/* Commit of the inode data is in progress. We use this flag to protect us from
8409 + * concurrent deletion of the inode. We cannot take a reference to the inode
8410 + * for this since we cannot afford doing the last iput() on behalf of kjournald
8412 +#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
8415 + * struct jbd2_inode is the structure linking inodes in ordered mode
8416 + * present in a transaction so that we can sync them during commit.
8418 +struct jbd2_inode {
8419 + /* Which transaction does this inode belong to? Either the running
8420 + * transaction or the committing one. [j_list_lock] */
8421 + transaction_t *i_transaction;
8423 + /* Pointer to the running transaction modifying inode's data in case
8424 + * there is already a committing transaction touching it. [j_list_lock] */
8425 + transaction_t *i_next_transaction;
8427 + /* List of inodes in the i_transaction [j_list_lock] */
8428 + struct list_head i_list;
8430 + /* VFS inode this inode belongs to [constant during the lifetime
8431 + * of the structure] */
8432 + struct inode *i_vfs_inode;
8434 + /* Flags of inode [j_list_lock] */
8435 + unsigned int i_flags;
8438 struct jbd2_revoke_table_s;
8441 @@ -509,24 +543,12 @@ struct transaction_s
8442 struct journal_head *t_reserved_list;
8445 - * Doubly-linked circular list of all buffers under writeout during
8446 - * commit [j_list_lock]
8448 - struct journal_head *t_locked_list;
8451 * Doubly-linked circular list of all metadata buffers owned by this
8452 * transaction [j_list_lock]
8454 struct journal_head *t_buffers;
8457 - * Doubly-linked circular list of all data buffers still to be
8458 - * flushed before this transaction can be committed [j_list_lock]
8460 - struct journal_head *t_sync_datalist;
8463 * Doubly-linked circular list of all forget buffers (superseded
8464 * buffers which we can un-checkpoint once this transaction commits)
8466 @@ -565,6 +587,12 @@ struct transaction_s
8467 struct journal_head *t_log_list;
8470 + * List of inodes whose data we've modified in data=ordered mode.
8473 + struct list_head t_inode_list;
8476 * Protects info related to handles
8478 spinlock_t t_handle_lock;
8479 @@ -1004,7 +1032,6 @@ extern int jbd2_journal_extend (handle_t *, int nblocks);
8480 extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
8481 extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
8482 extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
8483 -extern int jbd2_journal_dirty_data (handle_t *, struct buffer_head *);
8484 extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
8485 extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *);
8486 extern int jbd2_journal_forget (handle_t *, struct buffer_head *);
8487 @@ -1044,6 +1071,10 @@ extern void jbd2_journal_ack_err (journal_t *);
8488 extern int jbd2_journal_clear_err (journal_t *);
8489 extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
8490 extern int jbd2_journal_force_commit(journal_t *);
8491 +extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
8492 +extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size);
8493 +extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
8494 +extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);
8497 * journal_head management
8498 @@ -1179,15 +1210,13 @@ static inline int jbd_space_needed(journal_t *journal)
8500 /* journaling buffer types */
8501 #define BJ_None 0 /* Not journaled */
8502 -#define BJ_SyncData 1 /* Normal data: flush before commit */
8503 -#define BJ_Metadata 2 /* Normal journaled metadata */
8504 -#define BJ_Forget 3 /* Buffer superseded by this transaction */
8505 -#define BJ_IO 4 /* Buffer is for temporary IO use */
8506 -#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */
8507 -#define BJ_LogCtl 6 /* Buffer contains log descriptors */
8508 -#define BJ_Reserved 7 /* Buffer is reserved for access by journal */
8509 -#define BJ_Locked 8 /* Locked for I/O during commit */
8511 +#define BJ_Metadata 1 /* Normal journaled metadata */
8512 +#define BJ_Forget 2 /* Buffer superseded by this transaction */
8513 +#define BJ_IO 3 /* Buffer is for temporary IO use */
8514 +#define BJ_Shadow 4 /* Buffer contents being shadowed to the log */
8515 +#define BJ_LogCtl 5 /* Buffer contains log descriptors */
8516 +#define BJ_Reserved 6 /* Buffer is reserved for access by journal */
8519 extern int jbd_blocks_per_page(struct inode *inode);
8521 diff --git a/include/linux/mpage.h b/include/linux/mpage.h
8522 index 068a0c9..5c42821 100644
8523 --- a/include/linux/mpage.h
8524 +++ b/include/linux/mpage.h
8529 +struct mpage_data {
8531 + sector_t last_block_in_bio;
8532 + get_block_t *get_block;
8533 + unsigned use_writepage;
8536 struct writeback_control;
8538 +struct bio *mpage_bio_submit(int rw, struct bio *bio);
8539 int mpage_readpages(struct address_space *mapping, struct list_head *pages,
8540 unsigned nr_pages, get_block_t get_block);
8541 int mpage_readpage(struct page *page, get_block_t get_block);
8542 +int __mpage_writepage(struct page *page, struct writeback_control *wbc,
8544 int mpage_writepages(struct address_space *mapping,
8545 struct writeback_control *wbc, get_block_t get_block);
8546 int mpage_writepage(struct page *page, get_block_t *get_block,
8547 diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
8548 index 9007ccd..2083888 100644
8549 --- a/include/linux/percpu_counter.h
8550 +++ b/include/linux/percpu_counter.h
8551 @@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
8552 void percpu_counter_destroy(struct percpu_counter *fbc);
8553 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
8554 void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
8555 -s64 __percpu_counter_sum(struct percpu_counter *fbc);
8556 +s64 __percpu_counter_sum(struct percpu_counter *fbc, int set);
8558 static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
8560 @@ -44,13 +44,19 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
8562 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
8564 - s64 ret = __percpu_counter_sum(fbc);
8565 + s64 ret = __percpu_counter_sum(fbc, 0);
8566 return ret < 0 ? 0 : ret;
8569 +static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc)
8571 + return __percpu_counter_sum(fbc, 1);
8575 static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
8577 - return __percpu_counter_sum(fbc);
8578 + return __percpu_counter_sum(fbc, 0);
8581 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
8582 diff --git a/include/linux/writeback.h b/include/linux/writeback.h
8583 index f462439..0d8573e 100644
8584 --- a/include/linux/writeback.h
8585 +++ b/include/linux/writeback.h
8586 @@ -63,6 +63,7 @@ struct writeback_control {
8587 unsigned for_writepages:1; /* This is a writepages() call */
8588 unsigned range_cyclic:1; /* range_start is cyclic */
8589 unsigned more_io:1; /* more io to be dispatched */
8590 + unsigned range_cont:1;
8594 diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
8595 index 1191744..4a8ba4b 100644
8596 --- a/lib/percpu_counter.c
8597 +++ b/lib/percpu_counter.c
8598 @@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add);
8599 * Add up all the per-cpu counts, return the result. This is a more accurate
8600 * but much slower version of percpu_counter_read_positive()
8602 -s64 __percpu_counter_sum(struct percpu_counter *fbc)
8603 +s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
8607 @@ -62,7 +62,12 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
8608 for_each_online_cpu(cpu) {
8609 s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
8617 spin_unlock(&fbc->lock);
8620 diff --git a/mm/filemap.c b/mm/filemap.c
8621 index 1e6a7d3..65d9d9e 100644
8624 @@ -236,11 +236,12 @@ int filemap_fdatawrite(struct address_space *mapping)
8626 EXPORT_SYMBOL(filemap_fdatawrite);
8628 -static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
8629 +int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
8632 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
8634 +EXPORT_SYMBOL(filemap_fdatawrite_range);
8637 * filemap_flush - mostly a non-blocking flush
8638 diff --git a/mm/page-writeback.c b/mm/page-writeback.c
8639 index 789b6ad..ded57d5 100644
8640 --- a/mm/page-writeback.c
8641 +++ b/mm/page-writeback.c
8642 @@ -956,6 +956,9 @@ retry:
8644 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
8645 mapping->writeback_index = index;
8647 + if (wbc->range_cont)
8648 + wbc->range_start = index << PAGE_CACHE_SHIFT;
8651 EXPORT_SYMBOL(write_cache_pages);