1 Patchset: 2.6.26-ext4-7
3 This patch was created by combining the ext4-pushed-post-2.6.27-rc1.gz
4 patches with the stable patches in 2.6.27-rc3-ext4-1 series.
6 Documentation/filesystems/ext4.txt | 131 ++-
8 fs/ext4/acl.c | 188 ++--
9 fs/ext4/balloc.c | 221 +++--
11 fs/ext4/ext4.h | 64 +-
12 fs/ext4/ext4_extents.h | 5 +-
13 fs/ext4/ext4_i.h | 10 +-
14 fs/ext4/ext4_jbd2.h | 29 +-
15 fs/ext4/ext4_sb.h | 5 +-
16 fs/ext4/extents.c | 277 +++---
17 fs/ext4/file.c | 20 +-
19 fs/ext4/group.h | 2 +-
20 fs/ext4/ialloc.c | 169 +++-
21 fs/ext4/inode.c | 1931 ++++++++++++++++++++++++++++++------
22 fs/ext4/mballoc.c | 744 +++++++++++----
23 fs/ext4/mballoc.h | 10 +-
24 fs/ext4/migrate.c | 3 +-
25 fs/ext4/namei.c | 45 +-
26 fs/ext4/resize.c | 134 ++-
27 fs/ext4/super.c | 451 ++++++---
28 fs/ext4/xattr.c | 4 +-
29 fs/ext4/xattr_trusted.c | 4 +-
30 fs/ext4/xattr_user.c | 4 +-
31 fs/jbd2/checkpoint.c | 1 -
32 fs/jbd2/commit.c | 308 +++----
33 fs/jbd2/journal.c | 54 +-
34 fs/jbd2/transaction.c | 365 +++----
36 include/linux/fs.h | 2 +
37 include/linux/jbd2.h | 73 +-
38 include/linux/mpage.h | 10 +
39 include/linux/percpu_counter.h | 12 +-
40 include/linux/writeback.h | 1 +
41 lib/percpu_counter.c | 7 +-
43 mm/page-writeback.c | 3 +
44 38 files changed, 3822 insertions(+), 1542 deletions(-)
46 diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
47 index 0c5086d..0d53949 100644
48 --- a/Documentation/filesystems/ext4.txt
49 +++ b/Documentation/filesystems/ext4.txt
50 @@ -13,72 +13,99 @@ Mailing list: linux-ext4@vger.kernel.org
51 1. Quick usage instructions:
52 ===========================
54 - - Grab updated e2fsprogs from
55 - ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/
56 - This is a patchset on top of e2fsprogs-1.39, which can be found at
57 + - Compile and install the latest version of e2fsprogs (as of this
58 + writing version 1.41) from:
60 + http://sourceforge.net/project/showfiles.php?group_id=2406
64 ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
66 - - It's still mke2fs -j /dev/hda1
67 + or grab the latest git repository from:
69 + git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
71 + - Note that it is highly important to install the mke2fs.conf file
72 + that comes with the e2fsprogs 1.41.x sources in /etc/mke2fs.conf. If
73 + you have edited the /etc/mke2fs.conf file installed on your system,
74 + you will need to merge your changes with the version from e2fsprogs
77 + - Create a new filesystem using the ext4dev filesystem type:
79 + # mke2fs -t ext4dev /dev/hda1
81 + Or configure an existing ext3 filesystem to support extents and set
82 + the test_fs flag to indicate that it's ok for an in-development
83 + filesystem to touch this filesystem:
85 - - mount /dev/hda1 /wherever -t ext4dev
86 + # tune2fs -O extents -E test_fs /dev/hda1
88 - - To enable extents,
89 + If the filesystem was created with 128 byte inodes, it can be
90 + converted to use 256 byte for greater efficiency via:
92 - mount /dev/hda1 /wherever -t ext4dev -o extents
93 + # tune2fs -I 256 /dev/hda1
95 - - The filesystem is compatible with the ext3 driver until you add a file
96 - which has extents (ie: `mount -o extents', then create a file).
97 + (Note: we currently do not have tools to convert an ext4dev
98 + filesystem back to ext3; so please do not try this on production
101 - NOTE: The "extents" mount flag is temporary. It will soon go away and
102 - extents will be enabled by the "-o extents" flag to mke2fs or tune2fs
105 + # mount -t ext4dev /dev/hda1 /wherever
107 - When comparing performance with other filesystems, remember that
108 - ext3/4 by default offers higher data integrity guarantees than most. So
109 - when comparing with a metadata-only journalling filesystem, use `mount -o
110 - data=writeback'. And you might as well use `mount -o nobh' too along
111 - with it. Making the journal larger than the mke2fs default often helps
112 - performance with metadata-intensive workloads.
113 + ext3/4 by default offers higher data integrity guarantees than most.
114 + So when comparing with a metadata-only journalling filesystem, such
115 + as ext3, use `mount -o data=writeback'. And you might as well use
116 + `mount -o nobh' too along with it. Making the journal larger than
117 + the mke2fs default often helps performance with metadata-intensive
123 2.1 Currently available
125 -* ability to use filesystems > 16TB
126 +* ability to use filesystems > 16TB (e2fsprogs support not available yet)
127 * extent format reduces metadata overhead (RAM, IO for access, transactions)
128 * extent format more robust in face of on-disk corruption due to magics,
129 * internal redunancy in tree
131 -2.1 Previously available, soon to be enabled by default by "mkefs.ext4":
133 -* dir_index and resize inode will be on by default
134 -* large inodes will be used by default for fast EAs, nsec timestamps, etc
135 +* improved file allocation (multi-block alloc)
136 +* fix 32000 subdirectory limit
137 +* nsec timestamps for mtime, atime, ctime, create time
138 +* inode version field on disk (NFSv4, Lustre)
139 +* reduced e2fsck time via uninit_bg feature
140 +* journal checksumming for robustness, performance
141 +* persistent file preallocation (e.g for streaming media, databases)
142 +* ability to pack bitmaps and inode tables into larger virtual groups via the
144 +* large file support
145 +* Inode allocation using large virtual block groups via flex_bg
146 +* delayed allocation
147 +* large block (up to pagesize) support
148 +* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
151 2.2 Candidate features for future inclusion
153 -There are several under discussion, whether they all make it in is
154 -partly a function of how much time everyone has to work on them:
155 +* Online defrag (patches available but not well tested)
156 +* reduced mke2fs time via lazy itable initialization in conjunction with
157 + the uninit_bg feature (capability to do this is available in e2fsprogs
158 + but a kernel thread to do lazy zeroing of unused inode table blocks
159 + after filesystem is first mounted is required for safety)
161 -* improved file allocation (multi-block alloc, delayed alloc; basically done)
162 -* fix 32000 subdirectory limit (patch exists, needs some e2fsck work)
163 -* nsec timestamps for mtime, atime, ctime, create time (patch exists,
164 - needs some e2fsck work)
165 -* inode version field on disk (NFSv4, Lustre; prototype exists)
166 -* reduced mke2fs/e2fsck time via uninitialized groups (prototype exists)
167 -* journal checksumming for robustness, performance (prototype exists)
168 -* persistent file preallocation (e.g for streaming media, databases)
169 +There are several others under discussion, whether they all make it in is
170 +partly a function of how much time everyone has to work on them. Features like
171 +metadata checksumming have been discussed and planned for a bit but no patches
172 +exist yet so I'm not sure they're in the near-term roadmap.
174 -Features like metadata checksumming have been discussed and planned for
175 -a bit but no patches exist yet so I'm not sure they're in the near-term
177 +The big performance win will come with mballoc, delalloc and flex_bg
178 +grouping of bitmaps and inode tables. Some test results available here:
180 -The big performance win will come with mballoc and delalloc. CFS has
181 -been using mballoc for a few years already with Lustre, and IBM + Bull
182 -did a lot of benchmarking on it. The reason it isn't in the first set of
183 -patches is partly a manageability issue, and partly because it doesn't
184 -directly affect the on-disk format (outside of much better allocation)
185 -so it isn't critical to get into the first round of changes. I believe
186 -Alex is working on a new set of patches right now.
187 + - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html
188 + - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html
192 @@ -222,9 +249,11 @@ stripe=n Number of filesystem blocks that mballoc will try
193 to use for allocation size and alignment. For RAID5/6
194 systems this should be the number of data
195 disks * RAID chunk size in file system blocks.
197 +delalloc (*) Deferring block allocation until write-out time.
198 +nodelalloc Disable delayed allocation. Blocks are allocated
199 + when data is copied from user to page cache.
203 There are 3 different data modes:
206 @@ -236,10 +265,10 @@ typically provide the best ext4 performance.
209 In data=ordered mode, ext4 only officially journals metadata, but it logically
210 -groups metadata and data blocks into a single unit called a transaction. When
211 -it's time to write the new metadata out to disk, the associated data blocks
212 -are written first. In general, this mode performs slightly slower than
213 -writeback but significantly faster than journal mode.
214 +groups metadata information related to data changes with the data blocks into a
215 +single unit called a transaction. When it's time to write the new metadata
216 +out to disk, the associated data blocks are written first. In general,
217 +this mode performs slightly slower than writeback but significantly faster than journal mode.
220 data=journal mode provides full data and metadata journaling. All new data is
221 @@ -247,7 +276,8 @@ written to the journal first, and then to its final location.
222 In the event of a crash, the journal can be replayed, bringing both data and
223 metadata into a consistent state. This mode is the slowest except when data
224 needs to be read from and written to disk at the same time where it
225 -outperforms all others modes.
226 +outperforms all other modes. Currently ext4 does not have delayed
227 +allocation support if this data journalling mode is selected.
231 @@ -256,7 +286,8 @@ kernel source: <file:fs/ext4/>
234 programs: http://e2fsprogs.sourceforge.net/
235 - http://ext2resize.sourceforge.net
237 useful links: http://fedoraproject.org/wiki/ext3-devel
238 http://www.bullopensource.org/ext4/
239 + http://ext4.wiki.kernel.org/index.php/Main_Page
240 + http://fedoraproject.org/wiki/Features/Ext4
241 diff --git a/fs/buffer.c b/fs/buffer.c
242 index 0f51c0f..5fa1512 100644
245 @@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
247 clear_buffer_dirty(bh);
248 set_buffer_uptodate(bh);
249 - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
250 + } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
251 + buffer_dirty(bh)) {
252 WARN_ON(bh->b_size != blocksize);
253 err = get_block(inode, block, bh, 1);
256 + clear_buffer_delay(bh);
257 if (buffer_new(bh)) {
258 /* blockdev mappings never come here */
259 clear_buffer_new(bh);
260 @@ -1774,7 +1776,8 @@ recover:
262 /* Recovery: lock and submit the mapped buffers */
264 - if (buffer_mapped(bh) && buffer_dirty(bh)) {
265 + if (buffer_mapped(bh) && buffer_dirty(bh) &&
266 + !buffer_delay(bh)) {
268 mark_buffer_async_write(bh);
270 @@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
271 struct page *page, void *fsdata)
273 struct inode *inode = mapping->host;
274 + int i_size_changed = 0;
276 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
278 @@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
280 if (pos+copied > inode->i_size) {
281 i_size_write(inode, pos+copied);
282 - mark_inode_dirty(inode);
283 + i_size_changed = 1;
287 page_cache_release(page);
290 + * Don't mark the inode dirty under page lock. First, it unnecessarily
291 + * makes the holding time of page lock longer. Second, it forces lock
292 + * ordering of page lock and transaction start for journaling
295 + if (i_size_changed)
296 + mark_inode_dirty(inode);
300 EXPORT_SYMBOL(generic_write_end);
301 diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
302 index 3c8dab8..a234b54 100644
305 @@ -40,34 +40,35 @@ ext4_acl_from_disk(const void *value, size_t size)
306 acl = posix_acl_alloc(count, GFP_NOFS);
308 return ERR_PTR(-ENOMEM);
309 - for (n=0; n < count; n++) {
310 + for (n = 0; n < count; n++) {
311 ext4_acl_entry *entry =
312 (ext4_acl_entry *)value;
313 if ((char *)value + sizeof(ext4_acl_entry_short) > end)
315 acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
316 acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
317 - switch(acl->a_entries[n].e_tag) {
319 - case ACL_GROUP_OBJ:
322 - value = (char *)value +
323 - sizeof(ext4_acl_entry_short);
324 - acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
329 - value = (char *)value + sizeof(ext4_acl_entry);
330 - if ((char *)value > end)
332 - acl->a_entries[n].e_id =
333 - le32_to_cpu(entry->e_id);
338 + switch (acl->a_entries[n].e_tag) {
340 + case ACL_GROUP_OBJ:
343 + value = (char *)value +
344 + sizeof(ext4_acl_entry_short);
345 + acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
350 + value = (char *)value + sizeof(ext4_acl_entry);
351 + if ((char *)value > end)
353 + acl->a_entries[n].e_id =
354 + le32_to_cpu(entry->e_id);
362 @@ -96,27 +97,26 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
363 return ERR_PTR(-ENOMEM);
364 ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
365 e = (char *)ext_acl + sizeof(ext4_acl_header);
366 - for (n=0; n < acl->a_count; n++) {
367 + for (n = 0; n < acl->a_count; n++) {
368 ext4_acl_entry *entry = (ext4_acl_entry *)e;
369 entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
370 entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
371 - switch(acl->a_entries[n].e_tag) {
375 - cpu_to_le32(acl->a_entries[n].e_id);
376 - e += sizeof(ext4_acl_entry);
380 - case ACL_GROUP_OBJ:
383 - e += sizeof(ext4_acl_entry_short);
388 + switch (acl->a_entries[n].e_tag) {
391 + entry->e_id = cpu_to_le32(acl->a_entries[n].e_id);
392 + e += sizeof(ext4_acl_entry);
396 + case ACL_GROUP_OBJ:
399 + e += sizeof(ext4_acl_entry_short);
406 return (char *)ext_acl;
407 @@ -167,23 +167,23 @@ ext4_get_acl(struct inode *inode, int type)
408 if (!test_opt(inode->i_sb, POSIX_ACL))
412 - case ACL_TYPE_ACCESS:
413 - acl = ext4_iget_acl(inode, &ei->i_acl);
414 - if (acl != EXT4_ACL_NOT_CACHED)
416 - name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
419 - case ACL_TYPE_DEFAULT:
420 - acl = ext4_iget_acl(inode, &ei->i_default_acl);
421 - if (acl != EXT4_ACL_NOT_CACHED)
423 - name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
427 - return ERR_PTR(-EINVAL);
429 + case ACL_TYPE_ACCESS:
430 + acl = ext4_iget_acl(inode, &ei->i_acl);
431 + if (acl != EXT4_ACL_NOT_CACHED)
433 + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
436 + case ACL_TYPE_DEFAULT:
437 + acl = ext4_iget_acl(inode, &ei->i_default_acl);
438 + if (acl != EXT4_ACL_NOT_CACHED)
440 + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
444 + return ERR_PTR(-EINVAL);
446 retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
448 @@ -201,14 +201,14 @@ ext4_get_acl(struct inode *inode, int type)
453 - case ACL_TYPE_ACCESS:
454 - ext4_iset_acl(inode, &ei->i_acl, acl);
457 - case ACL_TYPE_DEFAULT:
458 - ext4_iset_acl(inode, &ei->i_default_acl, acl);
461 + case ACL_TYPE_ACCESS:
462 + ext4_iset_acl(inode, &ei->i_acl, acl);
465 + case ACL_TYPE_DEFAULT:
466 + ext4_iset_acl(inode, &ei->i_default_acl, acl);
471 @@ -232,31 +232,31 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
472 if (S_ISLNK(inode->i_mode))
476 - case ACL_TYPE_ACCESS:
477 - name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
479 - mode_t mode = inode->i_mode;
480 - error = posix_acl_equiv_mode(acl, &mode);
484 - inode->i_mode = mode;
485 - ext4_mark_inode_dirty(handle, inode);
490 + case ACL_TYPE_ACCESS:
491 + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
493 + mode_t mode = inode->i_mode;
494 + error = posix_acl_equiv_mode(acl, &mode);
498 + inode->i_mode = mode;
499 + ext4_mark_inode_dirty(handle, inode);
507 - case ACL_TYPE_DEFAULT:
508 - name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
509 - if (!S_ISDIR(inode->i_mode))
510 - return acl ? -EACCES : 0;
512 + case ACL_TYPE_DEFAULT:
513 + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
514 + if (!S_ISDIR(inode->i_mode))
515 + return acl ? -EACCES : 0;
524 value = ext4_acl_to_disk(acl, &size);
525 @@ -269,14 +269,14 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
530 - case ACL_TYPE_ACCESS:
531 - ext4_iset_acl(inode, &ei->i_acl, acl);
534 - case ACL_TYPE_DEFAULT:
535 - ext4_iset_acl(inode, &ei->i_default_acl, acl);
538 + case ACL_TYPE_ACCESS:
539 + ext4_iset_acl(inode, &ei->i_acl, acl);
542 + case ACL_TYPE_DEFAULT:
543 + ext4_iset_acl(inode, &ei->i_default_acl, acl);
548 diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
549 index 9cc80b9..e9fa960 100644
550 --- a/fs/ext4/balloc.c
551 +++ b/fs/ext4/balloc.c
552 @@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
553 ext4_group_t block_group)
555 ext4_group_t actual_group;
556 - ext4_get_group_no_and_offset(sb, block, &actual_group, 0);
557 + ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
558 if (actual_group == block_group)
561 @@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
562 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
564 } else { /* For META_BG_BLOCK_GROUPS */
565 - int group_rel = (block_group -
566 - le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
567 - EXT4_DESC_PER_BLOCK(sb);
568 - if (group_rel == 0 || group_rel == 1 ||
569 - (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
571 + bit_max += ext4_bg_num_gdb(sb, block_group);
574 if (block_group == sbi->s_groups_count - 1) {
575 @@ -295,7 +290,7 @@ err_out:
579 - * read_block_bitmap()
580 + * ext4_read_block_bitmap()
582 * @block_group: given block group
584 @@ -305,7 +300,7 @@ err_out:
585 * Return buffer_head on success or NULL in case of failure.
588 -read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
589 +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
591 struct ext4_group_desc * desc;
592 struct buffer_head * bh = NULL;
593 @@ -319,25 +314,28 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
595 ext4_error(sb, __func__,
596 "Cannot read block bitmap - "
597 - "block_group = %d, block_bitmap = %llu",
598 - (int)block_group, (unsigned long long)bitmap_blk);
599 + "block_group = %lu, block_bitmap = %llu",
600 + block_group, bitmap_blk);
603 if (bh_uptodate_or_lock(bh))
606 + spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
607 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
608 ext4_init_block_bitmap(sb, bh, block_group, desc);
609 set_buffer_uptodate(bh);
611 + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
614 + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
615 if (bh_submit_read(bh) < 0) {
617 ext4_error(sb, __func__,
618 "Cannot read block bitmap - "
619 - "block_group = %d, block_bitmap = %llu",
620 - (int)block_group, (unsigned long long)bitmap_blk);
621 + "block_group = %lu, block_bitmap = %llu",
622 + block_group, bitmap_blk);
625 ext4_valid_block_bitmap(sb, desc, block_group, bh);
626 @@ -409,8 +407,7 @@ restart:
629 printk("Window map complete.\n");
634 #define rsv_window_dump(root, verbose) \
635 __rsv_window_dump((root), (verbose), __func__)
636 @@ -694,7 +691,7 @@ do_more:
640 - bitmap_bh = read_block_bitmap(sb, block_group);
641 + bitmap_bh = ext4_read_block_bitmap(sb, block_group);
644 desc = ext4_get_group_desc (sb, block_group, &gd_bh);
645 @@ -810,6 +807,13 @@ do_more:
646 spin_unlock(sb_bgl_lock(sbi, block_group));
647 percpu_counter_add(&sbi->s_freeblocks_counter, count);
649 + if (sbi->s_log_groups_per_flex) {
650 + ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
651 + spin_lock(sb_bgl_lock(sbi, flex_group));
652 + sbi->s_flex_groups[flex_group].free_blocks += count;
653 + spin_unlock(sb_bgl_lock(sbi, flex_group));
656 /* We dirtied the bitmap block */
657 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
658 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
659 @@ -1598,23 +1602,38 @@ out:
662 * ext4_has_free_blocks()
663 - * @sbi: in-core super block structure.
664 + * @sbi: in-core super block structure.
665 + * @nblocks: number of needed blocks
667 - * Check if filesystem has at least 1 free block available for allocation.
668 + * Check if filesystem has free blocks available for allocation.
669 + * Return the number of blocks available for allocation for this request
670 + * On success, return nblocks
672 -static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
673 +ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
674 + ext4_fsblk_t nblocks)
676 - ext4_fsblk_t free_blocks, root_blocks;
677 + ext4_fsblk_t free_blocks;
678 + ext4_fsblk_t root_blocks = 0;
680 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
681 - root_blocks = ext4_r_blocks_count(sbi->s_es);
682 - if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
684 + if (!capable(CAP_SYS_RESOURCE) &&
685 sbi->s_resuid != current->fsuid &&
686 - (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
687 + (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
688 + root_blocks = ext4_r_blocks_count(sbi->s_es);
690 + if (free_blocks - root_blocks < FBC_BATCH)
692 + percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
694 + if (free_blocks <= root_blocks)
695 + /* we don't have free space */
700 + if (free_blocks - root_blocks < nblocks)
701 + return free_blocks - root_blocks;
707 * ext4_should_retry_alloc()
708 @@ -1630,7 +1649,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
710 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
712 - if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
713 + if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
716 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
717 @@ -1639,20 +1658,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
721 - * ext4_new_blocks_old() -- core block(s) allocation function
722 + * ext4_old_new_blocks() -- core block bitmap based block allocation function
724 * @handle: handle to this transaction
726 * @goal: given target block(filesystem wide)
727 * @count: target number of blocks to allocate
730 - * ext4_new_blocks uses a goal block to assist allocation. It tries to
731 - * allocate block(s) from the block group contains the goal block first. If that
732 - * fails, it will try to allocate block(s) from other block groups without
733 - * any specific goal block.
734 + * ext4_old_new_blocks uses a goal block to assist allocation and look up
735 + * the block bitmap directly to do block allocation. It tries to
736 + * allocate block(s) from the block group contains the goal block first. If
737 + * that fails, it will try to allocate block(s) from other block groups
738 + * without any specific goal block.
740 + * This function is called when -o nomballoc mount option is enabled
743 -ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
744 +ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
745 ext4_fsblk_t goal, unsigned long *count, int *errp)
747 struct buffer_head *bitmap_bh = NULL;
748 @@ -1676,13 +1699,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
749 ext4_group_t ngroups;
750 unsigned long num = *count;
756 printk("ext4_new_block: nonexistent device");
761 + if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
763 + * With delalloc we already reserved the blocks
765 + *count = ext4_has_free_blocks(sbi, *count);
769 + return 0; /*return with ENOSPC error */
774 * Check quota for allocation of this block.
776 @@ -1706,11 +1742,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
777 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
778 my_rsv = &block_i->rsv_window_node;
780 - if (!ext4_has_free_blocks(sbi)) {
786 * First, test whether the goal block is free.
788 @@ -1734,7 +1765,7 @@ retry_alloc:
791 if (free_blocks > 0) {
792 - bitmap_bh = read_block_bitmap(sb, group_no);
793 + bitmap_bh = ext4_read_block_bitmap(sb, group_no);
796 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
797 @@ -1770,7 +1801,7 @@ retry_alloc:
801 - bitmap_bh = read_block_bitmap(sb, group_no);
802 + bitmap_bh = ext4_read_block_bitmap(sb, group_no);
806 @@ -1882,7 +1913,15 @@ allocated:
807 le16_add_cpu(&gdp->bg_free_blocks_count, -num);
808 gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
809 spin_unlock(sb_bgl_lock(sbi, group_no));
810 - percpu_counter_sub(&sbi->s_freeblocks_counter, num);
811 + if (!EXT4_I(inode)->i_delalloc_reserved_flag)
812 + percpu_counter_sub(&sbi->s_freeblocks_counter, num);
814 + if (sbi->s_log_groups_per_flex) {
815 + ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
816 + spin_lock(sb_bgl_lock(sbi, flex_group));
817 + sbi->s_flex_groups[flex_group].free_blocks -= num;
818 + spin_unlock(sb_bgl_lock(sbi, flex_group));
821 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
822 err = ext4_journal_dirty_metadata(handle, gdp_bh);
823 @@ -1915,46 +1954,104 @@ out:
827 -ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
828 - ext4_fsblk_t goal, int *errp)
829 +#define EXT4_META_BLOCK 0x1
831 +static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
832 + ext4_lblk_t iblock, ext4_fsblk_t goal,
833 + unsigned long *count, int *errp, int flags)
835 struct ext4_allocation_request ar;
838 if (!test_opt(inode->i_sb, MBALLOC)) {
839 - unsigned long count = 1;
840 - ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
842 + return ext4_old_new_blocks(handle, inode, goal, count, errp);
845 memset(&ar, 0, sizeof(ar));
846 + /* Fill with neighbour allocated blocks */
852 + ar.logical = iblock;
854 + if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
855 + /* enable in-core preallocation for data block allocation */
856 + ar.flags = EXT4_MB_HINT_DATA;
858 + /* disable in-core preallocation for non-regular files */
861 ret = ext4_mb_new_blocks(handle, &ar, errp);
866 -ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
868 + * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
870 + * @handle: handle to this transaction
871 + * @inode: file inode
872 + * @goal: given target block(filesystem wide)
873 + * @count: total number of blocks need
874 + * @errp: error code
876 + * Return 1st allocated block number on success, *count stores total account
877 + * error stores in errp pointer
879 +ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
880 ext4_fsblk_t goal, unsigned long *count, int *errp)
882 - struct ext4_allocation_request ar;
885 - if (!test_opt(inode->i_sb, MBALLOC)) {
886 - ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
888 + ret = do_blk_alloc(handle, inode, 0, goal,
889 + count, errp, EXT4_META_BLOCK);
891 + * Account for the allocated meta blocks
894 + spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
895 + EXT4_I(inode)->i_allocated_meta_blocks += *count;
896 + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
899 - memset(&ar, 0, sizeof(ar));
903 - ret = ext4_mb_new_blocks(handle, &ar, errp);
909 + * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
911 + * @handle: handle to this transaction
912 + * @inode: file inode
913 + * @goal: given target block(filesystem wide)
914 + * @errp: error code
916 + * Return allocated block number on success
918 +ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
919 + ext4_fsblk_t goal, int *errp)
921 + unsigned long count = 1;
922 + return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
926 + * ext4_new_blocks() -- allocate data blocks
928 + * @handle: handle to this transaction
929 + * @inode: file inode
930 + * @goal: given target block(filesystem wide)
931 + * @count: total number of blocks need
932 + * @errp: error code
934 + * Return 1st allocated block number on success, *count stores total account
935 + * error stores in errp pointer
938 +ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
939 + ext4_lblk_t iblock, ext4_fsblk_t goal,
940 + unsigned long *count, int *errp)
942 + return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
946 * ext4_count_free_blocks() -- count filesystem free blocks
947 @@ -1986,7 +2083,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
949 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
951 - bitmap_bh = read_block_bitmap(sb, i);
952 + bitmap_bh = ext4_read_block_bitmap(sb, i);
953 if (bitmap_bh == NULL)
956 diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
957 index 2bf0331..ec8e33b 100644
960 @@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp,
961 struct buffer_head *bh = NULL;
964 - err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
965 + err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
968 pgoff_t index = map_bh.b_blocknr >>
969 (PAGE_CACHE_SHIFT - inode->i_blkbits);
970 @@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root)
973 /* Do the node's children first */
974 - if ((n)->rb_left) {
979 @@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root)
980 parent->rb_right = NULL;
983 - root->rb_node = NULL;
987 -static struct dir_private_info *create_dir_info(loff_t pos)
988 +static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
990 struct dir_private_info *p;
992 - p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
993 + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
996 - p->root.rb_node = NULL;
997 - p->curr_node = NULL;
998 - p->extra_fname = NULL;
1000 p->curr_hash = pos2maj_hash(pos);
1001 p->curr_minor_hash = pos2min_hash(pos);
1006 @@ -416,7 +411,7 @@ static int call_filldir(struct file * filp, void * dirent,
1007 get_dtype(sb, fname->file_type));
1009 filp->f_pos = curr_pos;
1010 - info->extra_fname = fname->next;
1011 + info->extra_fname = fname;
1014 fname = fname->next;
1015 @@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp,
1019 - info = create_dir_info(filp->f_pos);
1020 + info = ext4_htree_create_dir_info(filp->f_pos);
1023 filp->private_data = info;
1024 @@ -455,11 +450,21 @@ static int ext4_dx_readdir(struct file * filp,
1025 * If there are any leftover names on the hash collision
1026 * chain, return them first.
1028 - if (info->extra_fname &&
1029 - call_filldir(filp, dirent, filldir, info->extra_fname))
1031 + if (info->extra_fname) {
1032 + if (call_filldir(filp, dirent, filldir, info->extra_fname))
1035 - if (!info->curr_node)
1036 + info->extra_fname = NULL;
1037 + info->curr_node = rb_next(info->curr_node);
1038 + if (!info->curr_node) {
1039 + if (info->next_hash == ~0) {
1040 + filp->f_pos = EXT4_HTREE_EOF;
1043 + info->curr_hash = info->next_hash;
1044 + info->curr_minor_hash = 0;
1046 + } else if (!info->curr_node)
1047 info->curr_node = rb_first(&info->root);
1050 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
1051 index 8158083..2950032 100644
1052 --- a/fs/ext4/ext4.h
1053 +++ b/fs/ext4/ext4.h
1058 - * The second extended filesystem constants/structures
1059 + * The fourth extended filesystem constants/structures
1064 #define ext4_debug(f, a...) \
1066 printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
1067 - __FILE__, __LINE__, __FUNCTION__); \
1068 + __FILE__, __LINE__, __func__); \
1069 printk (KERN_DEBUG f, ## a); \
1073 #define EXT4_MB_HINT_GOAL_ONLY 256
1074 /* goal is meaningful */
1075 #define EXT4_MB_HINT_TRY_GOAL 512
1076 +/* blocks already pre-reserved by delayed allocation */
1077 +#define EXT4_MB_DELALLOC_RESERVED 1024
1080 struct ext4_allocation_request {
1081 /* target inode for block we're allocating */
1082 @@ -170,6 +173,15 @@ struct ext4_group_desc
1083 __u32 bg_reserved2[3];
1087 + * Structure of a flex block group info
1090 +struct flex_groups {
1091 + __u32 free_inodes;
1092 + __u32 free_blocks;
1095 #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
1096 #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
1097 #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
1098 @@ -527,6 +539,7 @@ do { \
1099 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
1100 #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
1101 #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
1102 +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
1103 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
1104 #ifndef _LINUX_EXT2_FS_H
1105 #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
1106 @@ -647,7 +660,10 @@ struct ext4_super_block {
1107 __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
1108 __le64 s_mmp_block; /* Block for multi-mount protection */
1109 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
1110 - __u32 s_reserved[163]; /* Padding to the end of the block */
1111 + __u8 s_log_groups_per_flex; /* FLEX_BG group size */
1112 + __u8 s_reserved_char_pad2;
1113 + __le16 s_reserved_pad;
1114 + __u32 s_reserved[162]; /* Padding to the end of the block */
1118 @@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
1119 extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
1120 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
1121 ext4_group_t group);
1122 -extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode,
1123 +extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
1124 ext4_fsblk_t goal, int *errp);
1125 -extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
1126 +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
1127 ext4_fsblk_t goal, unsigned long *count, int *errp);
1128 -extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
1129 +extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1130 + ext4_lblk_t iblock, ext4_fsblk_t goal,
1131 + unsigned long *count, int *errp);
1132 +extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
1133 ext4_fsblk_t goal, unsigned long *count, int *errp);
1134 +extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
1135 + ext4_fsblk_t nblocks);
1136 extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
1137 ext4_fsblk_t block, unsigned long count, int metadata);
1138 extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
1139 @@ -1016,6 +1037,10 @@ extern int __init init_ext4_mballoc(void);
1140 extern void exit_ext4_mballoc(void);
1141 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
1142 unsigned long, unsigned long, int, unsigned long *);
1143 +extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
1144 + ext4_group_t i, struct ext4_group_desc *desc);
1145 +extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
1146 + ext4_grpblk_t add);
1150 @@ -1033,19 +1058,25 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
1151 extern struct inode *ext4_iget(struct super_block *, unsigned long);
1152 extern int ext4_write_inode (struct inode *, int);
1153 extern int ext4_setattr (struct dentry *, struct iattr *);
1154 +extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
1155 + struct kstat *stat);
1156 extern void ext4_delete_inode (struct inode *);
1157 extern int ext4_sync_inode (handle_t *, struct inode *);
1158 extern void ext4_discard_reservation (struct inode *);
1159 extern void ext4_dirty_inode(struct inode *);
1160 extern int ext4_change_inode_journal_flag(struct inode *, int);
1161 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1162 +extern int ext4_can_truncate(struct inode *inode);
1163 extern void ext4_truncate (struct inode *);
1164 extern void ext4_set_inode_flags(struct inode *);
1165 extern void ext4_get_inode_flags(struct ext4_inode_info *);
1166 extern void ext4_set_aops(struct inode *inode);
1167 extern int ext4_writepage_trans_blocks(struct inode *);
1168 -extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
1169 +extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
1170 +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
1171 +extern int ext4_block_truncate_page(handle_t *handle,
1172 struct address_space *mapping, loff_t from);
1173 +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
1176 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1177 @@ -1159,10 +1190,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
1181 +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
1182 + ext4_group_t block_group)
1184 + return block_group >> sbi->s_log_groups_per_flex;
1187 +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
1189 + return 1 << sbi->s_log_groups_per_flex;
1192 #define ext4_std_error(sb, errno) \
1195 - __ext4_std_error((sb), __FUNCTION__, (errno)); \
1196 + __ext4_std_error((sb), __func__, (errno)); \
1200 @@ -1187,11 +1229,13 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
1202 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1203 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1204 +extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1206 extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1208 unsigned long max_blocks, struct buffer_head *bh_result,
1209 int create, int extend_disksize);
1210 -extern void ext4_ext_truncate(struct inode *, struct page *);
1211 +extern void ext4_ext_truncate(struct inode *);
1212 extern void ext4_ext_init(struct super_block *);
1213 extern void ext4_ext_release(struct super_block *);
1214 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1215 @@ -1199,7 +1243,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1216 extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
1217 sector_t block, unsigned long max_blocks,
1218 struct buffer_head *bh, int create,
1219 - int extend_disksize);
1220 + int extend_disksize, int flag);
1221 #endif /* __KERNEL__ */
1223 #endif /* _EXT4_H */
1224 diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
1225 index 75333b5..d33dc56 100644
1226 --- a/fs/ext4/ext4_extents.h
1227 +++ b/fs/ext4/ext4_extents.h
1228 @@ -212,10 +212,13 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
1229 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
1232 +extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
1233 extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
1234 extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
1235 extern int ext4_extent_tree_init(handle_t *, struct inode *);
1236 -extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *);
1237 +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
1239 + struct ext4_ext_path *path);
1240 extern int ext4_ext_try_to_merge(struct inode *inode,
1241 struct ext4_ext_path *path,
1242 struct ext4_extent *);
1243 diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
1244 index 26a4ae2..ef7409f 100644
1245 --- a/fs/ext4/ext4_i.h
1246 +++ b/fs/ext4/ext4_i.h
1247 @@ -79,7 +79,7 @@ struct ext4_ext_cache {
1251 - * third extended file system inode data in memory
1252 + * fourth extended file system inode data in memory
1254 struct ext4_inode_info {
1255 __le32 i_data[15]; /* unconverted */
1256 @@ -150,6 +150,7 @@ struct ext4_inode_info {
1258 struct rw_semaphore i_data_sem;
1259 struct inode vfs_inode;
1260 + struct jbd2_inode jinode;
1262 unsigned long i_ext_generation;
1263 struct ext4_ext_cache i_cached_extent;
1264 @@ -162,6 +163,13 @@ struct ext4_inode_info {
1266 struct list_head i_prealloc_list;
1267 spinlock_t i_prealloc_lock;
1269 + /* allocation reservation info for delalloc */
1270 + unsigned long i_reserved_data_blocks;
1271 + unsigned long i_reserved_meta_blocks;
1272 + unsigned long i_allocated_meta_blocks;
1273 + unsigned short i_delalloc_reserved_flag;
1274 + spinlock_t i_block_reservation_lock;
1277 #endif /* _EXT4_I */
1278 diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
1279 index 9255a7d..b455c68 100644
1280 --- a/fs/ext4/ext4_jbd2.h
1281 +++ b/fs/ext4/ext4_jbd2.h
1283 EXT4_XATTR_TRANS_BLOCKS - 2 + \
1284 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
1287 + * Define the number of metadata blocks we need to account to modify data.
1289 + * This include super block, inode block, quota blocks and xattr blocks
1291 +#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
1292 + 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
1294 /* Delete operations potentially hit one directory's namespace plus an
1295 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
1296 * generous. We can grow the delete transaction later if necessary. */
1297 @@ -142,19 +150,17 @@ int __ext4_journal_dirty_metadata(const char *where,
1298 handle_t *handle, struct buffer_head *bh);
1300 #define ext4_journal_get_undo_access(handle, bh) \
1301 - __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
1302 + __ext4_journal_get_undo_access(__func__, (handle), (bh))
1303 #define ext4_journal_get_write_access(handle, bh) \
1304 - __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
1305 + __ext4_journal_get_write_access(__func__, (handle), (bh))
1306 #define ext4_journal_revoke(handle, blocknr, bh) \
1307 - __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
1308 + __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
1309 #define ext4_journal_get_create_access(handle, bh) \
1310 - __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
1311 + __ext4_journal_get_create_access(__func__, (handle), (bh))
1312 #define ext4_journal_dirty_metadata(handle, bh) \
1313 - __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
1314 + __ext4_journal_dirty_metadata(__func__, (handle), (bh))
1315 #define ext4_journal_forget(handle, bh) \
1316 - __ext4_journal_forget(__FUNCTION__, (handle), (bh))
1318 -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
1319 + __ext4_journal_forget(__func__, (handle), (bh))
1321 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
1322 int __ext4_journal_stop(const char *where, handle_t *handle);
1323 @@ -165,7 +171,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
1326 #define ext4_journal_stop(handle) \
1327 - __ext4_journal_stop(__FUNCTION__, (handle))
1328 + __ext4_journal_stop(__func__, (handle))
1330 static inline handle_t *ext4_journal_current_handle(void)
1332 @@ -192,6 +198,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
1333 return jbd2_journal_force_commit(journal);
1336 +static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
1338 + return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
1342 int ext4_force_commit(struct super_block *sb);
1344 diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
1345 index 5802e69..6300226 100644
1346 --- a/fs/ext4/ext4_sb.h
1347 +++ b/fs/ext4/ext4_sb.h
1349 #include <linux/rbtree.h>
1352 - * third extended-fs super-block data in memory
1353 + * fourth extended-fs super-block data in memory
1355 struct ext4_sb_info {
1356 unsigned long s_desc_size; /* Size of a group descriptor in bytes */
1357 @@ -143,6 +143,9 @@ struct ext4_sb_info {
1359 /* locality groups */
1360 struct ext4_locality_group *s_locality_groups;
1362 + unsigned int s_log_groups_per_flex;
1363 + struct flex_groups *s_flex_groups;
1366 #endif /* _EXT4_SB */
1367 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
1368 index 47929c4..b24d3c5 100644
1369 --- a/fs/ext4/extents.c
1370 +++ b/fs/ext4/extents.c
1371 @@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
1372 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
1375 -static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
1376 +static int ext4_ext_journal_restart(handle_t *handle, int needed)
1380 if (handle->h_buffer_credits > needed)
1382 - if (!ext4_journal_extend(handle, needed))
1384 - err = ext4_journal_restart(handle, needed);
1388 + err = ext4_journal_extend(handle, needed);
1391 + return ext4_journal_restart(handle, needed);
1395 @@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
1396 return bg_start + colour + block;
1400 + * Allocation for a meta data block
1403 -ext4_ext_new_block(handle_t *handle, struct inode *inode,
1404 +ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
1405 struct ext4_ext_path *path,
1406 struct ext4_extent *ex, int *err)
1408 ext4_fsblk_t goal, newblock;
1410 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
1411 - newblock = ext4_new_block(handle, inode, goal, err);
1412 + newblock = ext4_new_meta_block(handle, inode, goal, err);
1416 @@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode)
1421 + * Calculate the number of metadata blocks needed
1422 + * to allocate @blocks
1423 + * Worse case is one block per extent
1425 +int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
1427 + int lcap, icap, rcap, leafs, idxs, num;
1428 + int newextents = blocks;
1430 + rcap = ext4_ext_space_root_idx(inode);
1431 + lcap = ext4_ext_space_block(inode);
1432 + icap = ext4_ext_space_block_idx(inode);
1434 + /* number of new leaf blocks needed */
1435 + num = leafs = (newextents + lcap - 1) / lcap;
1438 + * Worse case, we need separate index block(s)
1439 + * to link all new leaf blocks
1441 + idxs = (leafs + icap - 1) / icap;
1444 + idxs = (idxs + icap - 1) / icap;
1445 + } while (idxs > rcap);
1451 ext4_ext_max_entries(struct inode *inode, int depth)
1453 @@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
1457 + path[0].p_bh = NULL;
1460 /* walk through the tree */
1461 @@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
1464 path[ppos].p_depth = i;
1465 - path[ppos].p_hdr = eh;
1466 path[ppos].p_ext = NULL;
1467 path[ppos].p_idx = NULL;
1470 ext4_ext_binsearch(inode, path + ppos, block);
1471 + /* if not an empty leaf */
1472 + if (path[ppos].p_ext)
1473 + path[ppos].p_block = ext_pblock(path[ppos].p_ext);
1475 ext4_ext_show_path(inode, path);
1477 @@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
1478 /* allocate all needed blocks */
1479 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
1480 for (a = 0; a < depth - at; a++) {
1481 - newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
1482 + newblock = ext4_ext_new_meta_block(handle, inode, path,
1486 ablocks[a] = newblock;
1487 @@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1488 ext4_fsblk_t newblock;
1491 - newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
1492 + newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
1496 @@ -981,6 +1017,8 @@ repeat:
1497 /* if we found index with free entry, then use that
1498 * entry: create all needed subtree and add new leaf */
1499 err = ext4_ext_split(handle, inode, path, newext, i);
1504 ext4_ext_drop_refs(path);
1505 @@ -1403,7 +1441,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
1508 * get the next allocated block if the extent in the path
1509 - * is before the requested block(s)
1510 + * is before the requested block(s)
1513 b2 = ext4_ext_next_allocated_block(path);
1514 @@ -1709,54 +1747,61 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1518 - * ext4_ext_calc_credits_for_insert:
1519 - * This routine returns max. credits that the extent tree can consume.
1520 - * It should be OK for low-performance paths like ->writepage()
1521 - * To allow many writing processes to fit into a single transaction,
1522 - * the caller should calculate credits under i_data_sem and
1523 - * pass the actual path.
1524 + * ext4_ext_calc_credits_for_single_extent:
1525 + * This routine returns max. credits that needed to insert an extent
1526 + * to the extent tree.
1527 + * When pass the actual path, the caller should calculate credits
1528 + * under i_data_sem.
1530 -int ext4_ext_calc_credits_for_insert(struct inode *inode,
1531 +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
1532 struct ext4_ext_path *path)
1534 - int depth, needed;
1537 + int depth = ext_depth(inode);
1540 /* probably there is space in leaf? */
1541 - depth = ext_depth(inode);
1542 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
1543 - < le16_to_cpu(path[depth].p_hdr->eh_max))
1546 + < le16_to_cpu(path[depth].p_hdr->eh_max)) {
1549 - * given 32-bit logical block (4294967296 blocks), max. tree
1550 - * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
1551 - * Let's also add one more level for imbalance.
1555 - /* allocation of new data block(s) */
1558 + * There are some space in the leaf tree, no
1559 + * need to account for leaf block credit
1561 + * bitmaps and block group descriptor blocks
1562 + * and other metadat blocks still need to be
1565 + /* 1 bitmap, 1 block group descriptor */
1566 + ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
1571 - * tree can be full, so it would need to grow in depth:
1572 - * we need one credit to modify old root, credits for
1573 - * new root will be added in split accounting
1576 + return ext4_chunk_trans_blocks(inode, nrblocks);
1580 - * Index split can happen, we would need:
1581 - * allocate intermediate indexes (bitmap + group)
1582 - * + change two blocks at each level, but root (already included)
1584 - needed += (depth * 2) + (depth * 2);
1586 + * How many index/leaf blocks need to change/allocate to modify nrblocks?
1588 + * if nrblocks are fit in a single extent (chunk flag is 1), then
1589 + * in the worse case, each tree level index/leaf need to be changed
1590 + * if the tree split due to insert a new extent, then the old tree
1591 + * index/leaf need to be updated too
1593 + * If the nrblocks are discontiguous, they could cause
1594 + * the whole tree split more than once, but this is really rare.
1596 +int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
1599 + int depth = ext_depth(inode);
1601 - /* any allocation modifies superblock */
1604 + index = depth * 2;
1606 + index = depth * 3;
1612 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1613 @@ -1872,22 +1917,22 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1614 BUG_ON(b != ex_ee_block + ex_ee_len - 1);
1617 - /* at present, extent can't cross block group: */
1618 - /* leaf + bitmap + group desc + sb + inode */
1621 + * 3 for leaf, sb, and inode plus 2 (bmap and group
1622 + * descriptor) for each block group; assume two block
1623 + * groups plus ex_ee_len/blocks_per_block_group for
1626 + credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
1627 if (ex == EXT_FIRST_EXTENT(eh)) {
1629 credits += (ext_depth(inode)) + 1;
1631 -#ifdef CONFIG_QUOTA
1632 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1635 - handle = ext4_ext_journal_restart(handle, credits);
1636 - if (IS_ERR(handle)) {
1637 - err = PTR_ERR(handle);
1638 + err = ext4_ext_journal_restart(handle, credits);
1643 err = ext4_ext_get_access(handle, inode, path + depth);
1645 @@ -2287,7 +2332,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1646 unsigned int newdepth;
1647 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
1648 if (allocated <= EXT4_EXT_ZERO_LEN) {
1649 - /* Mark first half uninitialized.
1651 + * iblock == ee_block is handled by the zerouout
1652 + * at the beginning.
1653 + * Mark first half uninitialized.
1654 * Mark second half initialized and zero out the
1655 * initialized extent
1657 @@ -2310,7 +2358,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1658 ex->ee_len = orig_ex.ee_len;
1659 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
1660 ext4_ext_dirty(handle, inode, path + depth);
1661 - /* zeroed the full extent */
1662 + /* blocks available from iblock */
1666 @@ -2338,6 +2386,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1667 err = PTR_ERR(path);
1670 + /* get the second half extent details */
1671 ex = path[depth].p_ext;
1672 err = ext4_ext_get_access(handle, inode,
1674 @@ -2367,6 +2416,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1675 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
1676 ext4_ext_dirty(handle, inode, path + depth);
1677 /* zeroed the full extent */
1678 + /* blocks available from iblock */
1682 @@ -2382,23 +2432,22 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1684 orig_ex.ee_len = cpu_to_le16(ee_len -
1685 ext4_ext_get_actual_len(ex3));
1686 - if (newdepth != depth) {
1688 - ext4_ext_drop_refs(path);
1689 - path = ext4_ext_find_extent(inode, iblock, path);
1690 - if (IS_ERR(path)) {
1691 - err = PTR_ERR(path);
1694 - eh = path[depth].p_hdr;
1695 - ex = path[depth].p_ext;
1696 - if (ex2 != &newex)
1699 - err = ext4_ext_get_access(handle, inode, path + depth);
1703 + ext4_ext_drop_refs(path);
1704 + path = ext4_ext_find_extent(inode, iblock, path);
1705 + if (IS_ERR(path)) {
1706 + err = PTR_ERR(path);
1709 + eh = path[depth].p_hdr;
1710 + ex = path[depth].p_ext;
1711 + if (ex2 != &newex)
1714 + err = ext4_ext_get_access(handle, inode, path + depth);
1718 allocated = max_blocks;
1720 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
1721 @@ -2416,6 +2465,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1722 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
1723 ext4_ext_dirty(handle, inode, path + depth);
1724 /* zero out the first half */
1725 + /* blocks available from iblock */
1729 @@ -2529,6 +2579,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1730 int err = 0, depth, ret;
1731 unsigned long allocated = 0;
1732 struct ext4_allocation_request ar;
1735 __clear_bit(BH_New, &bh_result->b_state);
1736 ext_debug("blocks %u/%lu requested for inode %u\n",
1737 @@ -2616,8 +2667,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1739 if (allocated > max_blocks)
1740 allocated = max_blocks;
1741 - /* mark the buffer unwritten */
1742 - __set_bit(BH_Unwritten, &bh_result->b_state);
1743 + set_buffer_unwritten(bh_result);
1747 @@ -2716,14 +2766,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1751 - if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
1752 - EXT4_I(inode)->i_disksize = inode->i_size;
1754 /* previous routine could use block we allocated */
1755 newblock = ext_pblock(&newex);
1756 allocated = ext4_ext_get_actual_len(&newex);
1758 - __set_bit(BH_New, &bh_result->b_state);
1759 + if (extend_disksize) {
1760 + disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
1761 + if (disksize > i_size_read(inode))
1762 + disksize = i_size_read(inode);
1763 + if (disksize > EXT4_I(inode)->i_disksize)
1764 + EXT4_I(inode)->i_disksize = disksize;
1767 + set_buffer_new(bh_result);
1769 /* Cache only when it is _not_ an uninitialized extent */
1770 if (create != EXT4_CREATE_UNINITIALIZED_EXT)
1771 @@ -2733,7 +2788,7 @@ out:
1772 if (allocated > max_blocks)
1773 allocated = max_blocks;
1774 ext4_ext_show_leaf(inode, path);
1775 - __set_bit(BH_Mapped, &bh_result->b_state);
1776 + set_buffer_mapped(bh_result);
1777 bh_result->b_bdev = inode->i_sb->s_bdev;
1778 bh_result->b_blocknr = newblock;
1780 @@ -2744,7 +2799,7 @@ out2:
1781 return err ? err : allocated;
1784 -void ext4_ext_truncate(struct inode * inode, struct page *page)
1785 +void ext4_ext_truncate(struct inode *inode)
1787 struct address_space *mapping = inode->i_mapping;
1788 struct super_block *sb = inode->i_sb;
1789 @@ -2755,33 +2810,27 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
1791 * probably first extent we're gonna free will be last in block
1793 - err = ext4_writepage_trans_blocks(inode) + 3;
1794 + err = ext4_writepage_trans_blocks(inode);
1795 handle = ext4_journal_start(inode, err);
1796 - if (IS_ERR(handle)) {
1798 - clear_highpage(page);
1799 - flush_dcache_page(page);
1800 - unlock_page(page);
1801 - page_cache_release(page);
1803 + if (IS_ERR(handle))
1808 - ext4_block_truncate_page(handle, page, mapping, inode->i_size);
1809 + if (inode->i_size & (sb->s_blocksize - 1))
1810 + ext4_block_truncate_page(handle, mapping, inode->i_size);
1812 + if (ext4_orphan_add(handle, inode))
1815 down_write(&EXT4_I(inode)->i_data_sem);
1816 ext4_ext_invalidate_cache(inode);
1818 - ext4_mb_discard_inode_preallocations(inode);
1819 + ext4_discard_reservation(inode);
1822 * TODO: optimization is possible here.
1823 * Probably we need not scan at all,
1824 * because page truncation is enough.
1826 - if (ext4_orphan_add(handle, inode))
1829 /* we have to know where to truncate from in crash case */
1830 EXT4_I(inode)->i_disksize = inode->i_size;
1831 @@ -2798,6 +2847,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
1835 + up_write(&EXT4_I(inode)->i_data_sem);
1837 * If this was a simple ftruncate() and the file will remain alive,
1838 * then we need to clear up the orphan record which we created above.
1839 @@ -2808,33 +2858,11 @@ out_stop:
1841 ext4_orphan_del(handle, inode);
1843 - up_write(&EXT4_I(inode)->i_data_sem);
1844 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
1845 ext4_mark_inode_dirty(handle, inode);
1846 ext4_journal_stop(handle);
1850 - * ext4_ext_writepage_trans_blocks:
1851 - * calculate max number of blocks we could modify
1852 - * in order to allocate new block for an inode
1854 -int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
1858 - needed = ext4_ext_calc_credits_for_insert(inode, NULL);
1860 - /* caller wants to allocate num blocks, but note it includes sb */
1861 - needed = needed * num - (num - 1);
1863 -#ifdef CONFIG_QUOTA
1864 - needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1870 static void ext4_falloc_update_inode(struct inode *inode,
1871 int mode, loff_t new_size, int update_ctime)
1873 @@ -2895,10 +2923,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
1874 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
1877 - * credits to insert 1 extent into extent tree + buffers to be able to
1878 - * modify 1 super block, 1 block bitmap and 1 group descriptor.
1879 + * credits to insert 1 extent into extent tree
1881 - credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
1882 + credits = ext4_chunk_trans_blocks(inode, max_blocks);
1883 mutex_lock(&inode->i_mutex);
1885 while (ret >= 0 && ret < max_blocks) {
1886 @@ -2911,7 +2938,7 @@ retry:
1888 ret = ext4_get_blocks_wrap(handle, inode, block,
1889 max_blocks, &map_bh,
1890 - EXT4_CREATE_UNINITIALIZED_EXT, 0);
1891 + EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
1895 diff --git a/fs/ext4/file.c b/fs/ext4/file.c
1896 index 4159be6..430eb79 100644
1897 --- a/fs/ext4/file.c
1898 +++ b/fs/ext4/file.c
1899 @@ -123,6 +123,23 @@ force_commit:
1903 +static struct vm_operations_struct ext4_file_vm_ops = {
1904 + .fault = filemap_fault,
1905 + .page_mkwrite = ext4_page_mkwrite,
1908 +static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
1910 + struct address_space *mapping = file->f_mapping;
1912 + if (!mapping->a_ops->readpage)
1914 + file_accessed(file);
1915 + vma->vm_ops = &ext4_file_vm_ops;
1916 + vma->vm_flags |= VM_CAN_NONLINEAR;
1920 const struct file_operations ext4_file_operations = {
1921 .llseek = generic_file_llseek,
1922 .read = do_sync_read,
1923 @@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = {
1924 #ifdef CONFIG_COMPAT
1925 .compat_ioctl = ext4_compat_ioctl,
1927 - .mmap = generic_file_mmap,
1928 + .mmap = ext4_file_mmap,
1929 .open = generic_file_open,
1930 .release = ext4_release_file,
1931 .fsync = ext4_sync_file,
1932 @@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = {
1933 const struct inode_operations ext4_file_inode_operations = {
1934 .truncate = ext4_truncate,
1935 .setattr = ext4_setattr,
1936 + .getattr = ext4_getattr,
1937 #ifdef CONFIG_EXT4DEV_FS_XATTR
1938 .setxattr = generic_setxattr,
1939 .getxattr = generic_getxattr,
1940 diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
1941 index 1c8ba48..a45c373 100644
1942 --- a/fs/ext4/fsync.c
1943 +++ b/fs/ext4/fsync.c
1945 #include <linux/sched.h>
1946 #include <linux/writeback.h>
1947 #include <linux/jbd2.h>
1948 +#include <linux/blkdev.h>
1950 #include "ext4_jbd2.h"
1953 int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
1955 struct inode *inode = dentry->d_inode;
1956 + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
1959 J_ASSERT(ext4_journal_current_handle() == NULL);
1960 @@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
1961 .nr_to_write = 0, /* sys_fsync did this */
1963 ret = sync_inode(inode, &wbc);
1964 + if (journal && (journal->j_flags & JBD2_BARRIER))
1965 + blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
1969 diff --git a/fs/ext4/group.h b/fs/ext4/group.h
1970 index 7eb0604..c2c0a8d 100644
1971 --- a/fs/ext4/group.h
1972 +++ b/fs/ext4/group.h
1973 @@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
1974 struct ext4_group_desc *gdp);
1975 extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
1976 struct ext4_group_desc *gdp);
1977 -struct buffer_head *read_block_bitmap(struct super_block *sb,
1978 +struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
1979 ext4_group_t block_group);
1980 extern unsigned ext4_init_block_bitmap(struct super_block *sb,
1981 struct buffer_head *bh,
1982 diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
1983 index c6efbab..f344834 100644
1984 --- a/fs/ext4/ialloc.c
1985 +++ b/fs/ext4/ialloc.c
1986 @@ -97,34 +97,44 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
1987 * Return buffer_head of bitmap on success or NULL.
1989 static struct buffer_head *
1990 -read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
1991 +ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
1993 struct ext4_group_desc *desc;
1994 struct buffer_head *bh = NULL;
1995 + ext4_fsblk_t bitmap_blk;
1997 desc = ext4_get_group_desc(sb, block_group, NULL);
2001 + bitmap_blk = ext4_inode_bitmap(sb, desc);
2002 + bh = sb_getblk(sb, bitmap_blk);
2003 + if (unlikely(!bh)) {
2004 + ext4_error(sb, __func__,
2005 + "Cannot read inode bitmap - "
2006 + "block_group = %lu, inode_bitmap = %llu",
2007 + block_group, bitmap_blk);
2010 + if (bh_uptodate_or_lock(bh))
2013 + spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
2014 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
2015 - bh = sb_getblk(sb, ext4_inode_bitmap(sb, desc));
2016 - if (!buffer_uptodate(bh)) {
2018 - if (!buffer_uptodate(bh)) {
2019 - ext4_init_inode_bitmap(sb, bh, block_group,
2021 - set_buffer_uptodate(bh);
2023 - unlock_buffer(bh);
2026 - bh = sb_bread(sb, ext4_inode_bitmap(sb, desc));
2027 + ext4_init_inode_bitmap(sb, bh, block_group, desc);
2028 + set_buffer_uptodate(bh);
2029 + unlock_buffer(bh);
2030 + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
2034 - ext4_error(sb, "read_inode_bitmap",
2035 + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
2036 + if (bh_submit_read(bh) < 0) {
2038 + ext4_error(sb, __func__,
2039 "Cannot read inode bitmap - "
2040 "block_group = %lu, inode_bitmap = %llu",
2041 - block_group, ext4_inode_bitmap(sb, desc));
2043 + block_group, bitmap_blk);
2049 @@ -157,6 +167,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
2050 struct ext4_super_block * es;
2051 struct ext4_sb_info *sbi;
2053 + ext4_group_t flex_group;
2055 if (atomic_read(&inode->i_count) > 1) {
2056 printk ("ext4_free_inode: inode has count=%d\n",
2057 @@ -199,7 +210,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
2059 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
2060 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
2061 - bitmap_bh = read_inode_bitmap(sb, block_group);
2062 + bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
2066 @@ -232,6 +243,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
2068 percpu_counter_dec(&sbi->s_dirs_counter);
2070 + if (sbi->s_log_groups_per_flex) {
2071 + flex_group = ext4_flex_group(sbi, block_group);
2072 + spin_lock(sb_bgl_lock(sbi, flex_group));
2073 + sbi->s_flex_groups[flex_group].free_inodes++;
2074 + spin_unlock(sb_bgl_lock(sbi, flex_group));
2077 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
2078 err = ext4_journal_dirty_metadata(handle, bh2);
2079 @@ -286,6 +303,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
2083 +#define free_block_ratio 10
2085 +static int find_group_flex(struct super_block *sb, struct inode *parent,
2086 + ext4_group_t *best_group)
2088 + struct ext4_sb_info *sbi = EXT4_SB(sb);
2089 + struct ext4_group_desc *desc;
2090 + struct buffer_head *bh;
2091 + struct flex_groups *flex_group = sbi->s_flex_groups;
2092 + ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
2093 + ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
2094 + ext4_group_t ngroups = sbi->s_groups_count;
2095 + int flex_size = ext4_flex_bg_size(sbi);
2096 + ext4_group_t best_flex = parent_fbg_group;
2097 + int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
2098 + int flexbg_free_blocks;
2099 + int flex_freeb_ratio;
2100 + ext4_group_t n_fbg_groups;
2103 + n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
2104 + sbi->s_log_groups_per_flex;
2106 +find_close_to_parent:
2107 + flexbg_free_blocks = flex_group[best_flex].free_blocks;
2108 + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
2109 + if (flex_group[best_flex].free_inodes &&
2110 + flex_freeb_ratio > free_block_ratio)
2111 + goto found_flexbg;
2113 + if (best_flex && best_flex == parent_fbg_group) {
2115 + goto find_close_to_parent;
2118 + for (i = 0; i < n_fbg_groups; i++) {
2119 + if (i == parent_fbg_group || i == parent_fbg_group - 1)
2122 + flexbg_free_blocks = flex_group[i].free_blocks;
2123 + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
2125 + if (flex_freeb_ratio > free_block_ratio &&
2126 + flex_group[i].free_inodes) {
2128 + goto found_flexbg;
2131 + if (flex_group[best_flex].free_inodes == 0 ||
2132 + (flex_group[i].free_blocks >
2133 + flex_group[best_flex].free_blocks &&
2134 + flex_group[i].free_inodes))
2138 + if (!flex_group[best_flex].free_inodes ||
2139 + !flex_group[best_flex].free_blocks)
2143 + for (i = best_flex * flex_size; i < ngroups &&
2144 + i < (best_flex + 1) * flex_size; i++) {
2145 + desc = ext4_get_group_desc(sb, i, &bh);
2146 + if (le16_to_cpu(desc->bg_free_inodes_count)) {
2158 * Orlov's allocator for directories.
2160 @@ -501,6 +592,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
2164 + ext4_group_t flex_group;
2166 /* Cannot create files in a deleted directory */
2167 if (!dir || !dir->i_nlink)
2168 @@ -514,6 +606,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
2173 + if (sbi->s_log_groups_per_flex) {
2174 + ret2 = find_group_flex(sb, dir, &group);
2178 if (S_ISDIR(mode)) {
2179 if (test_opt (sb, OLDALLOC))
2180 ret2 = find_group_dir(sb, dir, &group);
2181 @@ -522,6 +620,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
2183 ret2 = find_group_other(sb, dir, &group);
2189 @@ -534,7 +633,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
2193 - bitmap_bh = read_inode_bitmap(sb, group);
2194 + bitmap_bh = ext4_read_inode_bitmap(sb, group);
2198 @@ -600,7 +699,7 @@ got:
2199 /* We may have to initialize the block bitmap if it isn't already */
2200 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
2201 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2202 - struct buffer_head *block_bh = read_block_bitmap(sb, group);
2203 + struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
2205 BUFFER_TRACE(block_bh, "get block bitmap access");
2206 err = ext4_journal_get_write_access(handle, block_bh);
2207 @@ -639,7 +738,7 @@ got:
2209 /* When marking the block group with
2210 * ~EXT4_BG_INODE_UNINIT we don't want to depend
2211 - * on the value of bg_itable_unsed even though
2212 + * on the value of bg_itable_unused even though
2213 * mke2fs could have initialized the same for us.
2214 * Instead we calculated the value below
2216 @@ -676,6 +775,13 @@ got:
2217 percpu_counter_inc(&sbi->s_dirs_counter);
2220 + if (sbi->s_log_groups_per_flex) {
2221 + flex_group = ext4_flex_group(sbi, group);
2222 + spin_lock(sb_bgl_lock(sbi, flex_group));
2223 + sbi->s_flex_groups[flex_group].free_inodes--;
2224 + spin_unlock(sb_bgl_lock(sbi, flex_group));
2227 inode->i_uid = current->fsuid;
2228 if (test_opt (sb, GRPID))
2229 inode->i_gid = dir->i_gid;
2230 @@ -740,14 +846,10 @@ got:
2231 goto fail_free_drop;
2233 if (test_opt(sb, EXTENTS)) {
2234 - /* set extent flag only for diretory, file and normal symlink*/
2235 + /* set extent flag only for directory, file and normal symlink*/
2236 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
2237 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
2238 ext4_ext_tree_init(handle, inode);
2239 - err = ext4_update_incompat_feature(handle, sb,
2240 - EXT4_FEATURE_INCOMPAT_EXTENTS);
2242 - goto fail_free_drop;
2246 @@ -799,7 +901,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
2248 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
2249 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
2250 - bitmap_bh = read_inode_bitmap(sb, block_group);
2251 + bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
2253 ext4_warning(sb, __func__,
2254 "inode bitmap error for orphan %lu", ino);
2255 @@ -817,6 +919,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
2260 + * If the orphan has i_nlink > 0 then it should be able to be
2261 + * truncated, otherwise it won't be removed from the orphan list
2262 + * during processing and an infinite loop will result.
2264 + if (inode->i_nlink && !ext4_can_truncate(inode))
2267 if (NEXT_ORPHAN(inode) > max_ino)
2270 @@ -838,6 +948,7 @@ bad_orphan:
2271 printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
2272 NEXT_ORPHAN(inode));
2273 printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
2274 + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
2275 /* Avoid freeing blocks if we got a bad deleted inode */
2276 if (inode->i_nlink == 0)
2277 inode->i_blocks = 0;
2278 @@ -868,7 +979,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
2280 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
2282 - bitmap_bh = read_inode_bitmap(sb, i);
2283 + bitmap_bh = ext4_read_inode_bitmap(sb, i);
2287 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
2288 index 8d97077..3c0195a 100644
2289 --- a/fs/ext4/inode.c
2290 +++ b/fs/ext4/inode.c
2292 #include <linux/string.h>
2293 #include <linux/buffer_head.h>
2294 #include <linux/writeback.h>
2295 +#include <linux/pagevec.h>
2296 #include <linux/mpage.h>
2297 #include <linux/uio.h>
2298 #include <linux/bio.h>
2299 #include "ext4_jbd2.h"
2302 +#include "ext4_extents.h"
2304 +#define MPAGE_DA_EXTENT_TAIL 0x01
2306 +static inline int ext4_begin_ordered_truncate(struct inode *inode,
2309 + return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
2313 +static void ext4_invalidatepage(struct page *page, unsigned long offset);
2316 * Test whether an inode is a fast symlink.
2317 @@ -180,14 +193,18 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
2318 void ext4_delete_inode (struct inode * inode)
2323 + if (ext4_should_order_data(inode))
2324 + ext4_begin_ordered_truncate(inode, 0);
2325 truncate_inode_pages(&inode->i_data, 0);
2327 if (is_bad_inode(inode))
2330 - handle = start_transaction(inode);
2331 + handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
2332 if (IS_ERR(handle)) {
2333 + ext4_std_error(inode->i_sb, PTR_ERR(handle));
2335 * If we're going to skip the normal cleanup, we still need to
2336 * make sure that the in-core orphan linked list is properly
2337 @@ -200,8 +217,34 @@ void ext4_delete_inode (struct inode * inode)
2341 + err = ext4_mark_inode_dirty(handle, inode);
2343 + ext4_warning(inode->i_sb, __func__,
2344 + "couldn't mark inode dirty (err %d)", err);
2347 if (inode->i_blocks)
2348 ext4_truncate(inode);
2351 + * ext4_ext_truncate() doesn't reserve any slop when it
2352 + * restarts journal transactions; therefore there may not be
2353 + * enough credits left in the handle to remove the inode from
2354 + * the orphan list and set the dtime field.
2356 + if (handle->h_buffer_credits < 3) {
2357 + err = ext4_journal_extend(handle, 3);
2359 + err = ext4_journal_restart(handle, 3);
2361 + ext4_warning(inode->i_sb, __func__,
2362 + "couldn't extend journal (err %d)", err);
2364 + ext4_journal_stop(handle);
2370 * Kill off the orphan record which ext4_truncate created.
2371 * AKPM: I think this can be inside the above `if'.
2372 @@ -508,11 +551,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
2375 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
2376 - ext4_fsblk_t goal, int indirect_blks, int blks,
2377 - ext4_fsblk_t new_blocks[4], int *err)
2378 + ext4_lblk_t iblock, ext4_fsblk_t goal,
2379 + int indirect_blks, int blks,
2380 + ext4_fsblk_t new_blocks[4], int *err)
2383 - unsigned long count = 0;
2384 + unsigned long count = 0, blk_allocated = 0;
2386 ext4_fsblk_t current_block = 0;
2388 @@ -525,12 +569,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
2389 * the first direct block of this branch. That's the
2390 * minimum number of blocks need to allocate(required)
2392 - target = blks + indirect_blks;
2395 + /* first we try to allocate the indirect blocks */
2396 + target = indirect_blks;
2397 + while (target > 0) {
2399 /* allocating blocks for indirect blocks and direct blocks */
2400 - current_block = ext4_new_blocks(handle,inode,goal,&count,err);
2401 + current_block = ext4_new_meta_blocks(handle, inode,
2402 + goal, &count, err);
2406 @@ -540,16 +585,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
2407 new_blocks[index++] = current_block++;
2414 + * save the new block number
2415 + * for the first direct block
2417 + new_blocks[index] = current_block;
2418 + printk(KERN_INFO "%s returned more blocks than "
2419 + "requested\n", __func__);
2425 - /* save the new block number for the first direct block */
2426 - new_blocks[index] = current_block;
2428 + target = blks - count ;
2429 + blk_allocated = count;
2432 + /* Now allocate data blocks */
2434 + /* allocating blocks for data blocks */
2435 + current_block = ext4_new_blocks(handle, inode, iblock,
2436 + goal, &count, err);
2437 + if (*err && (target == blks)) {
2439 + * if the allocation failed and we didn't allocate
2440 + * any blocks before
2445 + if (target == blks) {
2447 + * save the new block number
2448 + * for the first direct block
2450 + new_blocks[index] = current_block;
2452 + blk_allocated += count;
2455 /* total number of blocks allocated for direct blocks */
2457 + ret = blk_allocated;
2461 @@ -584,8 +661,9 @@ failed_out:
2462 * as described above and return 0.
2464 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
2465 - int indirect_blks, int *blks, ext4_fsblk_t goal,
2466 - ext4_lblk_t *offsets, Indirect *branch)
2467 + ext4_lblk_t iblock, int indirect_blks,
2468 + int *blks, ext4_fsblk_t goal,
2469 + ext4_lblk_t *offsets, Indirect *branch)
2471 int blocksize = inode->i_sb->s_blocksize;
2473 @@ -595,7 +673,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
2474 ext4_fsblk_t new_blocks[4];
2475 ext4_fsblk_t current_block;
2477 - num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
2478 + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
2479 *blks, new_blocks, &err);
2482 @@ -799,6 +877,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
2483 struct ext4_inode_info *ei = EXT4_I(inode);
2485 ext4_fsblk_t first_block = 0;
2489 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
2490 @@ -855,8 +934,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
2492 * Block out ext4_truncate while we alter the tree
2494 - err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
2495 - offsets + (partial - chain), partial);
2496 + err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
2498 + offsets + (partial - chain), partial);
2501 * The ext4_splice_branch call will free and forget any buffers
2502 @@ -873,8 +953,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
2503 * protect it if you're about to implement concurrent
2504 * ext4_get_block() -bzzz
2506 - if (!err && extend_disksize && inode->i_size > ei->i_disksize)
2507 - ei->i_disksize = inode->i_size;
2508 + if (!err && extend_disksize) {
2509 + disksize = ((loff_t) iblock + count) << inode->i_blkbits;
2510 + if (disksize > i_size_read(inode))
2511 + disksize = i_size_read(inode);
2512 + if (disksize > ei->i_disksize)
2513 + ei->i_disksize = disksize;
2518 @@ -897,23 +982,74 @@ out:
2522 -/* Maximum number of blocks we map for direct IO at once. */
2523 -#define DIO_MAX_BLOCKS 4096
2525 - * Number of credits we need for writing DIO_MAX_BLOCKS:
2526 - * We need sb + group descriptor + bitmap + inode -> 4
2527 - * For B blocks with A block pointers per block we need:
2528 - * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
2529 - * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
2530 + * Calculate the number of metadata blocks need to reserve
2531 + * to allocate @blocks for non extent file based file
2533 -#define DIO_CREDITS 25
2534 +static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
2536 + int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
2537 + int ind_blks, dind_blks, tind_blks;
2539 + /* number of new indirect blocks needed */
2540 + ind_blks = (blocks + icap - 1) / icap;
2542 + dind_blks = (ind_blks + icap - 1) / icap;
2546 + return ind_blks + dind_blks + tind_blks;
2550 + * Calculate the number of metadata blocks need to reserve
2551 + * to allocate given number of blocks
2553 +static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
2558 + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
2559 + return ext4_ext_calc_metadata_amount(inode, blocks);
2561 + return ext4_indirect_calc_metadata_amount(inode, blocks);
2564 +static void ext4_da_update_reserve_space(struct inode *inode, int used)
2566 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2567 + int total, mdb, mdb_free;
2569 + spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2570 + /* recalculate the number of metablocks still need to be reserved */
2571 + total = EXT4_I(inode)->i_reserved_data_blocks - used;
2572 + mdb = ext4_calc_metadata_amount(inode, total);
2574 + /* figure out how many metablocks to release */
2575 + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
2576 + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
2578 + /* Account for allocated meta_blocks */
2579 + mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
2581 + /* update fs free blocks counter for truncate case */
2582 + percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free);
2584 + /* update per-inode reservations */
2585 + BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
2586 + EXT4_I(inode)->i_reserved_data_blocks -= used;
2588 + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
2589 + EXT4_I(inode)->i_reserved_meta_blocks = mdb;
2590 + EXT4_I(inode)->i_allocated_meta_blocks = 0;
2591 + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2595 + * The ext4_get_blocks_wrap() function try to look up the requested blocks,
2596 + * and returns if the blocks are already mapped.
2599 - * ext4_ext4 get_block() wrapper function
2600 - * It will do a look up first, and returns if the blocks already mapped.
2601 * Otherwise it takes the write lock of the i_data_sem and allocate blocks
2602 * and store the allocated blocks in the result buffer head and mark it
2604 @@ -934,7 +1070,7 @@ out:
2606 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
2607 unsigned long max_blocks, struct buffer_head *bh,
2608 - int create, int extend_disksize)
2609 + int create, int extend_disksize, int flag)
2613 @@ -975,6 +1111,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
2614 * with create == 1 flag.
2616 down_write((&EXT4_I(inode)->i_data_sem));
2619 + * if the caller is from delayed allocation writeout path
2620 + * we have already reserved fs blocks for allocation
2621 + * let the underlying get_block() function know to
2622 + * avoid double accounting
2625 + EXT4_I(inode)->i_delalloc_reserved_flag = 1;
2627 * We need to check for EXT4 here because migrate
2628 * could have changed the inode type in between
2629 @@ -996,23 +1141,39 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
2635 + EXT4_I(inode)->i_delalloc_reserved_flag = 0;
2637 + * Update reserved blocks/metadata blocks
2638 + * after successful block allocation
2639 + * which were deferred till now
2641 + if ((retval > 0) && buffer_delay(bh))
2642 + ext4_da_update_reserve_space(inode, retval);
2645 up_write((&EXT4_I(inode)->i_data_sem));
2649 +/* Maximum number of blocks we map for direct IO at once. */
2650 +#define DIO_MAX_BLOCKS 4096
2652 static int ext4_get_block(struct inode *inode, sector_t iblock,
2653 struct buffer_head *bh_result, int create)
2655 handle_t *handle = ext4_journal_current_handle();
2656 int ret = 0, started = 0;
2657 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2660 if (create && !handle) {
2661 /* Direct IO write... */
2662 if (max_blocks > DIO_MAX_BLOCKS)
2663 max_blocks = DIO_MAX_BLOCKS;
2664 - handle = ext4_journal_start(inode, DIO_CREDITS +
2665 - 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
2666 + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
2667 + handle = ext4_journal_start(inode, dio_credits);
2668 if (IS_ERR(handle)) {
2669 ret = PTR_ERR(handle);
2671 @@ -1021,7 +1182,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
2674 ret = ext4_get_blocks_wrap(handle, inode, iblock,
2675 - max_blocks, bh_result, create, 0);
2676 + max_blocks, bh_result, create, 0, 0);
2678 bh_result->b_size = (ret << inode->i_blkbits);
2680 @@ -1047,7 +1208,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
2681 dummy.b_blocknr = -1000;
2682 buffer_trace_init(&dummy.b_history);
2683 err = ext4_get_blocks_wrap(handle, inode, block, 1,
2684 - &dummy, create, 1);
2685 + &dummy, create, 1, 0);
2687 * ext4_get_blocks_handle() returns number of blocks
2688 * mapped. 0 in case of a HOLE.
2689 @@ -1203,19 +1364,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
2693 - page = __grab_cache_page(mapping, index);
2698 handle = ext4_journal_start(inode, needed_blocks);
2699 if (IS_ERR(handle)) {
2700 - unlock_page(page);
2701 - page_cache_release(page);
2702 ret = PTR_ERR(handle);
2706 + page = __grab_cache_page(mapping, index);
2708 + ext4_journal_stop(handle);
2714 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
2717 @@ -1225,8 +1387,8 @@ retry:
2721 - ext4_journal_stop(handle);
2723 + ext4_journal_stop(handle);
2724 page_cache_release(page);
2727 @@ -1236,15 +1398,6 @@ out:
2731 -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
2733 - int err = jbd2_journal_dirty_data(handle, bh);
2735 - ext4_journal_abort_handle(__func__, __func__,
2740 /* For write_end() in data=journal mode */
2741 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
2743 @@ -1255,29 +1408,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
2747 - * Generic write_end handler for ordered and writeback ext4 journal modes.
2748 - * We can't use generic_write_end, because that unlocks the page and we need to
2749 - * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
2750 - * after block_write_end.
2752 -static int ext4_generic_write_end(struct file *file,
2753 - struct address_space *mapping,
2754 - loff_t pos, unsigned len, unsigned copied,
2755 - struct page *page, void *fsdata)
2757 - struct inode *inode = file->f_mapping->host;
2759 - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2761 - if (pos+copied > inode->i_size) {
2762 - i_size_write(inode, pos+copied);
2763 - mark_inode_dirty(inode);
2770 * We need to pick up the new inode size which generic_commit_write gave us
2771 * `file' can be NULL - eg, when called from page_symlink().
2773 @@ -1290,15 +1420,10 @@ static int ext4_ordered_write_end(struct file *file,
2774 struct page *page, void *fsdata)
2776 handle_t *handle = ext4_journal_current_handle();
2777 - struct inode *inode = file->f_mapping->host;
2778 - unsigned from, to;
2779 + struct inode *inode = mapping->host;
2782 - from = pos & (PAGE_CACHE_SIZE - 1);
2785 - ret = walk_page_buffers(handle, page_buffers(page),
2786 - from, to, NULL, ext4_journal_dirty_data);
2787 + ret = ext4_jbd2_file_inode(handle, inode);
2791 @@ -1311,7 +1436,7 @@ static int ext4_ordered_write_end(struct file *file,
2792 new_i_size = pos + copied;
2793 if (new_i_size > EXT4_I(inode)->i_disksize)
2794 EXT4_I(inode)->i_disksize = new_i_size;
2795 - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
2796 + ret2 = generic_write_end(file, mapping, pos, len, copied,
2800 @@ -1320,8 +1445,6 @@ static int ext4_ordered_write_end(struct file *file,
2801 ret2 = ext4_journal_stop(handle);
2804 - unlock_page(page);
2805 - page_cache_release(page);
2807 return ret ? ret : copied;
2809 @@ -1332,7 +1455,7 @@ static int ext4_writeback_write_end(struct file *file,
2810 struct page *page, void *fsdata)
2812 handle_t *handle = ext4_journal_current_handle();
2813 - struct inode *inode = file->f_mapping->host;
2814 + struct inode *inode = mapping->host;
2818 @@ -1340,7 +1463,7 @@ static int ext4_writeback_write_end(struct file *file,
2819 if (new_i_size > EXT4_I(inode)->i_disksize)
2820 EXT4_I(inode)->i_disksize = new_i_size;
2822 - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
2823 + ret2 = generic_write_end(file, mapping, pos, len, copied,
2827 @@ -1349,8 +1472,6 @@ static int ext4_writeback_write_end(struct file *file,
2828 ret2 = ext4_journal_stop(handle);
2831 - unlock_page(page);
2832 - page_cache_release(page);
2834 return ret ? ret : copied;
2836 @@ -1389,15 +1510,1028 @@ static int ext4_journalled_write_end(struct file *file,
2840 + unlock_page(page);
2841 ret2 = ext4_journal_stop(handle);
2844 - unlock_page(page);
2845 page_cache_release(page);
2847 return ret ? ret : copied;
2850 +static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
2852 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2853 + unsigned long md_needed, mdblocks, total = 0;
2856 + * recalculate the amount of metadata blocks to reserve
2857 + * in order to allocate nrblocks
2858 + * worst case is one extent per block
2860 + spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2861 + total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
2862 + mdblocks = ext4_calc_metadata_amount(inode, total);
2863 + BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
2865 + md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
2866 + total = md_needed + nrblocks;
2868 + if (ext4_has_free_blocks(sbi, total) < total) {
2869 + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2872 + /* reduce fs free blocks counter */
2873 + percpu_counter_sub(&sbi->s_freeblocks_counter, total);
2875 + EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
2876 + EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
2878 + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2879 + return 0; /* success */
2882 +static void ext4_da_release_space(struct inode *inode, int to_free)
2884 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2885 + int total, mdb, mdb_free, release;
2888 + return; /* Nothing to release, exit */
2890 + spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2892 + if (!EXT4_I(inode)->i_reserved_data_blocks) {
2894 + * if there is no reserved blocks, but we try to free some
2895 + * then the counter is messed up somewhere.
2896 + * but since this function is called from invalidate
2897 + * page, it's harmless to return without any action
2899 + printk(KERN_INFO "ext4 delalloc try to release %d reserved "
2900 + "blocks for inode %lu, but there is no reserved "
2901 + "data blocks\n", to_free, inode->i_ino);
2902 + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2906 + /* recalculate the number of metablocks still need to be reserved */
2907 + total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
2908 + mdb = ext4_calc_metadata_amount(inode, total);
2910 + /* figure out how many metablocks to release */
2911 + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
2912 + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
2914 + release = to_free + mdb_free;
2916 + /* update fs free blocks counter for truncate case */
2917 + percpu_counter_add(&sbi->s_freeblocks_counter, release);
2919 + /* update per-inode reservations */
2920 + BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
2921 + EXT4_I(inode)->i_reserved_data_blocks -= to_free;
2923 + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
2924 + EXT4_I(inode)->i_reserved_meta_blocks = mdb;
2925 + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2928 +static void ext4_da_page_release_reservation(struct page *page,
2929 + unsigned long offset)
2931 + int to_release = 0;
2932 + struct buffer_head *head, *bh;
2933 + unsigned int curr_off = 0;
2935 + head = page_buffers(page);
2938 + unsigned int next_off = curr_off + bh->b_size;
2940 + if ((offset <= curr_off) && (buffer_delay(bh))) {
2942 + clear_buffer_delay(bh);
2944 + curr_off = next_off;
2945 + } while ((bh = bh->b_this_page) != head);
2946 + ext4_da_release_space(page->mapping->host, to_release);
2950 + * Delayed allocation stuff
2953 +struct mpage_da_data {
2954 + struct inode *inode;
2955 + struct buffer_head lbh; /* extent of blocks */
2956 + unsigned long first_page, next_page; /* extent of pages */
2957 + get_block_t *get_block;
2958 + struct writeback_control *wbc;
2960 + long pages_written;
2964 + * mpage_da_submit_io - walks through extent of pages and try to write
2965 + * them with writepage() call back
2967 + * @mpd->inode: inode
2968 + * @mpd->first_page: first page of the extent
2969 + * @mpd->next_page: page after the last page of the extent
2970 + * @mpd->get_block: the filesystem's block mapper function
2972 + * By the time mpage_da_submit_io() is called we expect all blocks
2973 + * to be allocated. this may be wrong if allocation failed.
2975 + * As pages are already locked by write_cache_pages(), we can't use it
2977 +static int mpage_da_submit_io(struct mpage_da_data *mpd)
2979 + struct address_space *mapping = mpd->inode->i_mapping;
2980 + int ret = 0, err, nr_pages, i;
2981 + unsigned long index, end;
2982 + struct pagevec pvec;
2984 + BUG_ON(mpd->next_page <= mpd->first_page);
2985 + pagevec_init(&pvec, 0);
2986 + index = mpd->first_page;
2987 + end = mpd->next_page - 1;
2989 + while (index <= end) {
2990 + /* XXX: optimize tail */
2991 + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2992 + if (nr_pages == 0)
2994 + for (i = 0; i < nr_pages; i++) {
2995 + struct page *page = pvec.pages[i];
2997 + index = page->index;
3002 + err = mapping->a_ops->writepage(page, mpd->wbc);
3004 + mpd->pages_written++;
3006 + * In error case, we have to continue because
3007 + * remaining pages are still locked
3008 + * XXX: unlock and re-dirty them?
3013 + pagevec_release(&pvec);
3019 + * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
3021 + * @mpd->inode - inode to walk through
3022 + * @exbh->b_blocknr - first block on a disk
3023 + * @exbh->b_size - amount of space in bytes
3024 + * @logical - first logical block to start assignment with
3026 + * the function goes through all passed space and put actual disk
3027 + * block numbers into buffer heads, dropping BH_Delay
3029 +static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
3030 + struct buffer_head *exbh)
3032 + struct inode *inode = mpd->inode;
3033 + struct address_space *mapping = inode->i_mapping;
3034 + int blocks = exbh->b_size >> inode->i_blkbits;
3035 + sector_t pblock = exbh->b_blocknr, cur_logical;
3036 + struct buffer_head *head, *bh;
3037 + pgoff_t index, end;
3038 + struct pagevec pvec;
3041 + index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
3042 + end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
3043 + cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
3045 + pagevec_init(&pvec, 0);
3047 + while (index <= end) {
3048 + /* XXX: optimize tail */
3049 + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
3050 + if (nr_pages == 0)
3052 + for (i = 0; i < nr_pages; i++) {
3053 + struct page *page = pvec.pages[i];
3055 + index = page->index;
3060 + BUG_ON(!PageLocked(page));
3061 + BUG_ON(PageWriteback(page));
3062 + BUG_ON(!page_has_buffers(page));
3064 + bh = page_buffers(page);
3067 + /* skip blocks out of the range */
3069 + if (cur_logical >= logical)
3072 + } while ((bh = bh->b_this_page) != head);
3075 + if (cur_logical >= logical + blocks)
3077 + if (buffer_delay(bh)) {
3078 + bh->b_blocknr = pblock;
3079 + clear_buffer_delay(bh);
3080 + bh->b_bdev = inode->i_sb->s_bdev;
3081 + } else if (buffer_unwritten(bh)) {
3082 + bh->b_blocknr = pblock;
3083 + clear_buffer_unwritten(bh);
3084 + set_buffer_mapped(bh);
3085 + set_buffer_new(bh);
3086 + bh->b_bdev = inode->i_sb->s_bdev;
3087 + } else if (buffer_mapped(bh))
3088 + BUG_ON(bh->b_blocknr != pblock);
3092 + } while ((bh = bh->b_this_page) != head);
3094 + pagevec_release(&pvec);
3100 + * __unmap_underlying_blocks - just a helper function to unmap
3101 + * set of blocks described by @bh
3103 +static inline void __unmap_underlying_blocks(struct inode *inode,
3104 + struct buffer_head *bh)
3106 + struct block_device *bdev = inode->i_sb->s_bdev;
3109 + blocks = bh->b_size >> inode->i_blkbits;
3110 + for (i = 0; i < blocks; i++)
3111 + unmap_underlying_metadata(bdev, bh->b_blocknr + i);
3115 + * mpage_da_map_blocks - go through given space
3117 + * @mpd->lbh - bh describing space
3118 + * @mpd->get_block - the filesystem's block mapper function
3120 + * The function skips space we know is already mapped to disk blocks.
3123 +static void mpage_da_map_blocks(struct mpage_da_data *mpd)
3126 + struct buffer_head *lbh = &mpd->lbh;
3127 + sector_t next = lbh->b_blocknr;
3128 + struct buffer_head new;
3131 + * We consider only non-mapped and non-allocated blocks
3133 + if (buffer_mapped(lbh) && !buffer_delay(lbh))
3136 + new.b_state = lbh->b_state;
3137 + new.b_blocknr = 0;
3138 + new.b_size = lbh->b_size;
3141 + * If we didn't accumulate anything
3142 + * to write simply return
3146 + err = mpd->get_block(mpd->inode, next, &new, 1);
3149 + BUG_ON(new.b_size == 0);
3151 + if (buffer_new(&new))
3152 + __unmap_underlying_blocks(mpd->inode, &new);
3155 + * If blocks are delayed marked, we need to
3156 + * put actual blocknr and drop delayed bit
3158 + if (buffer_delay(lbh) || buffer_unwritten(lbh))
3159 + mpage_put_bnr_to_bhs(mpd, next, &new);
3164 +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
3165 + (1 << BH_Delay) | (1 << BH_Unwritten))
3168 + * mpage_add_bh_to_extent - try to add one more block to extent of blocks
3170 + * @mpd->lbh - extent of blocks
3171 + * @logical - logical number of the block in the file
3172 + * @bh - bh of the block (used to access block's state)
3174 + * the function is used to collect contiguous blocks in the same state
3176 +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
3177 + sector_t logical, struct buffer_head *bh)
3180 + size_t b_size = bh->b_size;
3181 + struct buffer_head *lbh = &mpd->lbh;
3182 + int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
3184 + /* check if the reserved journal credits might overflow */
3185 + if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
3186 + if (nrblocks >= EXT4_MAX_TRANS_DATA) {
3188 + * With non-extent format we are limited by the journal
3189 + * credit available. Total credit needed to insert
3190 + * nrblocks contiguous blocks is dependent on the
3191 + * nrblocks. So limit nrblocks.
3194 + } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
3195 + EXT4_MAX_TRANS_DATA) {
3197 + * Adding the new buffer_head would make it cross the
3198 + * allowed limit for which we have journal credit
3199 + * reserved. So limit the new bh->b_size
3201 + b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
3202 + mpd->inode->i_blkbits;
3203 + /* we will do mpage_da_submit_io in the next loop */
3207 + * First block in the extent
3209 + if (lbh->b_size == 0) {
3210 + lbh->b_blocknr = logical;
3211 + lbh->b_size = b_size;
3212 + lbh->b_state = bh->b_state & BH_FLAGS;
3216 + next = lbh->b_blocknr + nrblocks;
3218 + * Can we merge the block to our big extent?
3220 + if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
3221 + lbh->b_size += b_size;
3227 + * We couldn't merge the block to our extent, so we
3228 + * need to flush current extent and start new one
3230 + mpage_da_map_blocks(mpd);
3231 + mpage_da_submit_io(mpd);
3237 + * __mpage_da_writepage - finds extent of pages and blocks
3239 + * @page: page to consider
3240 + * @wbc: not used, we just follow rules
3243 + * The function finds extents of pages and scans them for all blocks.
3245 +static int __mpage_da_writepage(struct page *page,
3246 + struct writeback_control *wbc, void *data)
3248 + struct mpage_da_data *mpd = data;
3249 + struct inode *inode = mpd->inode;
3250 + struct buffer_head *bh, *head, fake;
3253 + if (mpd->io_done) {
3255 + * Rest of the page in the page_vec
3256 + * redirty them and skip them. We will
3257 + * try to write them again after
3258 + * starting a new transaction
3260 + redirty_page_for_writepage(wbc, page);
3261 + unlock_page(page);
3262 + return MPAGE_DA_EXTENT_TAIL;
3265 + * Can we merge this page to current extent?
3267 + if (mpd->next_page != page->index) {
3269 + * Nope, we can't. So, we map non-allocated blocks
3270 + * and start IO on them using writepage()
3272 + if (mpd->next_page != mpd->first_page) {
3273 + mpage_da_map_blocks(mpd);
3274 + mpage_da_submit_io(mpd);
3276 + * skip rest of the page in the page_vec
3279 + redirty_page_for_writepage(wbc, page);
3280 + unlock_page(page);
3281 + return MPAGE_DA_EXTENT_TAIL;
3285 + * Start next extent of pages ...
3287 + mpd->first_page = page->index;
3292 + mpd->lbh.b_size = 0;
3293 + mpd->lbh.b_state = 0;
3294 + mpd->lbh.b_blocknr = 0;
3297 + mpd->next_page = page->index + 1;
3298 + logical = (sector_t) page->index <<
3299 + (PAGE_CACHE_SHIFT - inode->i_blkbits);
3301 + if (!page_has_buffers(page)) {
3303 + * There are no attached buffer heads yet (mmap?)
3304 + * we treat the page as full of dirty blocks
3307 + bh->b_size = PAGE_CACHE_SIZE;
3309 + set_buffer_dirty(bh);
3310 + set_buffer_uptodate(bh);
3311 + mpage_add_bh_to_extent(mpd, logical, bh);
3313 + return MPAGE_DA_EXTENT_TAIL;
3316 + * Page with regular buffer heads, just add all dirty ones
3318 + head = page_buffers(page);
3321 + BUG_ON(buffer_locked(bh));
3322 + if (buffer_dirty(bh) &&
3323 + (!buffer_mapped(bh) || buffer_delay(bh))) {
3324 + mpage_add_bh_to_extent(mpd, logical, bh);
3326 + return MPAGE_DA_EXTENT_TAIL;
3329 + } while ((bh = bh->b_this_page) != head);
3336 + * mpage_da_writepages - walk the list of dirty pages of the given
3337 + * address space, allocates non-allocated blocks, maps newly-allocated
3338 + * blocks to existing bhs and issues IO on them
3340 + * @mapping: address space structure to write
3341 + * @wbc: subtract the number of written pages from *@wbc->nr_to_write
3342 + * @get_block: the filesystem's block mapper function.
3344 + * This is a library function, which implements the writepages()
3345 + * address_space_operation.
3347 +static int mpage_da_writepages(struct address_space *mapping,
3348 + struct writeback_control *wbc,
3349 + get_block_t get_block)
3351 + struct mpage_da_data mpd;
3356 + return generic_writepages(mapping, wbc);
3359 + mpd.inode = mapping->host;
3360 + mpd.lbh.b_size = 0;
3361 + mpd.lbh.b_state = 0;
3362 + mpd.lbh.b_blocknr = 0;
3363 + mpd.first_page = 0;
3364 + mpd.next_page = 0;
3365 + mpd.get_block = get_block;
3367 + mpd.pages_written = 0;
3369 + to_write = wbc->nr_to_write;
3371 + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
3374 + * Handle last extent of pages
3376 + if (!mpd.io_done && mpd.next_page != mpd.first_page) {
3377 + mpage_da_map_blocks(&mpd);
3378 + mpage_da_submit_io(&mpd);
3381 + wbc->nr_to_write = to_write - mpd.pages_written;
3386 + * this is a special callback for ->write_begin() only
3387 + * its intention is to return a mapped block or reserve space
3389 +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
3390 + struct buffer_head *bh_result, int create)
3394 + BUG_ON(create == 0);
3395 + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
3398 + * first, we need to know whether the block is allocated already
3399 + * preallocated blocks are unmapped but should be treated
3400 + * the same as allocated blocks.
3402 + ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0);
3403 + if ((ret == 0) && !buffer_delay(bh_result)) {
3404 + /* the block isn't (pre)allocated yet, let's reserve space */
3406 + * XXX: __block_prepare_write() unmaps passed block,
3409 + ret = ext4_da_reserve_space(inode, 1);
3411 + /* not enough space to reserve */
3414 + map_bh(bh_result, inode->i_sb, 0);
3415 + set_buffer_new(bh_result);
3416 + set_buffer_delay(bh_result);
3417 + } else if (ret > 0) {
3418 + bh_result->b_size = (ret << inode->i_blkbits);
3424 +#define EXT4_DELALLOC_RSVED 1
3425 +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
3426 + struct buffer_head *bh_result, int create)
3429 + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3430 + loff_t disksize = EXT4_I(inode)->i_disksize;
3431 + handle_t *handle = NULL;
3433 + handle = ext4_journal_current_handle();
3435 + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
3436 + bh_result, 0, 0, 0);
3439 + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
3440 + bh_result, create, 0, EXT4_DELALLOC_RSVED);
3444 + bh_result->b_size = (ret << inode->i_blkbits);
3447 + * Update on-disk size along with block allocation
3448 + * we don't use 'extend_disksize' as size may change
3449 + * within already allocated block -bzzz
3451 + disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
3452 + if (disksize > i_size_read(inode))
3453 + disksize = i_size_read(inode);
3454 + if (disksize > EXT4_I(inode)->i_disksize) {
3456 + * XXX: replace with spinlock if seen contended -bzzz
3458 + down_write(&EXT4_I(inode)->i_data_sem);
3459 + if (disksize > EXT4_I(inode)->i_disksize)
3460 + EXT4_I(inode)->i_disksize = disksize;
3461 + up_write(&EXT4_I(inode)->i_data_sem);
3463 + if (EXT4_I(inode)->i_disksize == disksize) {
3464 + ret = ext4_mark_inode_dirty(handle, inode);
3473 +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
3476 + * unmapped buffer is possible for holes.
3477 + * delay buffer is possible with delayed allocation
3479 + return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
3482 +static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
3483 + struct buffer_head *bh_result, int create)
3486 + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3489 + * we don't want to do block allocation in writepage
3490 + * so call get_block_wrap with create = 0
3492 + ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
3493 + bh_result, 0, 0, 0);
3495 + bh_result->b_size = (ret << inode->i_blkbits);
3502 + * get called via ext4_da_writepages after taking page lock (have journal handle)
3503 + * get called via journal_submit_inode_data_buffers (no journal handle)
3504 + * get called via shrink_page_list via pdflush (no journal handle)
3505 + * or grab_page_cache when doing write_begin (have journal handle)
3507 +static int ext4_da_writepage(struct page *page,
3508 + struct writeback_control *wbc)
3512 + unsigned long len;
3513 + struct buffer_head *page_bufs;
3514 + struct inode *inode = page->mapping->host;
3516 + size = i_size_read(inode);
3517 + if (page->index == size >> PAGE_CACHE_SHIFT)
3518 + len = size & ~PAGE_CACHE_MASK;
3520 + len = PAGE_CACHE_SIZE;
3522 + if (page_has_buffers(page)) {
3523 + page_bufs = page_buffers(page);
3524 + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
3525 + ext4_bh_unmapped_or_delay)) {
3527 + * We don't want to do block allocation
3528 + * So redirty the page and return
3529 + * We may reach here when we do a journal commit
3530 + * via journal_submit_inode_data_buffers.
3531 + * If we don't have mapping block we just ignore
3532 + * them. We can also reach here via shrink_page_list
3534 + redirty_page_for_writepage(wbc, page);
3535 + unlock_page(page);
3540 + * The test for page_has_buffers() is subtle:
3541 + * We know the page is dirty but it lost buffers. That means
3542 + * that at some moment in time after write_begin()/write_end()
3543 + * has been called all buffers have been clean and thus they
3544 + * must have been written at least once. So they are all
3545 + * mapped and we can happily proceed with mapping them
3546 + * and writing the page.
3548 + * Try to initialize the buffer_heads and check whether
3549 + * all are mapped and non delay. We don't want to
3550 + * do block allocation here.
3552 + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
3553 + ext4_normal_get_block_write);
3555 + page_bufs = page_buffers(page);
3556 + /* check whether all are mapped and non delay */
3557 + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
3558 + ext4_bh_unmapped_or_delay)) {
3559 + redirty_page_for_writepage(wbc, page);
3560 + unlock_page(page);
3565 + * We can't do block allocation here
3566 + * so just redirty the page and unlock
3569 + redirty_page_for_writepage(wbc, page);
3570 + unlock_page(page);
3575 + if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
3576 + ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
3578 + ret = block_write_full_page(page,
3579 + ext4_normal_get_block_write,
3586 + * This is called via ext4_da_writepages() to
3587 + * calculate the total number of credits to reserve to fit
3588 + * a single extent allocation into a single transaction,
3589 + * ext4_da_writepages() will loop calling this before
3590 + * the block allocation.
3593 +static int ext4_da_writepages_trans_blocks(struct inode *inode)
3595 + int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
3598 + * With non-extent format the journal credit needed to
3599 + * insert nrblocks contiguous block is dependent on
3600 + * number of contiguous block. So we will limit
3601 + * number of contiguous block to a sane value
3603 + if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
3604 + (max_blocks > EXT4_MAX_TRANS_DATA))
3605 + max_blocks = EXT4_MAX_TRANS_DATA;
3607 + return ext4_chunk_trans_blocks(inode, max_blocks);
3610 +static int ext4_da_writepages(struct address_space *mapping,
3611 + struct writeback_control *wbc)
3613 + handle_t *handle = NULL;
3614 + loff_t range_start = 0;
3615 + struct inode *inode = mapping->host;
3616 + int needed_blocks, ret = 0, nr_to_writebump = 0;
3617 + long to_write, pages_skipped = 0;
3618 + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
3621 + * No pages to write? This is mainly a kludge to avoid starting
3622 + * a transaction for special inodes like journal inode on last iput()
3623 + * because that could violate lock ordering on umount
3625 + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
3628 + * Make sure nr_to_write is >= sbi->s_mb_stream_request
3629 + * This makes sure small file blocks are allocated in a
3630 + * single attempt. This ensures that small files
3631 + * get less fragmented.
3633 + if (wbc->nr_to_write < sbi->s_mb_stream_request) {
3634 + nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
3635 + wbc->nr_to_write = sbi->s_mb_stream_request;
3638 + if (!wbc->range_cyclic)
3640 + * If range_cyclic is not set force range_cont
3641 + * and save the old writeback_index
3643 + wbc->range_cont = 1;
3645 + range_start = wbc->range_start;
3646 + pages_skipped = wbc->pages_skipped;
3649 + to_write = wbc->nr_to_write;
3650 + while (!ret && to_write > 0) {
3653 + * we insert one extent at a time. So we need
3654 + * credit needed for single extent allocation.
3655 + * journalled mode is currently not supported
3658 + BUG_ON(ext4_should_journal_data(inode));
3659 + needed_blocks = ext4_da_writepages_trans_blocks(inode);
3661 + /* start a new transaction*/
3662 + handle = ext4_journal_start(inode, needed_blocks);
3663 + if (IS_ERR(handle)) {
3664 + ret = PTR_ERR(handle);
3665 + printk(KERN_EMERG "%s: jbd2_start: "
3666 + "%ld pages, ino %lu; err %d\n", __func__,
3667 + wbc->nr_to_write, inode->i_ino, ret);
3669 + goto out_writepages;
3671 + if (ext4_should_order_data(inode)) {
3673 + * With ordered mode we need to add
3674 + * the inode to the journal handle
3675 + * when we do block allocation.
3677 + ret = ext4_jbd2_file_inode(handle, inode);
3679 + ext4_journal_stop(handle);
3680 + goto out_writepages;
3684 + to_write -= wbc->nr_to_write;
3685 + ret = mpage_da_writepages(mapping, wbc,
3686 + ext4_da_get_block_write);
3687 + ext4_journal_stop(handle);
3688 + if (ret == MPAGE_DA_EXTENT_TAIL) {
3690 + * got one extent now try with
3691 + * rest of the pages
3693 + to_write += wbc->nr_to_write;
3695 + } else if (wbc->nr_to_write) {
3697 + * There is no more writeout needed
3698 + * or we requested a nonblocking writeout
3699 + * and we found the device congested
3701 + to_write += wbc->nr_to_write;
3704 + wbc->nr_to_write = to_write;
3707 + if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
3708 + /* We skipped pages in this loop */
3709 + wbc->range_start = range_start;
3710 + wbc->nr_to_write = to_write +
3711 + wbc->pages_skipped - pages_skipped;
3712 + wbc->pages_skipped = pages_skipped;
3713 + goto restart_loop;
3717 + wbc->nr_to_write = to_write - nr_to_writebump;
3718 + wbc->range_start = range_start;
3722 +static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3723 + loff_t pos, unsigned len, unsigned flags,
3724 + struct page **pagep, void **fsdata)
3726 + int ret, retries = 0;
3727 + struct page *page;
3729 + unsigned from, to;
3730 + struct inode *inode = mapping->host;
3733 + index = pos >> PAGE_CACHE_SHIFT;
3734 + from = pos & (PAGE_CACHE_SIZE - 1);
3739 + * With delayed allocation, we don't log the i_disksize update
3740 + * if there is delayed block allocation. But we still need
3741 + * to journal the i_disksize update if writes go to the end
3742 + * of file which has an already mapped buffer.
3744 + handle = ext4_journal_start(inode, 1);
3745 + if (IS_ERR(handle)) {
3746 + ret = PTR_ERR(handle);
3750 + page = __grab_cache_page(mapping, index);
3752 + ext4_journal_stop(handle);
3758 + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
3759 + ext4_da_get_block_prep);
3761 + unlock_page(page);
3762 + ext4_journal_stop(handle);
3763 + page_cache_release(page);
3766 + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3773 + * Check if we should update i_disksize
3774 + * when writing to the end of file but not requiring block allocation
3776 +static int ext4_da_should_update_i_disksize(struct page *page,
3777 + unsigned long offset)
3779 + struct buffer_head *bh;
3780 + struct inode *inode = page->mapping->host;
3784 + bh = page_buffers(page);
3785 + idx = offset >> inode->i_blkbits;
3787 + for (i=0; i < idx; i++)
3788 + bh = bh->b_this_page;
3790 + if (!buffer_mapped(bh) || (buffer_delay(bh)))
3795 +static int ext4_da_write_end(struct file *file,
3796 + struct address_space *mapping,
3797 + loff_t pos, unsigned len, unsigned copied,
3798 + struct page *page, void *fsdata)
3800 + struct inode *inode = mapping->host;
3801 + int ret = 0, ret2;
3802 + handle_t *handle = ext4_journal_current_handle();
3803 + loff_t new_i_size;
3804 + unsigned long start, end;
3806 + start = pos & (PAGE_CACHE_SIZE - 1);
3807 + end = start + copied -1;
3810 + * generic_write_end() will run mark_inode_dirty() if i_size
3811 + * changes. So let's piggyback the i_disksize mark_inode_dirty
3815 + new_i_size = pos + copied;
3816 + if (new_i_size > EXT4_I(inode)->i_disksize) {
3817 + if (ext4_da_should_update_i_disksize(page, end)) {
3818 + down_write(&EXT4_I(inode)->i_data_sem);
3819 + if (new_i_size > EXT4_I(inode)->i_disksize) {
3821 + * Updating i_disksize when extending file
3822 + * without needing block allocation
3824 + if (ext4_should_order_data(inode))
3825 + ret = ext4_jbd2_file_inode(handle,
3828 + EXT4_I(inode)->i_disksize = new_i_size;
3830 + up_write(&EXT4_I(inode)->i_data_sem);
3833 + ret2 = generic_write_end(file, mapping, pos, len, copied,
3838 + ret2 = ext4_journal_stop(handle);
3842 + return ret ? ret : copied;
3845 +static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
3848 + * Drop reserved blocks
3850 + BUG_ON(!PageLocked(page));
3851 + if (!page_has_buffers(page))
3854 + ext4_da_page_release_reservation(page, offset);
3857 + ext4_invalidatepage(page, offset);
3864 * bmap() is special. It gets used by applications such as lilo and by
3865 * the swapper to find the on-disk block of a specific piece of data.
3866 @@ -1418,6 +2552,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3870 + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
3871 + test_opt(inode->i_sb, DELALLOC)) {
3873 + * With delalloc we want to sync the file
3874 + * so that we can make sure we allocate
3877 + filemap_write_and_wait(mapping);
3880 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
3882 * This is a REALLY heavyweight approach, but the use of
3883 @@ -1462,21 +2606,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
3887 -static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
3889 - if (buffer_mapped(bh))
3890 - return ext4_journal_dirty_data(handle, bh);
3895 - * Note that we always start a transaction even if we're not journalling
3896 - * data. This is to preserve ordering: any hole instantiation within
3897 - * __block_write_full_page -> ext4_get_block() should be journalled
3898 - * along with the data so we don't crash and then get metadata which
3899 - * refers to old data.
3900 + * Note that we don't need to start a transaction unless we're journaling data
3901 + * because we should have holes filled from ext4_page_mkwrite(). We even don't
3902 + * need to file the inode to the transaction's list in ordered mode because if
3903 + * we are writing back data added by write(), the inode is already there and if
3904 + * we are writing back data modified via mmap(), no one guarantees in which
3905 + * transaction the data will hit the disk. In case we are journaling data, we
3906 + * cannot start transaction directly because transaction start ranks above page
3907 + * lock so we have to do some magic.
3909 - * In all journalling modes block_write_full_page() will start the I/O.
3910 + * In all journaling modes block_write_full_page() will start the I/O.
3914 @@ -1518,105 +2658,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
3915 * disastrous. Any write() or metadata operation will sync the fs for
3918 - * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
3919 - * we don't need to open a transaction here.
3921 -static int ext4_ordered_writepage(struct page *page,
3922 +static int __ext4_normal_writepage(struct page *page,
3923 struct writeback_control *wbc)
3925 struct inode *inode = page->mapping->host;
3926 - struct buffer_head *page_bufs;
3927 - handle_t *handle = NULL;
3931 - J_ASSERT(PageLocked(page));
3934 - * We give up here if we're reentered, because it might be for a
3935 - * different filesystem.
3937 - if (ext4_journal_current_handle())
3940 - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
3941 + if (test_opt(inode->i_sb, NOBH))
3942 + return nobh_writepage(page,
3943 + ext4_normal_get_block_write, wbc);
3945 + return block_write_full_page(page,
3946 + ext4_normal_get_block_write,
3950 - if (IS_ERR(handle)) {
3951 - ret = PTR_ERR(handle);
3954 +static int ext4_normal_writepage(struct page *page,
3955 + struct writeback_control *wbc)
3957 + struct inode *inode = page->mapping->host;
3958 + loff_t size = i_size_read(inode);
3961 - if (!page_has_buffers(page)) {
3962 - create_empty_buffers(page, inode->i_sb->s_blocksize,
3963 - (1 << BH_Dirty)|(1 << BH_Uptodate));
3964 + J_ASSERT(PageLocked(page));
3965 + if (page->index == size >> PAGE_CACHE_SHIFT)
3966 + len = size & ~PAGE_CACHE_MASK;
3968 + len = PAGE_CACHE_SIZE;
3970 + if (page_has_buffers(page)) {
3971 + /* if page has buffers it should all be mapped
3972 + * and allocated. If there are no buffers attached
3973 + * to the page we know the page is dirty but it lost
3974 + * buffers. That means that at some moment in time
3975 + * after write_begin() / write_end() has been called
3976 + * all buffers have been clean and thus they must have been
3977 + * written at least once. So they are all mapped and we can
3978 + * happily proceed with mapping them and writing the page.
3980 + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
3981 + ext4_bh_unmapped_or_delay));
3983 - page_bufs = page_buffers(page);
3984 - walk_page_buffers(handle, page_bufs, 0,
3985 - PAGE_CACHE_SIZE, NULL, bget_one);
3987 - ret = block_write_full_page(page, ext4_get_block, wbc);
3990 - * The page can become unlocked at any point now, and
3991 - * truncate can then come in and change things. So we
3992 - * can't touch *page from now on. But *page_bufs is
3993 - * safe due to elevated refcount.
3997 - * And attach them to the current transaction. But only if
3998 - * block_write_full_page() succeeded. Otherwise they are unmapped,
3999 - * and generally junk.
4002 - err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
4003 - NULL, jbd2_journal_dirty_data_fn);
4007 - walk_page_buffers(handle, page_bufs, 0,
4008 - PAGE_CACHE_SIZE, NULL, bput_one);
4009 - err = ext4_journal_stop(handle);
4013 + if (!ext4_journal_current_handle())
4014 + return __ext4_normal_writepage(page, wbc);
4017 redirty_page_for_writepage(wbc, page);
4023 -static int ext4_writeback_writepage(struct page *page,
4024 +static int __ext4_journalled_writepage(struct page *page,
4025 struct writeback_control *wbc)
4027 - struct inode *inode = page->mapping->host;
4028 + struct address_space *mapping = page->mapping;
4029 + struct inode *inode = mapping->host;
4030 + struct buffer_head *page_bufs;
4031 handle_t *handle = NULL;
4035 - if (ext4_journal_current_handle())
4037 + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
4038 + ext4_normal_get_block_write);
4042 + page_bufs = page_buffers(page);
4043 + walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
4045 + /* As soon as we unlock the page, it can go away, but we have
4046 + * references to buffers so we are safe */
4047 + unlock_page(page);
4049 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
4050 if (IS_ERR(handle)) {
4051 ret = PTR_ERR(handle);
4056 - if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
4057 - ret = nobh_writepage(page, ext4_get_block, wbc);
4059 - ret = block_write_full_page(page, ext4_get_block, wbc);
4060 + ret = walk_page_buffers(handle, page_bufs, 0,
4061 + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
4063 + err = walk_page_buffers(handle, page_bufs, 0,
4064 + PAGE_CACHE_SIZE, NULL, write_end_fn);
4067 err = ext4_journal_stop(handle);
4073 - redirty_page_for_writepage(wbc, page);
4074 + walk_page_buffers(handle, page_bufs, 0,
4075 + PAGE_CACHE_SIZE, NULL, bput_one);
4076 + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
4085 @@ -1624,59 +2762,53 @@ static int ext4_journalled_writepage(struct page *page,
4086 struct writeback_control *wbc)
4088 struct inode *inode = page->mapping->host;
4089 - handle_t *handle = NULL;
4092 + loff_t size = i_size_read(inode);
4095 - if (ext4_journal_current_handle())
4097 + J_ASSERT(PageLocked(page));
4098 + if (page->index == size >> PAGE_CACHE_SHIFT)
4099 + len = size & ~PAGE_CACHE_MASK;
4101 + len = PAGE_CACHE_SIZE;
4103 + if (page_has_buffers(page)) {
4104 + /* if page has buffers it should all be mapped
4105 + * and allocated. If there are no buffers attached
4106 + * to the page we know the page is dirty but it lost
4107 + * buffers. That means that at some moment in time
4108 + * after write_begin() / write_end() has been called
4109 + * all buffers have been clean and thus they must have been
4110 + * written at least once. So they are all mapped and we can
4111 + * happily proceed with mapping them and writing the page.
4113 + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
4114 + ext4_bh_unmapped_or_delay));
4117 - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
4118 - if (IS_ERR(handle)) {
4119 - ret = PTR_ERR(handle);
4120 + if (ext4_journal_current_handle())
4124 - if (!page_has_buffers(page) || PageChecked(page)) {
4125 + if (PageChecked(page)) {
4127 * It's mmapped pagecache. Add buffers and journal it. There
4128 * doesn't seem much point in redirtying the page here.
4130 ClearPageChecked(page);
4131 - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
4134 - ext4_journal_stop(handle);
4137 - ret = walk_page_buffers(handle, page_buffers(page), 0,
4138 - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
4140 - err = walk_page_buffers(handle, page_buffers(page), 0,
4141 - PAGE_CACHE_SIZE, NULL, write_end_fn);
4144 - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
4145 - unlock_page(page);
4146 + return __ext4_journalled_writepage(page, wbc);
4149 * It may be a page full of checkpoint-mode buffers. We don't
4150 * really know unless we go poke around in the buffer_heads.
4151 * But block_write_full_page will do the right thing.
4153 - ret = block_write_full_page(page, ext4_get_block, wbc);
4154 + return block_write_full_page(page,
4155 + ext4_normal_get_block_write,
4158 - err = ext4_journal_stop(handle);
4165 redirty_page_for_writepage(wbc, page);
4172 static int ext4_readpage(struct file *file, struct page *page)
4173 @@ -1819,7 +2951,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
4174 static const struct address_space_operations ext4_ordered_aops = {
4175 .readpage = ext4_readpage,
4176 .readpages = ext4_readpages,
4177 - .writepage = ext4_ordered_writepage,
4178 + .writepage = ext4_normal_writepage,
4179 .sync_page = block_sync_page,
4180 .write_begin = ext4_write_begin,
4181 .write_end = ext4_ordered_write_end,
4182 @@ -1833,7 +2965,7 @@ static const struct address_space_operations ext4_ordered_aops = {
4183 static const struct address_space_operations ext4_writeback_aops = {
4184 .readpage = ext4_readpage,
4185 .readpages = ext4_readpages,
4186 - .writepage = ext4_writeback_writepage,
4187 + .writepage = ext4_normal_writepage,
4188 .sync_page = block_sync_page,
4189 .write_begin = ext4_write_begin,
4190 .write_end = ext4_writeback_write_end,
4191 @@ -1857,10 +2989,31 @@ static const struct address_space_operations ext4_journalled_aops = {
4192 .releasepage = ext4_releasepage,
4195 +static const struct address_space_operations ext4_da_aops = {
4196 + .readpage = ext4_readpage,
4197 + .readpages = ext4_readpages,
4198 + .writepage = ext4_da_writepage,
4199 + .writepages = ext4_da_writepages,
4200 + .sync_page = block_sync_page,
4201 + .write_begin = ext4_da_write_begin,
4202 + .write_end = ext4_da_write_end,
4203 + .bmap = ext4_bmap,
4204 + .invalidatepage = ext4_da_invalidatepage,
4205 + .releasepage = ext4_releasepage,
4206 + .direct_IO = ext4_direct_IO,
4207 + .migratepage = buffer_migrate_page,
4210 void ext4_set_aops(struct inode *inode)
4212 - if (ext4_should_order_data(inode))
4213 + if (ext4_should_order_data(inode) &&
4214 + test_opt(inode->i_sb, DELALLOC))
4215 + inode->i_mapping->a_ops = &ext4_da_aops;
4216 + else if (ext4_should_order_data(inode))
4217 inode->i_mapping->a_ops = &ext4_ordered_aops;
4218 + else if (ext4_should_writeback_data(inode) &&
4219 + test_opt(inode->i_sb, DELALLOC))
4220 + inode->i_mapping->a_ops = &ext4_da_aops;
4221 else if (ext4_should_writeback_data(inode))
4222 inode->i_mapping->a_ops = &ext4_writeback_aops;
4224 @@ -1873,7 +3026,7 @@ void ext4_set_aops(struct inode *inode)
4225 * This required during truncate. We need to physically zero the tail end
4226 * of that block so it doesn't yield old data if the file is later grown.
4228 -int ext4_block_truncate_page(handle_t *handle, struct page *page,
4229 +int ext4_block_truncate_page(handle_t *handle,
4230 struct address_space *mapping, loff_t from)
4232 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
4233 @@ -1882,8 +3035,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
4235 struct inode *inode = mapping->host;
4236 struct buffer_head *bh;
4237 + struct page *page;
4240 + page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
4244 blocksize = inode->i_sb->s_blocksize;
4245 length = blocksize - (offset & (blocksize - 1));
4246 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
4247 @@ -1956,7 +3114,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
4248 err = ext4_journal_dirty_metadata(handle, bh);
4250 if (ext4_should_order_data(inode))
4251 - err = ext4_journal_dirty_data(handle, bh);
4252 + err = ext4_jbd2_file_inode(handle, inode);
4253 mark_buffer_dirty(bh);
4256 @@ -2179,7 +3337,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4259 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
4260 - ext4_journal_dirty_metadata(handle, this_bh);
4263 + * The buffer head should have an attached journal head at this
4264 + * point. However, if the data is corrupted and an indirect
4265 + * block pointed to itself, it would have been detached when
4266 + * the block was cleared. Check for this instead of OOPSing.
4268 + if (bh2jh(this_bh))
4269 + ext4_journal_dirty_metadata(handle, this_bh);
4271 + ext4_error(inode->i_sb, __func__,
4272 + "circular indirect block detected, "
4273 + "inode=%lu, block=%llu",
4275 + (unsigned long long) this_bh->b_blocknr);
4279 @@ -2305,6 +3477,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4283 +int ext4_can_truncate(struct inode *inode)
4285 + if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4287 + if (S_ISREG(inode->i_mode))
4289 + if (S_ISDIR(inode->i_mode))
4291 + if (S_ISLNK(inode->i_mode))
4292 + return !ext4_inode_is_fast_symlink(inode);
4299 @@ -2347,51 +3532,25 @@ void ext4_truncate(struct inode *inode)
4301 ext4_lblk_t last_block;
4302 unsigned blocksize = inode->i_sb->s_blocksize;
4303 - struct page *page;
4305 - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4306 - S_ISLNK(inode->i_mode)))
4308 - if (ext4_inode_is_fast_symlink(inode))
4310 - if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4311 + if (!ext4_can_truncate(inode))
4315 - * We have to lock the EOF page here, because lock_page() nests
4316 - * outside jbd2_journal_start().
4318 - if ((inode->i_size & (blocksize - 1)) == 0) {
4319 - /* Block boundary? Nothing to do */
4322 - page = grab_cache_page(mapping,
4323 - inode->i_size >> PAGE_CACHE_SHIFT);
4328 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
4329 - ext4_ext_truncate(inode, page);
4330 + ext4_ext_truncate(inode);
4334 handle = start_transaction(inode);
4335 - if (IS_ERR(handle)) {
4337 - clear_highpage(page);
4338 - flush_dcache_page(page);
4339 - unlock_page(page);
4340 - page_cache_release(page);
4342 + if (IS_ERR(handle))
4343 return; /* AKPM: return what? */
4346 last_block = (inode->i_size + blocksize-1)
4347 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4350 - ext4_block_truncate_page(handle, page, mapping, inode->i_size);
4351 + if (inode->i_size & (blocksize - 1))
4352 + if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4355 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4357 @@ -2410,6 +3569,14 @@ void ext4_truncate(struct inode *inode)
4361 + * From here we block out all ext4_get_block() callers who want to
4362 + * modify the block allocation tree.
4364 + down_write(&ei->i_data_sem);
4366 + ext4_discard_reservation(inode);
4369 * The orphan list entry will now protect us from any crash which
4370 * occurs before the truncate completes, so it is now safe to propagate
4371 * the new, shorter inode size (held for now in i_size) into the
4372 @@ -2418,12 +3585,6 @@ void ext4_truncate(struct inode *inode)
4374 ei->i_disksize = inode->i_size;
4377 - * From here we block out all ext4_get_block() callers who want to
4378 - * modify the block allocation tree.
4380 - down_write(&ei->i_data_sem);
4382 if (n == 1) { /* direct blocks */
4383 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4384 i_data + EXT4_NDIR_BLOCKS);
4385 @@ -2484,8 +3645,6 @@ do_indirects:
4389 - ext4_discard_reservation(inode);
4391 up_write(&ei->i_data_sem);
4392 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4393 ext4_mark_inode_dirty(handle, inode);
4394 @@ -2571,6 +3730,16 @@ static int __ext4_get_inode_loc(struct inode *inode,
4396 if (!buffer_uptodate(bh)) {
4400 + * If the buffer has the write error flag, we have failed
4401 + * to write out another inode in the same block. In this
4402 + * case, we don't have to read the block because we may
4403 + * read the old inode data successfully.
4405 + if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
4406 + set_buffer_uptodate(bh);
4408 if (buffer_uptodate(bh)) {
4409 /* someone brought it uptodate while we waited */
4411 @@ -3107,7 +4276,14 @@ int ext4_write_inode(struct inode *inode, int wait)
4412 * be freed, so we have a strong guarantee that no future commit will
4413 * leave these blocks visible to the user.)
4415 - * Called with inode->sem down.
4416 + * Another thing we have to assure is that if we are in ordered mode
4417 + * and inode is still attached to the committing transaction, we must
4418 + * we start writeout of all the dirty pages which are being truncated.
4419 + * This way we are sure that all the data written in the previous
4420 + * transaction are already on disk (truncate waits for pages under
4423 + * Called with inode->i_mutex down.
4425 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4427 @@ -3173,6 +4349,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4430 ext4_journal_stop(handle);
4432 + if (ext4_should_order_data(inode)) {
4433 + error = ext4_begin_ordered_truncate(inode,
4436 + /* Do as much error cleanup as possible */
4437 + handle = ext4_journal_start(inode, 3);
4438 + if (IS_ERR(handle)) {
4439 + ext4_orphan_del(NULL, inode);
4442 + ext4_orphan_del(handle, inode);
4443 + ext4_journal_stop(handle);
4449 rc = inode_setattr(inode, attr);
4450 @@ -3193,58 +4385,156 @@ err_out:
4454 +int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4455 + struct kstat *stat)
4457 + struct inode *inode;
4458 + unsigned long delalloc_blocks;
4460 + inode = dentry->d_inode;
4461 + generic_fillattr(inode, stat);
4464 + * We can't update i_blocks if the block allocation is delayed
4465 + * otherwise in the case of system crash before the real block
4466 + * allocation is done, we will have i_blocks inconsistent with
4467 + * on-disk file blocks.
4468 + * We always keep i_blocks updated together with real
4469 + * allocation. But to not confuse with user, stat
4470 + * will return the blocks that include the delayed allocation
4471 + * blocks for this file.
4473 + spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
4474 + delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
4475 + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
4477 + stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
4481 +static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
4486 + /* if nrblocks are contiguous */
4489 + * With N contiguous data blocks, it need at most
4490 + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
4491 + * 2 dindirect blocks
4492 + * 1 tindirect block
4494 + indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
4495 + return indirects + 3;
4498 + * if nrblocks are not contiguous, worse case, each block touch
4499 + * a indirect block, and each indirect block touch a double indirect
4500 + * block, plus a triple indirect block
4502 + indirects = nrblocks * 2 + 1;
4506 +static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4508 + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
4509 + return ext4_indirect_trans_blocks(inode, nrblocks, 0);
4510 + return ext4_ext_index_trans_blocks(inode, nrblocks, 0);
4513 - * How many blocks doth make a writepage()?
4515 - * With N blocks per page, it may be:
4517 - * 2 indirect block
4520 - * N+5 bitmap blocks (from the above)
4521 - * N+5 group descriptor summary blocks
4524 - * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files
4526 - * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
4528 - * With ordered or writeback data it's the same, less the N data blocks.
4530 - * If the inode's direct blocks can hold an integral number of pages then a
4531 - * page cannot straddle two indirect blocks, and we can only touch one indirect
4532 - * and dindirect block, and the "5" above becomes "3".
4534 - * This still overestimates under most circumstances. If we were to pass the
4535 - * start and end offsets in here as well we could do block_to_path() on each
4536 - * block and work out the exact number of indirects which are touched. Pah.
4537 + * Account for index blocks, block groups bitmaps and block group
4538 + * descriptor blocks if modify datablocks and index blocks
4539 + * worse case, the indexs blocks spread over different block groups
4541 + * If datablocks are discontiguous, they are possible to spread over
4542 + * different block groups too. If they are contiugous, with flexbg,
4543 + * they could still across block group boundary.
4545 + * Also account for superblock, inode, quota and xattr blocks
4547 +int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4549 + int groups, gdpblocks;
4554 + * How many index blocks need to touch to modify nrblocks?
4555 + * The "Chunk" flag indicating whether the nrblocks is
4556 + * physically contiguous on disk
4558 + * For Direct IO and fallocate, they calls get_block to allocate
4559 + * one single extent at a time, so they could set the "Chunk" flag
4561 + idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
4566 + * Now let's see how many group bitmaps and group descriptors need
4569 + groups = idxblocks;
4573 + groups += nrblocks;
4575 + gdpblocks = groups;
4576 + if (groups > EXT4_SB(inode->i_sb)->s_groups_count)
4577 + groups = EXT4_SB(inode->i_sb)->s_groups_count;
4578 + if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
4579 + gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
4581 + /* bitmaps and block group descriptor blocks */
4582 + ret += groups + gdpblocks;
4584 + /* Blocks for super block, inode, quota and xattr blocks */
4585 + ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
4591 + * Calulate the total number of credits to reserve to fit
4592 + * the modification of a single pages into a single transaction,
4593 + * which may include multiple chunks of block allocations.
4595 + * This could be called via ext4_write_begin()
4597 + * We need to consider the worse case, when
4598 + * one new block per extent.
4600 int ext4_writepage_trans_blocks(struct inode *inode)
4602 int bpp = ext4_journal_blocks_per_page(inode);
4603 - int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
4606 - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
4607 - return ext4_ext_writepage_trans_blocks(inode, bpp);
4608 + ret = ext4_meta_trans_blocks(inode, bpp, 0);
4610 + /* Account for data blocks for journalled mode */
4611 if (ext4_should_journal_data(inode))
4612 - ret = 3 * (bpp + indirects) + 2;
4614 - ret = 2 * (bpp + indirects) + 2;
4616 -#ifdef CONFIG_QUOTA
4617 - /* We know that structure was already allocated during DQUOT_INIT so
4618 - * we will be updating only the data blocks + inodes */
4619 - ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
4627 + * Calculate the journal credits for a chunk of data modification.
4629 + * This is called from DIO, fallocate or whoever calling
4630 + * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks.
4632 + * journal buffers for data blocks are not included here, as DIO
4633 + * and fallocate do no need to journal data buffers.
4635 +int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
4637 + return ext4_meta_trans_blocks(inode, nrblocks, 1);
4641 * The caller must have previously called ext4_reserve_inode_write().
4642 * Give this, we know that the caller already has write access to iloc->bh.
4644 @@ -3506,3 +4796,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4649 +static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
4651 + return !buffer_mapped(bh);
4654 +int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4657 + unsigned long len;
4658 + int ret = -EINVAL;
4659 + struct file *file = vma->vm_file;
4660 + struct inode *inode = file->f_path.dentry->d_inode;
4661 + struct address_space *mapping = inode->i_mapping;
4664 + * Get i_alloc_sem to stop truncates messing with the inode. We cannot
4665 + * get i_mutex because we are already holding mmap_sem.
4667 + down_read(&inode->i_alloc_sem);
4668 + size = i_size_read(inode);
4669 + if (page->mapping != mapping || size <= page_offset(page)
4670 + || !PageUptodate(page)) {
4671 + /* page got truncated from under us? */
4675 + if (PageMappedToDisk(page))
4678 + if (page->index == size >> PAGE_CACHE_SHIFT)
4679 + len = size & ~PAGE_CACHE_MASK;
4681 + len = PAGE_CACHE_SIZE;
4683 + if (page_has_buffers(page)) {
4684 + /* return if we have all the buffers mapped */
4685 + if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
4686 + ext4_bh_unmapped))
4690 + * OK, we need to fill the hole... Do write_begin write_end
4691 + * to do block allocation/reservation.We are not holding
4692 + * inode.i__mutex here. That allow * parallel write_begin,
4693 + * write_end call. lock_page prevent this from happening
4694 + * on the same page though
4696 + ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
4697 + len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
4700 + ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
4701 + len, len, page, NULL);
4706 + up_read(&inode->i_alloc_sem);
4709 diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
4710 index c9900aa..e0e3a5e 100644
4711 --- a/fs/ext4/mballoc.c
4712 +++ b/fs/ext4/mballoc.c
4713 @@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
4715 static inline int mb_find_next_zero_bit(void *addr, int max, int start)
4718 + int fix = 0, ret, tmpmax;
4719 addr = mb_correct_addr_and_bit(&fix, addr);
4721 + tmpmax = max + fix;
4724 - return ext4_find_next_zero_bit(addr, max, start) - fix;
4725 + ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
4731 static inline int mb_find_next_bit(void *addr, int max, int start)
4734 + int fix = 0, ret, tmpmax;
4735 addr = mb_correct_addr_and_bit(&fix, addr);
4737 + tmpmax = max + fix;
4740 - return ext4_find_next_bit(addr, max, start) - fix;
4741 + ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
4747 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
4748 @@ -781,13 +787,16 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
4749 if (bh_uptodate_or_lock(bh[i]))
4752 + spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
4753 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
4754 ext4_init_block_bitmap(sb, bh[i],
4755 first_group + i, desc);
4756 set_buffer_uptodate(bh[i]);
4757 unlock_buffer(bh[i]);
4758 + spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
4761 + spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
4763 bh[i]->b_end_io = end_buffer_read_sync;
4764 submit_bh(READ, bh[i]);
4765 @@ -803,6 +812,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
4766 if (!buffer_uptodate(bh[i]))
4770 first_block = page->index * blocks_per_page;
4771 for (i = 0; i < blocks_per_page; i++) {
4773 @@ -883,6 +893,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
4779 mb_debug("load group %lu\n", group);
4781 @@ -914,15 +925,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
4783 BUG_ON(page->mapping != inode->i_mapping);
4784 if (!PageUptodate(page)) {
4785 - ext4_mb_init_cache(page, NULL);
4786 + ret = ext4_mb_init_cache(page, NULL);
4788 + unlock_page(page);
4791 mb_cmp_bitmaps(e4b, page_address(page) +
4792 (poff * sb->s_blocksize));
4797 - if (page == NULL || !PageUptodate(page))
4798 + if (page == NULL || !PageUptodate(page)) {
4802 e4b->bd_bitmap_page = page;
4803 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
4804 mark_page_accessed(page);
4805 @@ -938,14 +955,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
4806 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
4808 BUG_ON(page->mapping != inode->i_mapping);
4809 - if (!PageUptodate(page))
4810 - ext4_mb_init_cache(page, e4b->bd_bitmap);
4812 + if (!PageUptodate(page)) {
4813 + ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
4815 + unlock_page(page);
4822 - if (page == NULL || !PageUptodate(page))
4823 + if (page == NULL || !PageUptodate(page)) {
4827 e4b->bd_buddy_page = page;
4828 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
4829 mark_page_accessed(page);
4830 @@ -962,7 +985,7 @@ err:
4831 page_cache_release(e4b->bd_buddy_page);
4832 e4b->bd_buddy = NULL;
4833 e4b->bd_bitmap = NULL;
4838 static void ext4_mb_release_desc(struct ext4_buddy *e4b)
4839 @@ -1031,7 +1054,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
4843 -static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
4844 +static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
4845 int first, int count)
4848 @@ -1071,11 +1094,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
4851 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4853 + ext4_unlock_group(sb, e4b->bd_group);
4854 ext4_error(sb, __func__, "double-free of inode"
4855 " %lu's block %llu(bit %u in group %lu)\n",
4856 inode ? inode->i_ino : 0, blocknr, block,
4858 + ext4_lock_group(sb, e4b->bd_group);
4860 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
4861 e4b->bd_info->bb_counters[order]++;
4862 @@ -1113,8 +1137,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
4865 mb_check_buddy(e4b);
4870 static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
4871 @@ -1730,10 +1752,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
4872 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
4873 spin_unlock(&sbi->s_md_lock);
4876 - /* searching for the right group start from the goal value specified */
4877 - group = ac->ac_g_ex.fe_group;
4879 /* Let's just scan groups to find more-less suitable blocks */
4880 cr = ac->ac_2order ? 0 : 1;
4882 @@ -1743,6 +1761,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
4884 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
4885 ac->ac_criteria = cr;
4887 + * searching for the right group start
4888 + * from the goal value specified
4890 + group = ac->ac_g_ex.fe_group;
4892 for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
4893 struct ext4_group_info *grp;
4894 struct ext4_group_desc *desc;
4895 @@ -1963,6 +1987,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
4899 + if (unlikely(sbi->s_mb_history == NULL))
4901 s = kmalloc(sizeof(*s), GFP_KERNEL);
4904 @@ -2165,9 +2191,7 @@ static void ext4_mb_history_init(struct super_block *sb)
4905 sbi->s_mb_history_cur = 0;
4906 spin_lock_init(&sbi->s_mb_history_lock);
4907 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
4908 - sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
4909 - if (likely(sbi->s_mb_history != NULL))
4910 - memset(sbi->s_mb_history, 0, i);
4911 + sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
4912 /* if we can't allocate history, then we simple won't use it */
4915 @@ -2215,21 +2239,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
4916 #define ext4_mb_history_init(sb)
4920 +/* Create and initialize ext4_group_info data for the given group. */
4921 +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
4922 + struct ext4_group_desc *desc)
4926 + struct ext4_sb_info *sbi = EXT4_SB(sb);
4927 + struct ext4_group_info **meta_group_info;
4930 + * First check if this group is the first of a reserved block.
4931 + * If it's true, we have to allocate a new table of pointers
4932 + * to ext4_group_info structures
4934 + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
4935 + metalen = sizeof(*meta_group_info) <<
4936 + EXT4_DESC_PER_BLOCK_BITS(sb);
4937 + meta_group_info = kmalloc(metalen, GFP_KERNEL);
4938 + if (meta_group_info == NULL) {
4939 + printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
4941 + goto exit_meta_group_info;
4943 + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
4948 + * calculate needed size. if change bb_counters size,
4949 + * don't forget about ext4_mb_generate_buddy()
4951 + len = offsetof(typeof(**meta_group_info),
4952 + bb_counters[sb->s_blocksize_bits + 2]);
4955 + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
4956 + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
4958 + meta_group_info[i] = kzalloc(len, GFP_KERNEL);
4959 + if (meta_group_info[i] == NULL) {
4960 + printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
4961 + goto exit_group_info;
4963 + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
4964 + &(meta_group_info[i]->bb_state));
4967 + * initialize bb_free to be able to skip
4968 + * empty groups without initialization
4970 + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
4971 + meta_group_info[i]->bb_free =
4972 + ext4_free_blocks_after_init(sb, group, desc);
4974 + meta_group_info[i]->bb_free =
4975 + le16_to_cpu(desc->bg_free_blocks_count);
4978 + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
4980 +#ifdef DOUBLE_CHECK
4982 + struct buffer_head *bh;
4983 + meta_group_info[i]->bb_bitmap =
4984 + kmalloc(sb->s_blocksize, GFP_KERNEL);
4985 + BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
4986 + bh = ext4_read_block_bitmap(sb, group);
4987 + BUG_ON(bh == NULL);
4988 + memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
4997 + /* If a meta_group_info table has been allocated, release it now */
4998 + if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
4999 + kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
5000 +exit_meta_group_info:
5002 +} /* ext4_mb_add_groupinfo */
5005 + * Add a group to the existing groups.
5006 + * This function is used for online resize
5008 +int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
5009 + struct ext4_group_desc *desc)
5011 + struct ext4_sb_info *sbi = EXT4_SB(sb);
5012 + struct inode *inode = sbi->s_buddy_cache;
5013 + int blocks_per_page;
5016 + struct page *page;
5019 + /* Add group based on group descriptor*/
5020 + err = ext4_mb_add_groupinfo(sb, group, desc);
5025 + * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
5026 + * datas) are set not up to date so that they will be re-initilaized
5027 + * during the next call to ext4_mb_load_buddy
5030 + /* Set buddy page as not up to date */
5031 + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
5032 + block = group * 2;
5033 + pnum = block / blocks_per_page;
5034 + page = find_get_page(inode->i_mapping, pnum);
5035 + if (page != NULL) {
5036 + ClearPageUptodate(page);
5037 + page_cache_release(page);
5040 + /* Set bitmap page as not up to date */
5042 + pnum = block / blocks_per_page;
5043 + page = find_get_page(inode->i_mapping, pnum);
5044 + if (page != NULL) {
5045 + ClearPageUptodate(page);
5046 + page_cache_release(page);
5053 + * Update an existing group.
5054 + * This function is used for online resize
5056 +void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
5058 + grp->bb_free += add;
5061 static int ext4_mb_init_backend(struct super_block *sb)
5064 - int j, len, metalen;
5066 struct ext4_sb_info *sbi = EXT4_SB(sb);
5067 - int num_meta_group_infos =
5068 - (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
5069 - EXT4_DESC_PER_BLOCK_BITS(sb);
5070 + struct ext4_super_block *es = sbi->s_es;
5071 + int num_meta_group_infos;
5072 + int num_meta_group_infos_max;
5074 struct ext4_group_info **meta_group_info;
5075 + struct ext4_group_desc *desc;
5077 + /* This is the number of blocks used by GDT */
5078 + num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
5079 + 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
5082 + * This is the total number of blocks used by GDT including
5083 + * the number of reserved blocks for GDT.
5084 + * The s_group_info array is allocated with this value
5085 + * to allow a clean online resize without a complex
5086 + * manipulation of pointer.
5087 + * The drawback is the unused memory when no resize
5088 + * occurs but it's very low in terms of pages
5089 + * (see comments below)
5090 + * Need to handle this properly when META_BG resizing is allowed
5092 + num_meta_group_infos_max = num_meta_group_infos +
5093 + le16_to_cpu(es->s_reserved_gdt_blocks);
5096 + * array_size is the size of s_group_info array. We round it
5097 + * to the next power of two because this approximation is done
5098 + * internally by kmalloc so we can have some more memory
5099 + * for free here (e.g. may be used for META_BG resize).
5102 + while (array_size < sizeof(*sbi->s_group_info) *
5103 + num_meta_group_infos_max)
5104 + array_size = array_size << 1;
5105 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
5106 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
5107 * So a two level scheme suffices for now. */
5108 - sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
5109 - num_meta_group_infos, GFP_KERNEL);
5110 + sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
5111 if (sbi->s_group_info == NULL) {
5112 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
5114 @@ -2256,63 +2451,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
5115 sbi->s_group_info[i] = meta_group_info;
5119 - * calculate needed size. if change bb_counters size,
5120 - * don't forget about ext4_mb_generate_buddy()
5122 - len = sizeof(struct ext4_group_info);
5123 - len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
5124 for (i = 0; i < sbi->s_groups_count; i++) {
5125 - struct ext4_group_desc *desc;
5128 - sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
5129 - j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
5131 - meta_group_info[j] = kzalloc(len, GFP_KERNEL);
5132 - if (meta_group_info[j] == NULL) {
5133 - printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
5134 - goto err_freebuddy;
5136 desc = ext4_get_group_desc(sb, i, NULL);
5139 "EXT4-fs: can't read descriptor %lu\n", i);
5143 - memset(meta_group_info[j], 0, len);
5144 - set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
5145 - &(meta_group_info[j]->bb_state));
5148 - * initialize bb_free to be able to skip
5149 - * empty groups without initialization
5151 - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
5152 - meta_group_info[j]->bb_free =
5153 - ext4_free_blocks_after_init(sb, i, desc);
5155 - meta_group_info[j]->bb_free =
5156 - le16_to_cpu(desc->bg_free_blocks_count);
5159 - INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
5161 -#ifdef DOUBLE_CHECK
5163 - struct buffer_head *bh;
5164 - meta_group_info[j]->bb_bitmap =
5165 - kmalloc(sb->s_blocksize, GFP_KERNEL);
5166 - BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
5167 - bh = read_block_bitmap(sb, i);
5168 - BUG_ON(bh == NULL);
5169 - memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
5175 + if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
5176 + goto err_freebuddy;
5180 @@ -2333,9 +2480,10 @@ err_freesgi:
5181 int ext4_mb_init(struct super_block *sb, int needs_recovery)
5183 struct ext4_sb_info *sbi = EXT4_SB(sb);
5190 if (!test_opt(sb, MBALLOC))
5192 @@ -2370,12 +2518,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
5193 } while (i <= sb->s_blocksize_bits + 1);
5195 /* init file for buddy data */
5196 - i = ext4_mb_init_backend(sb);
5198 + ret = ext4_mb_init_backend(sb);
5200 clear_opt(sbi->s_mount_opt, MBALLOC);
5201 kfree(sbi->s_mb_offsets);
5202 kfree(sbi->s_mb_maxs);
5207 spin_lock_init(&sbi->s_md_lock);
5208 @@ -2392,7 +2540,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
5209 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
5210 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
5212 - i = sizeof(struct ext4_locality_group) * NR_CPUS;
5213 + i = sizeof(struct ext4_locality_group) * nr_cpu_ids;
5214 sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
5215 if (sbi->s_locality_groups == NULL) {
5216 clear_opt(sbi->s_mount_opt, MBALLOC);
5217 @@ -2400,11 +2548,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
5218 kfree(sbi->s_mb_maxs);
5221 - for (i = 0; i < NR_CPUS; i++) {
5222 + for (i = 0; i < nr_cpu_ids; i++) {
5223 struct ext4_locality_group *lg;
5224 lg = &sbi->s_locality_groups[i];
5225 mutex_init(&lg->lg_mutex);
5226 - INIT_LIST_HEAD(&lg->lg_prealloc_list);
5227 + for (j = 0; j < PREALLOC_TB_SIZE; j++)
5228 + INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
5229 spin_lock_init(&lg->lg_prealloc_lock);
5232 @@ -2548,8 +2697,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
5233 ext4_lock_group(sb, md->group);
5234 for (i = 0; i < md->num; i++) {
5235 mb_debug(" %u", md->blocks[i]);
5236 - err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
5238 + mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
5241 ext4_unlock_group(sb, md->group);
5242 @@ -2575,25 +2723,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
5246 -#define MB_PROC_VALUE_READ(name) \
5247 -static int ext4_mb_read_##name(char *page, char **start, \
5248 - off_t off, int count, int *eof, void *data) \
5249 +#define MB_PROC_FOPS(name) \
5250 +static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
5252 - struct ext4_sb_info *sbi = data; \
5257 - len = sprintf(page, "%ld\n", sbi->s_mb_##name); \
5262 -#define MB_PROC_VALUE_WRITE(name) \
5263 -static int ext4_mb_write_##name(struct file *file, \
5264 - const char __user *buf, unsigned long cnt, void *data) \
5265 + struct ext4_sb_info *sbi = m->private; \
5267 + seq_printf(m, "%ld\n", sbi->s_mb_##name); \
5271 +static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
5273 - struct ext4_sb_info *sbi = data; \
5274 + return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
5277 +static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
5278 + const char __user *buf, size_t cnt, loff_t *ppos) \
5280 + struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
5283 if (cnt >= sizeof(str)) \
5284 @@ -2605,31 +2752,32 @@ static int ext4_mb_write_##name(struct file *file, \
5286 sbi->s_mb_##name = value; \
5291 +static const struct file_operations ext4_mb_##name##_proc_fops = { \
5292 + .owner = THIS_MODULE, \
5293 + .open = ext4_mb_##name##_proc_open, \
5294 + .read = seq_read, \
5295 + .llseek = seq_lseek, \
5296 + .release = single_release, \
5297 + .write = ext4_mb_##name##_proc_write, \
5300 -MB_PROC_VALUE_READ(stats);
5301 -MB_PROC_VALUE_WRITE(stats);
5302 -MB_PROC_VALUE_READ(max_to_scan);
5303 -MB_PROC_VALUE_WRITE(max_to_scan);
5304 -MB_PROC_VALUE_READ(min_to_scan);
5305 -MB_PROC_VALUE_WRITE(min_to_scan);
5306 -MB_PROC_VALUE_READ(order2_reqs);
5307 -MB_PROC_VALUE_WRITE(order2_reqs);
5308 -MB_PROC_VALUE_READ(stream_request);
5309 -MB_PROC_VALUE_WRITE(stream_request);
5310 -MB_PROC_VALUE_READ(group_prealloc);
5311 -MB_PROC_VALUE_WRITE(group_prealloc);
5312 +MB_PROC_FOPS(stats);
5313 +MB_PROC_FOPS(max_to_scan);
5314 +MB_PROC_FOPS(min_to_scan);
5315 +MB_PROC_FOPS(order2_reqs);
5316 +MB_PROC_FOPS(stream_request);
5317 +MB_PROC_FOPS(group_prealloc);
5319 #define MB_PROC_HANDLER(name, var) \
5321 - proc = create_proc_entry(name, mode, sbi->s_mb_proc); \
5322 + proc = proc_create_data(name, mode, sbi->s_mb_proc, \
5323 + &ext4_mb_##var##_proc_fops, sbi); \
5324 if (proc == NULL) { \
5325 printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
5328 - proc->data = sbi; \
5329 - proc->read_proc = ext4_mb_read_##var ; \
5330 - proc->write_proc = ext4_mb_write_##var; \
5333 static int ext4_mb_init_per_dev_proc(struct super_block *sb)
5334 @@ -2639,6 +2787,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
5335 struct proc_dir_entry *proc;
5338 + if (proc_root_ext4 == NULL) {
5339 + sbi->s_mb_proc = NULL;
5342 bdevname(sb->s_bdev, devname);
5343 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
5345 @@ -2747,7 +2899,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
5349 - bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
5350 + bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
5354 @@ -2816,7 +2968,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
5355 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
5356 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
5357 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
5358 - percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
5361 + * free blocks account has already be reduced/reserved
5362 + * at write_begin() time for delayed allocation
5363 + * do not double accounting
5365 + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
5366 + percpu_counter_sub(&sbi->s_freeblocks_counter,
5367 + ac->ac_b_ex.fe_len);
5369 + if (sbi->s_log_groups_per_flex) {
5370 + ext4_group_t flex_group = ext4_flex_group(sbi,
5371 + ac->ac_b_ex.fe_group);
5372 + spin_lock(sb_bgl_lock(sbi, flex_group));
5373 + sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
5374 + spin_unlock(sb_bgl_lock(sbi, flex_group));
5377 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
5379 @@ -3096,6 +3264,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
5380 struct ext4_prealloc_space *pa)
5382 unsigned int len = ac->ac_o_ex.fe_len;
5384 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
5385 &ac->ac_b_ex.fe_group,
5386 &ac->ac_b_ex.fe_start);
5387 @@ -3113,14 +3282,45 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
5391 + * Return the prealloc space that have minimal distance
5392 + * from the goal block. @cpa is the prealloc
5393 + * space that is having currently known minimal distance
5394 + * from the goal block.
5396 +static struct ext4_prealloc_space *
5397 +ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
5398 + struct ext4_prealloc_space *pa,
5399 + struct ext4_prealloc_space *cpa)
5401 + ext4_fsblk_t cur_distance, new_distance;
5403 + if (cpa == NULL) {
5404 + atomic_inc(&pa->pa_count);
5407 + cur_distance = abs(goal_block - cpa->pa_pstart);
5408 + new_distance = abs(goal_block - pa->pa_pstart);
5410 + if (cur_distance < new_distance)
5413 + /* drop the previous reference */
5414 + atomic_dec(&cpa->pa_count);
5415 + atomic_inc(&pa->pa_count);
5420 * search goal blocks in preallocated space
5422 static noinline_for_stack int
5423 ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
5426 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
5427 struct ext4_locality_group *lg;
5428 - struct ext4_prealloc_space *pa;
5429 + struct ext4_prealloc_space *pa, *cpa = NULL;
5430 + ext4_fsblk_t goal_block;
5432 /* only data can be preallocated */
5433 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
5434 @@ -3158,22 +3358,38 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
5438 + order = fls(ac->ac_o_ex.fe_len) - 1;
5439 + if (order > PREALLOC_TB_SIZE - 1)
5440 + /* The max size of hash table is PREALLOC_TB_SIZE */
5441 + order = PREALLOC_TB_SIZE - 1;
5443 + goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
5444 + ac->ac_g_ex.fe_start +
5445 + le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
5447 + * search for the prealloc space that has
5448 + * minimal distance from the goal block.
5450 + for (i = order; i < PREALLOC_TB_SIZE; i++) {
5452 + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
5454 + spin_lock(&pa->pa_lock);
5455 + if (pa->pa_deleted == 0 &&
5456 + pa->pa_free >= ac->ac_o_ex.fe_len) {
5459 - list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) {
5460 - spin_lock(&pa->pa_lock);
5461 - if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
5462 - atomic_inc(&pa->pa_count);
5463 - ext4_mb_use_group_pa(ac, pa);
5464 + cpa = ext4_mb_check_group_pa(goal_block,
5467 spin_unlock(&pa->pa_lock);
5468 - ac->ac_criteria = 20;
5469 - rcu_read_unlock();
5472 - spin_unlock(&pa->pa_lock);
5473 + rcu_read_unlock();
5476 + ext4_mb_use_group_pa(ac, cpa);
5477 + ac->ac_criteria = 20;
5480 - rcu_read_unlock();
5485 @@ -3396,6 +3612,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
5486 pa->pa_free = pa->pa_len;
5487 atomic_set(&pa->pa_count, 1);
5488 spin_lock_init(&pa->pa_lock);
5489 + INIT_LIST_HEAD(&pa->pa_inode_list);
5493 @@ -3416,10 +3633,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
5494 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
5495 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
5497 - spin_lock(pa->pa_obj_lock);
5498 - list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
5499 - spin_unlock(pa->pa_obj_lock);
5502 + * We will later add the new pa to the right bucket
5503 + * after updating the pa_free in ext4_mb_release_context
5508 @@ -3473,8 +3690,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
5511 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
5514 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
5515 le32_to_cpu(sbi->s_es->s_first_data_block);
5516 mb_debug(" free preallocated %u/%u in group %u\n",
5517 @@ -3569,22 +3784,25 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
5518 if (list_empty(&grp->bb_prealloc_list))
5521 - bitmap_bh = read_block_bitmap(sb, group);
5522 + bitmap_bh = ext4_read_block_bitmap(sb, group);
5523 if (bitmap_bh == NULL) {
5524 - /* error handling here */
5525 - ext4_mb_release_desc(&e4b);
5526 - BUG_ON(bitmap_bh == NULL);
5527 + ext4_error(sb, __func__, "Error in reading block "
5528 + "bitmap for %lu\n", group);
5532 err = ext4_mb_load_buddy(sb, group, &e4b);
5533 - BUG_ON(err != 0); /* error handling here */
5535 + ext4_error(sb, __func__, "Error in loading buddy "
5536 + "information for %lu\n", group);
5537 + put_bh(bitmap_bh);
5542 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
5544 - grp = ext4_get_group_info(sb, group);
5545 INIT_LIST_HEAD(&list);
5547 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
5549 ext4_lock_group(sb, group);
5550 @@ -3741,13 +3959,18 @@ repeat:
5551 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
5553 err = ext4_mb_load_buddy(sb, group, &e4b);
5554 - BUG_ON(err != 0); /* error handling here */
5556 + ext4_error(sb, __func__, "Error in loading buddy "
5557 + "information for %lu\n", group);
5561 - bitmap_bh = read_block_bitmap(sb, group);
5562 + bitmap_bh = ext4_read_block_bitmap(sb, group);
5563 if (bitmap_bh == NULL) {
5564 - /* error handling here */
5565 + ext4_error(sb, __func__, "Error in reading block "
5566 + "bitmap for %lu\n", group);
5567 ext4_mb_release_desc(&e4b);
5568 - BUG_ON(bitmap_bh == NULL);
5572 ext4_lock_group(sb, group);
5573 @@ -3950,22 +4173,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
5577 +static noinline_for_stack void
5578 +ext4_mb_discard_lg_preallocations(struct super_block *sb,
5579 + struct ext4_locality_group *lg,
5580 + int order, int total_entries)
5582 + ext4_group_t group = 0;
5583 + struct ext4_buddy e4b;
5584 + struct list_head discard_list;
5585 + struct ext4_prealloc_space *pa, *tmp;
5586 + struct ext4_allocation_context *ac;
5588 + mb_debug("discard locality group preallocation\n");
5590 + INIT_LIST_HEAD(&discard_list);
5591 + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
5593 + spin_lock(&lg->lg_prealloc_lock);
5594 + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
5596 + spin_lock(&pa->pa_lock);
5597 + if (atomic_read(&pa->pa_count)) {
5599 + * This is the pa that we just used
5600 + * for block allocation. So don't
5603 + spin_unlock(&pa->pa_lock);
5606 + if (pa->pa_deleted) {
5607 + spin_unlock(&pa->pa_lock);
5610 + /* only lg prealloc space */
5611 + BUG_ON(!pa->pa_linear);
5613 + /* seems this one can be freed ... */
5614 + pa->pa_deleted = 1;
5615 + spin_unlock(&pa->pa_lock);
5617 + list_del_rcu(&pa->pa_inode_list);
5618 + list_add(&pa->u.pa_tmp_list, &discard_list);
5621 + if (total_entries <= 5) {
5623 + * we want to keep only 5 entries
5624 + * allowing it to grow to 8. This
5625 + * make sure we don't call discard
5626 + * soon for this list.
5631 + spin_unlock(&lg->lg_prealloc_lock);
5633 + list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
5635 + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
5636 + if (ext4_mb_load_buddy(sb, group, &e4b)) {
5637 + ext4_error(sb, __func__, "Error in loading buddy "
5638 + "information for %lu\n", group);
5641 + ext4_lock_group(sb, group);
5642 + list_del(&pa->pa_group_list);
5643 + ext4_mb_release_group_pa(&e4b, pa, ac);
5644 + ext4_unlock_group(sb, group);
5646 + ext4_mb_release_desc(&e4b);
5647 + list_del(&pa->u.pa_tmp_list);
5648 + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
5651 + kmem_cache_free(ext4_ac_cachep, ac);
5655 + * We have incremented pa_count. So it cannot be freed at this
5656 + * point. Also we hold lg_mutex. So no parallel allocation is
5657 + * possible from this lg. That means pa_free cannot be updated.
5659 + * A parallel ext4_mb_discard_group_preallocations is possible.
5660 + * which can cause the lg_prealloc_list to be updated.
5663 +static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
5665 + int order, added = 0, lg_prealloc_count = 1;
5666 + struct super_block *sb = ac->ac_sb;
5667 + struct ext4_locality_group *lg = ac->ac_lg;
5668 + struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
5670 + order = fls(pa->pa_free) - 1;
5671 + if (order > PREALLOC_TB_SIZE - 1)
5672 + /* The max size of hash table is PREALLOC_TB_SIZE */
5673 + order = PREALLOC_TB_SIZE - 1;
5674 + /* Add the prealloc space to lg */
5676 + list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
5678 + spin_lock(&tmp_pa->pa_lock);
5679 + if (tmp_pa->pa_deleted) {
5680 + spin_unlock(&pa->pa_lock);
5683 + if (!added && pa->pa_free < tmp_pa->pa_free) {
5684 + /* Add to the tail of the previous entry */
5685 + list_add_tail_rcu(&pa->pa_inode_list,
5686 + &tmp_pa->pa_inode_list);
5689 + * we want to count the total
5690 + * number of entries in the list
5693 + spin_unlock(&tmp_pa->pa_lock);
5694 + lg_prealloc_count++;
5697 + list_add_tail_rcu(&pa->pa_inode_list,
5698 + &lg->lg_prealloc_list[order]);
5699 + rcu_read_unlock();
5701 + /* Now trim the list to be not more than 8 elements */
5702 + if (lg_prealloc_count > 8) {
5703 + ext4_mb_discard_lg_preallocations(sb, lg,
5704 + order, lg_prealloc_count);
5711 * release all resource we used in allocation
5713 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
5716 - if (ac->ac_pa->pa_linear) {
5717 + struct ext4_prealloc_space *pa = ac->ac_pa;
5719 + if (pa->pa_linear) {
5720 /* see comment in ext4_mb_use_group_pa() */
5721 - spin_lock(&ac->ac_pa->pa_lock);
5722 - ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
5723 - ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len;
5724 - ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len;
5725 - ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len;
5726 - spin_unlock(&ac->ac_pa->pa_lock);
5727 + spin_lock(&pa->pa_lock);
5728 + pa->pa_pstart += ac->ac_b_ex.fe_len;
5729 + pa->pa_lstart += ac->ac_b_ex.fe_len;
5730 + pa->pa_free -= ac->ac_b_ex.fe_len;
5731 + pa->pa_len -= ac->ac_b_ex.fe_len;
5732 + spin_unlock(&pa->pa_lock);
5734 + * We want to add the pa to the right bucket.
5735 + * Remove it from the list and while adding
5736 + * make sure the list to which we are adding
5737 + * doesn't grow big.
5739 + if (likely(pa->pa_free)) {
5740 + spin_lock(pa->pa_obj_lock);
5741 + list_del_rcu(&pa->pa_inode_list);
5742 + spin_unlock(pa->pa_obj_lock);
5743 + ext4_mb_add_n_trim(ac);
5746 - ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa);
5747 + ext4_mb_put_pa(ac, ac->ac_sb, pa);
5749 if (ac->ac_bitmap_page)
5750 page_cache_release(ac->ac_bitmap_page);
5751 @@ -4011,10 +4380,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
5754 if (!test_opt(sb, MBALLOC)) {
5755 - block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
5756 + block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
5760 + if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
5762 + * With delalloc we already reserved the blocks
5764 + ar->len = ext4_has_free_blocks(sbi, ar->len);
5767 + if (ar->len == 0) {
5772 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
5773 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
5774 @@ -4026,10 +4406,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
5778 + if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
5779 + ar->flags |= EXT4_MB_DELALLOC_RESERVED;
5781 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
5789 ext4_mb_poll_new_transaction(sb, handle);
5790 @@ -4037,12 +4421,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
5791 *errp = ext4_mb_initialize_context(ac, ar);
5798 ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
5799 if (!ext4_mb_use_preallocated(ac)) {
5801 ac->ac_op = EXT4_MB_HISTORY_ALLOC;
5802 ext4_mb_normalize_request(ac, ar);
5804 @@ -4085,11 +4468,12 @@ repeat:
5806 ext4_mb_release_context(ac);
5810 + kmem_cache_free(ext4_ac_cachep, ac);
5812 if (ar->len < inquota)
5813 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
5815 - kmem_cache_free(ext4_ac_cachep, ac);
5818 static void ext4_mb_poll_new_transaction(struct super_block *sb,
5819 @@ -4242,12 +4626,16 @@ do_more:
5820 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
5823 - bitmap_bh = read_block_bitmap(sb, block_group);
5825 + bitmap_bh = ext4_read_block_bitmap(sb, block_group);
5830 gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
5837 if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
5838 in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
5839 @@ -4309,10 +4697,9 @@ do_more:
5840 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
5842 ext4_lock_group(sb, block_group);
5843 - err = mb_free_blocks(inode, &e4b, bit, count);
5844 + mb_free_blocks(inode, &e4b, bit, count);
5845 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
5846 ext4_unlock_group(sb, block_group);
5850 spin_lock(sb_bgl_lock(sbi, block_group));
5851 @@ -4321,6 +4708,13 @@ do_more:
5852 spin_unlock(sb_bgl_lock(sbi, block_group));
5853 percpu_counter_add(&sbi->s_freeblocks_counter, count);
5855 + if (sbi->s_log_groups_per_flex) {
5856 + ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
5857 + spin_lock(sb_bgl_lock(sbi, flex_group));
5858 + sbi->s_flex_groups[flex_group].free_blocks += count;
5859 + spin_unlock(sb_bgl_lock(sbi, flex_group));
5862 ext4_mb_release_desc(&e4b);
5865 diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
5866 index bfe6add..c7c9906 100644
5867 --- a/fs/ext4/mballoc.h
5868 +++ b/fs/ext4/mballoc.h
5869 @@ -164,11 +164,17 @@ struct ext4_free_extent {
5871 * we try to group all related changes together
5872 * so that writeback can flush/allocate them together as well
5873 + * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
5874 + * (512). We store prealloc space into the hash based on the pa_free blocks
5875 + * order value, i.e. fls(pa_free)-1;
5877 +#define PREALLOC_TB_SIZE 10
5878 struct ext4_locality_group {
5880 - struct mutex lg_mutex; /* to serialize allocates */
5881 - struct list_head lg_prealloc_list;/* list of preallocations */
5882 + /* to serialize allocates */
5883 + struct mutex lg_mutex;
5884 + /* list of preallocations */
5885 + struct list_head lg_prealloc_list[PREALLOC_TB_SIZE];
5886 spinlock_t lg_prealloc_lock;
5889 diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
5890 index b9e077b..46fc0b5 100644
5891 --- a/fs/ext4/migrate.c
5892 +++ b/fs/ext4/migrate.c
5893 @@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
5894 * credit. But below we try to not accumalate too much
5895 * of them by restarting the journal.
5897 - needed = ext4_ext_calc_credits_for_insert(inode, path);
5898 + needed = ext4_ext_calc_credits_for_single_extent(inode,
5899 + lb->last_block - lb->first_block + 1, path);
5902 * Make sure the credit we accumalated is not really high
5903 diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
5904 index ab16bea..387ad98 100644
5905 --- a/fs/ext4/namei.c
5906 +++ b/fs/ext4/namei.c
5907 @@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
5908 struct inode *inode);
5911 + * p is at least 6 bytes before the end of page
5913 +static inline struct ext4_dir_entry_2 *
5914 +ext4_next_entry(struct ext4_dir_entry_2 *p)
5916 + return (struct ext4_dir_entry_2 *)((char *)p +
5917 + ext4_rec_len_from_disk(p->rec_len));
5921 * Future: use high four bits of block for coalesce-on-delete flags
5922 * Mask them off for now.
5924 @@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
5926 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
5927 EXT4_DIR_REC_LEN(2) - infosize;
5928 - return 0? 20: entry_space / sizeof(struct dx_entry);
5929 + return entry_space / sizeof(struct dx_entry);
5932 static inline unsigned dx_node_limit (struct inode *dir)
5934 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
5935 - return 0? 22: entry_space / sizeof(struct dx_entry);
5936 + return entry_space / sizeof(struct dx_entry);
5940 @@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
5944 - * p is at least 6 bytes before the end of page
5946 -static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
5948 - return (struct ext4_dir_entry_2 *)((char *)p +
5949 - ext4_rec_len_from_disk(p->rec_len));
5953 * This function fills a red-black tree with information from a
5954 * directory block. It returns the number directory entries loaded
5955 * into the tree. If there is an error it is returned in err.
5956 @@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
5957 de = (struct ext4_dir_entry_2 *) bh->b_data;
5958 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
5959 EXT4_DIR_REC_LEN(0));
5960 - for (; de < top; de = ext4_next_entry(de))
5961 - if (ext4_match (namelen, name, de)) {
5962 - if (!ext4_check_dir_entry("ext4_find_entry",
5964 - (block<<EXT4_BLOCK_SIZE_BITS(sb))
5965 - +((char *)de - bh->b_data))) {
5967 + for (; de < top; de = ext4_next_entry(de)) {
5968 + int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
5969 + + ((char *) de - bh->b_data);
5971 + if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
5973 *err = ERR_BAD_DX_DIR;
5977 - dx_release (frames);
5980 + if (ext4_match(namelen, name, de)) {
5982 + dx_release(frames);
5987 /* Check to see if we should continue to search */
5988 diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
5989 index 9ff7b1c..b3d3560 100644
5990 --- a/fs/ext4/resize.c
5991 +++ b/fs/ext4/resize.c
5992 @@ -73,7 +73,7 @@ static int verify_group_input(struct super_block *sb,
5993 "Inode bitmap not in group (block %llu)",
5994 (unsigned long long)input->inode_bitmap);
5995 else if (outside(input->inode_table, start, end) ||
5996 - outside(itend - 1, start, end))
5997 + outside(itend - 1, start, end))
5998 ext4_warning(sb, __func__,
5999 "Inode table not in group (blocks %llu-%llu)",
6000 (unsigned long long)input->inode_table, itend - 1);
6001 @@ -104,7 +104,7 @@ static int verify_group_input(struct super_block *sb,
6002 (unsigned long long)input->inode_bitmap,
6003 start, metaend - 1);
6004 else if (inside(input->inode_table, start, metaend) ||
6005 - inside(itend - 1, start, metaend))
6006 + inside(itend - 1, start, metaend))
6007 ext4_warning(sb, __func__,
6008 "Inode table (%llu-%llu) overlaps"
6009 "GDT table (%llu-%llu)",
6010 @@ -158,9 +158,9 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
6012 if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
6014 - if ((err = ext4_journal_get_write_access(handle, bh)))
6015 + if ((err = ext4_journal_get_write_access(handle, bh)))
6022 @@ -416,11 +416,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
6023 "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
6027 - * If we are not using the primary superblock/GDT copy don't resize,
6028 - * because the user tools have no way of handling this. Probably a
6029 - * bad time to do it anyways.
6032 + * If we are not using the primary superblock/GDT copy don't resize,
6033 + * because the user tools have no way of handling this. Probably a
6034 + * bad time to do it anyways.
6036 if (EXT4_SB(sb)->s_sbh->b_blocknr !=
6037 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
6038 ext4_warning(sb, __func__,
6039 @@ -507,14 +507,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
6043 - //ext4_journal_release_buffer(handle, iloc.bh);
6044 + /* ext4_journal_release_buffer(handle, iloc.bh); */
6047 - //ext4_journal_release_buffer(handle, dind);
6048 + /* ext4_journal_release_buffer(handle, dind); */
6050 - //ext4_journal_release_buffer(handle, *primary);
6051 + /* ext4_journal_release_buffer(handle, *primary); */
6053 - //ext4_journal_release_buffer(handle, *primary);
6054 + /* ext4_journal_release_buffer(handle, *primary); */
6058 @@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
6060 if (reserved_gdb || gdb_off == 0) {
6061 if (!EXT4_HAS_COMPAT_FEATURE(sb,
6062 - EXT4_FEATURE_COMPAT_RESIZE_INODE)){
6063 + EXT4_FEATURE_COMPAT_RESIZE_INODE)
6064 + || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
6065 ext4_warning(sb, __func__,
6066 "No reserved GDT blocks, can't resize");
6068 @@ -818,12 +819,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
6069 if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
6073 - * We will only either add reserved group blocks to a backup group
6074 - * or remove reserved blocks for the first group in a new group block.
6075 - * Doing both would be mean more complex code, and sane people don't
6076 - * use non-sparse filesystems anymore. This is already checked above.
6079 + * We will only either add reserved group blocks to a backup group
6080 + * or remove reserved blocks for the first group in a new group block.
6081 + * Doing both would mean more complex code, and sane people don't
6082 + * use non-sparse filesystems anymore. This is already checked above.
6085 primary = sbi->s_group_desc[gdb_num];
6086 if ((err = ext4_journal_get_write_access(handle, primary)))
6087 @@ -835,24 +836,24 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
6088 } else if ((err = add_new_gdb(handle, inode, input, &primary)))
6092 - * OK, now we've set up the new group. Time to make it active.
6094 - * Current kernels don't lock all allocations via lock_super(),
6095 - * so we have to be safe wrt. concurrent accesses the group
6096 - * data. So we need to be careful to set all of the relevant
6097 - * group descriptor data etc. *before* we enable the group.
6099 - * The key field here is sbi->s_groups_count: as long as
6100 - * that retains its old value, nobody is going to access the new
6103 - * So first we update all the descriptor metadata for the new
6104 - * group; then we update the total disk blocks count; then we
6105 - * update the groups count to enable the group; then finally we
6106 - * update the free space counts so that the system can start
6107 - * using the new disk blocks.
6110 + * OK, now we've set up the new group. Time to make it active.
6112 + * Current kernels don't lock all allocations via lock_super(),
6113 + * so we have to be safe wrt. concurrent accesses the group
6114 + * data. So we need to be careful to set all of the relevant
6115 + * group descriptor data etc. *before* we enable the group.
6117 + * The key field here is sbi->s_groups_count: as long as
6118 + * that retains its old value, nobody is going to access the new
6121 + * So first we update all the descriptor metadata for the new
6122 + * group; then we update the total disk blocks count; then we
6123 + * update the groups count to enable the group; then finally we
6124 + * update the free space counts so that the system can start
6125 + * using the new disk blocks.
6128 /* Update group descriptor block for new group */
6129 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
6130 @@ -866,6 +867,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
6131 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
6134 + * We can allocate memory for mb_alloc based on the new group
6137 + if (test_opt(sb, MBALLOC)) {
6138 + err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
6140 + goto exit_journal;
6143 * Make the new blocks and inodes valid next. We do this before
6144 * increasing the group count so that once the group is enabled,
6145 * all of its blocks and inodes are already valid.
6146 @@ -937,7 +947,8 @@ exit_put:
6148 } /* ext4_group_add */
6150 -/* Extend the filesystem to the new number of blocks specified. This entry
6152 + * Extend the filesystem to the new number of blocks specified. This entry
6153 * point is only used to extend the current filesystem to the end of the last
6154 * existing group. It can be accessed via ioctl, or by "remount,resize=<size>"
6155 * for emergencies (because it has no dependencies on reserved blocks).
6156 @@ -957,6 +968,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
6159 unsigned long freed_blocks;
6160 + ext4_group_t group;
6161 + struct ext4_group_info *grp;
6163 /* We don't need to worry about locking wrt other resizers just
6164 * yet: we're going to revalidate es->s_blocks_count after
6165 @@ -988,7 +1001,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
6168 /* Handle the remaining blocks in the last group only. */
6169 - ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
6170 + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
6173 ext4_warning(sb, __func__,
6174 @@ -1013,7 +1026,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
6175 o_blocks_count + add, add);
6177 /* See if the device is actually as big as what was requested */
6178 - bh = sb_bread(sb, o_blocks_count + add -1);
6179 + bh = sb_bread(sb, o_blocks_count + add - 1);
6181 ext4_warning(sb, __func__,
6182 "can't read last block, resize aborted");
6183 @@ -1060,6 +1073,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
6184 o_blocks_count + add);
6185 if ((err = ext4_journal_stop(handle)))
6189 + * Mark mballoc pages as not up to date so that they will be updated
6190 + * next time they are loaded by ext4_mb_load_buddy.
6192 + if (test_opt(sb, MBALLOC)) {
6193 + struct ext4_sb_info *sbi = EXT4_SB(sb);
6194 + struct inode *inode = sbi->s_buddy_cache;
6195 + int blocks_per_page;
6198 + struct page *page;
6200 + /* Set buddy page as not up to date */
6201 + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
6202 + block = group * 2;
6203 + pnum = block / blocks_per_page;
6204 + page = find_get_page(inode->i_mapping, pnum);
6205 + if (page != NULL) {
6206 + ClearPageUptodate(page);
6207 + page_cache_release(page);
6210 + /* Set bitmap page as not up to date */
6212 + pnum = block / blocks_per_page;
6213 + page = find_get_page(inode->i_mapping, pnum);
6214 + if (page != NULL) {
6215 + ClearPageUptodate(page);
6216 + page_cache_release(page);
6219 + /* Get the info on the last group */
6220 + grp = ext4_get_group_info(sb, group);
6222 + /* Update free blocks in group info */
6223 + ext4_mb_update_group_info(grp, add);
6226 if (test_opt(sb, DEBUG))
6227 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
6228 ext4_blocks_count(es));
6229 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
6230 index 02bf243..ed80f9f 100644
6231 --- a/fs/ext4/super.c
6232 +++ b/fs/ext4/super.c
6233 @@ -49,20 +49,19 @@ static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
6234 unsigned long journal_devnum);
6235 static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
6237 -static void ext4_commit_super (struct super_block * sb,
6238 - struct ext4_super_block * es,
6240 -static void ext4_mark_recovery_complete(struct super_block * sb,
6241 - struct ext4_super_block * es);
6242 -static void ext4_clear_journal_err(struct super_block * sb,
6243 - struct ext4_super_block * es);
6244 +static void ext4_commit_super(struct super_block *sb,
6245 + struct ext4_super_block *es, int sync);
6246 +static void ext4_mark_recovery_complete(struct super_block *sb,
6247 + struct ext4_super_block *es);
6248 +static void ext4_clear_journal_err(struct super_block *sb,
6249 + struct ext4_super_block *es);
6250 static int ext4_sync_fs(struct super_block *sb, int wait);
6251 -static const char *ext4_decode_error(struct super_block * sb, int errno,
6252 +static const char *ext4_decode_error(struct super_block *sb, int errno,
6254 -static int ext4_remount (struct super_block * sb, int * flags, char * data);
6255 -static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf);
6256 +static int ext4_remount(struct super_block *sb, int *flags, char *data);
6257 +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
6258 static void ext4_unlockfs(struct super_block *sb);
6259 -static void ext4_write_super (struct super_block * sb);
6260 +static void ext4_write_super(struct super_block *sb);
6261 static void ext4_write_super_lockfs(struct super_block *sb);
6264 @@ -211,15 +210,15 @@ static void ext4_handle_error(struct super_block *sb)
6265 if (sb->s_flags & MS_RDONLY)
6268 - if (!test_opt (sb, ERRORS_CONT)) {
6269 + if (!test_opt(sb, ERRORS_CONT)) {
6270 journal_t *journal = EXT4_SB(sb)->s_journal;
6272 EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
6274 jbd2_journal_abort(journal, -EIO);
6276 - if (test_opt (sb, ERRORS_RO)) {
6277 - printk (KERN_CRIT "Remounting filesystem read-only\n");
6278 + if (test_opt(sb, ERRORS_RO)) {
6279 + printk(KERN_CRIT "Remounting filesystem read-only\n");
6280 sb->s_flags |= MS_RDONLY;
6282 ext4_commit_super(sb, es, 1);
6283 @@ -228,13 +227,13 @@ static void ext4_handle_error(struct super_block *sb)
6287 -void ext4_error (struct super_block * sb, const char * function,
6288 - const char * fmt, ...)
6289 +void ext4_error(struct super_block *sb, const char *function,
6290 + const char *fmt, ...)
6294 va_start(args, fmt);
6295 - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
6296 + printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
6300 @@ -242,7 +241,7 @@ void ext4_error (struct super_block * sb, const char * function,
6301 ext4_handle_error(sb);
6304 -static const char *ext4_decode_error(struct super_block * sb, int errno,
6305 +static const char *ext4_decode_error(struct super_block *sb, int errno,
6308 char *errstr = NULL;
6309 @@ -278,8 +277,7 @@ static const char *ext4_decode_error(struct super_block * sb, int errno,
6310 /* __ext4_std_error decodes expected errors from journaling functions
6311 * automatically and invokes the appropriate error response. */
6313 -void __ext4_std_error (struct super_block * sb, const char * function,
6315 +void __ext4_std_error(struct super_block *sb, const char *function, int errno)
6319 @@ -292,8 +290,8 @@ void __ext4_std_error (struct super_block * sb, const char * function,
6322 errstr = ext4_decode_error(sb, errno, nbuf);
6323 - printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
6324 - sb->s_id, function, errstr);
6325 + printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
6326 + sb->s_id, function, errstr);
6328 ext4_handle_error(sb);
6330 @@ -308,15 +306,15 @@ void __ext4_std_error (struct super_block * sb, const char * function,
6331 * case we take the easy way out and panic immediately.
6334 -void ext4_abort (struct super_block * sb, const char * function,
6335 - const char * fmt, ...)
6336 +void ext4_abort(struct super_block *sb, const char *function,
6337 + const char *fmt, ...)
6341 - printk (KERN_CRIT "ext4_abort called.\n");
6342 + printk(KERN_CRIT "ext4_abort called.\n");
6344 va_start(args, fmt);
6345 - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
6346 + printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
6350 @@ -334,8 +332,8 @@ void ext4_abort (struct super_block * sb, const char * function,
6351 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
6354 -void ext4_warning (struct super_block * sb, const char * function,
6355 - const char * fmt, ...)
6356 +void ext4_warning(struct super_block *sb, const char *function,
6357 + const char *fmt, ...)
6361 @@ -496,7 +494,7 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
6365 -static void ext4_put_super (struct super_block * sb)
6366 +static void ext4_put_super(struct super_block *sb)
6368 struct ext4_sb_info *sbi = EXT4_SB(sb);
6369 struct ext4_super_block *es = sbi->s_es;
6370 @@ -506,6 +504,7 @@ static void ext4_put_super (struct super_block * sb)
6371 ext4_ext_release(sb);
6372 ext4_xattr_put_super(sb);
6373 jbd2_journal_destroy(sbi->s_journal);
6374 + sbi->s_journal = NULL;
6375 if (!(sb->s_flags & MS_RDONLY)) {
6376 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
6377 es->s_state = cpu_to_le16(sbi->s_mount_state);
6378 @@ -517,6 +516,7 @@ static void ext4_put_super (struct super_block * sb)
6379 for (i = 0; i < sbi->s_gdb_count; i++)
6380 brelse(sbi->s_group_desc[i]);
6381 kfree(sbi->s_group_desc);
6382 + kfree(sbi->s_flex_groups);
6383 percpu_counter_destroy(&sbi->s_freeblocks_counter);
6384 percpu_counter_destroy(&sbi->s_freeinodes_counter);
6385 percpu_counter_destroy(&sbi->s_dirs_counter);
6386 @@ -568,9 +568,16 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
6388 ei->i_block_alloc_info = NULL;
6389 ei->vfs_inode.i_version = 1;
6390 + ei->vfs_inode.i_data.writeback_index = 0;
6391 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
6392 INIT_LIST_HEAD(&ei->i_prealloc_list);
6393 spin_lock_init(&ei->i_prealloc_lock);
6394 + jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
6395 + ei->i_reserved_data_blocks = 0;
6396 + ei->i_reserved_meta_blocks = 0;
6397 + ei->i_allocated_meta_blocks = 0;
6398 + ei->i_delalloc_reserved_flag = 0;
6399 + spin_lock_init(&(ei->i_block_reservation_lock));
6400 return &ei->vfs_inode;
6403 @@ -635,9 +642,12 @@ static void ext4_clear_inode(struct inode *inode)
6404 EXT4_I(inode)->i_block_alloc_info = NULL;
6407 + jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
6408 + &EXT4_I(inode)->jinode);
6411 -static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
6412 +static inline void ext4_show_quota_options(struct seq_file *seq,
6413 + struct super_block *sb)
6415 #if defined(CONFIG_QUOTA)
6416 struct ext4_sb_info *sbi = EXT4_SB(sb);
6417 @@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
6418 unsigned long def_mount_opts;
6419 struct super_block *sb = vfs->mnt_sb;
6420 struct ext4_sb_info *sbi = EXT4_SB(sb);
6421 - journal_t *journal = sbi->s_journal;
6422 struct ext4_super_block *es = sbi->s_es;
6424 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
6425 @@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
6426 seq_puts(seq, ",nomballoc");
6427 if (test_opt(sb, I_VERSION))
6428 seq_puts(seq, ",i_version");
6429 + if (!test_opt(sb, DELALLOC))
6430 + seq_puts(seq, ",nodelalloc");
6434 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
6435 @@ -810,8 +822,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
6439 -#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
6440 -#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
6441 +#define QTYPE2NAME(t) ((t) == USRQUOTA?"user":"group")
6442 +#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
6444 static int ext4_dquot_initialize(struct inode *inode, int type);
6445 static int ext4_dquot_drop(struct inode *inode);
6446 @@ -894,7 +906,7 @@ enum {
6447 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
6448 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
6449 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
6450 - Opt_mballoc, Opt_nomballoc, Opt_stripe,
6451 + Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
6454 static match_table_t tokens = {
6455 @@ -953,6 +965,8 @@ static match_table_t tokens = {
6456 {Opt_nomballoc, "nomballoc"},
6457 {Opt_stripe, "stripe=%u"},
6458 {Opt_resize, "resize"},
6459 + {Opt_delalloc, "delalloc"},
6460 + {Opt_nodelalloc, "nodelalloc"},
6464 @@ -977,12 +991,12 @@ static ext4_fsblk_t get_sb_block(void **data)
6468 -static int parse_options (char *options, struct super_block *sb,
6469 - unsigned int *inum, unsigned long *journal_devnum,
6470 - ext4_fsblk_t *n_blocks_count, int is_remount)
6471 +static int parse_options(char *options, struct super_block *sb,
6472 + unsigned int *inum, unsigned long *journal_devnum,
6473 + ext4_fsblk_t *n_blocks_count, int is_remount)
6475 struct ext4_sb_info *sbi = EXT4_SB(sb);
6478 substring_t args[MAX_OPT_ARGS];
6481 @@ -990,11 +1004,12 @@ static int parse_options (char *options, struct super_block *sb,
6485 + ext4_fsblk_t last_block;
6490 - while ((p = strsep (&options, ",")) != NULL) {
6491 + while ((p = strsep(&options, ",")) != NULL) {
6495 @@ -1002,16 +1017,16 @@ static int parse_options (char *options, struct super_block *sb,
6496 token = match_token(p, tokens, args);
6499 - clear_opt (sbi->s_mount_opt, MINIX_DF);
6500 + clear_opt(sbi->s_mount_opt, MINIX_DF);
6503 - set_opt (sbi->s_mount_opt, MINIX_DF);
6504 + set_opt(sbi->s_mount_opt, MINIX_DF);
6507 - set_opt (sbi->s_mount_opt, GRPID);
6508 + set_opt(sbi->s_mount_opt, GRPID);
6511 - clear_opt (sbi->s_mount_opt, GRPID);
6512 + clear_opt(sbi->s_mount_opt, GRPID);
6515 if (match_int(&args[0], &option))
6516 @@ -1028,41 +1043,41 @@ static int parse_options (char *options, struct super_block *sb,
6517 /* *sb_block = match_int(&args[0]); */
6520 - clear_opt (sbi->s_mount_opt, ERRORS_CONT);
6521 - clear_opt (sbi->s_mount_opt, ERRORS_RO);
6522 - set_opt (sbi->s_mount_opt, ERRORS_PANIC);
6523 + clear_opt(sbi->s_mount_opt, ERRORS_CONT);
6524 + clear_opt(sbi->s_mount_opt, ERRORS_RO);
6525 + set_opt(sbi->s_mount_opt, ERRORS_PANIC);
6528 - clear_opt (sbi->s_mount_opt, ERRORS_CONT);
6529 - clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
6530 - set_opt (sbi->s_mount_opt, ERRORS_RO);
6531 + clear_opt(sbi->s_mount_opt, ERRORS_CONT);
6532 + clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
6533 + set_opt(sbi->s_mount_opt, ERRORS_RO);
6536 - clear_opt (sbi->s_mount_opt, ERRORS_RO);
6537 - clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
6538 - set_opt (sbi->s_mount_opt, ERRORS_CONT);
6539 + clear_opt(sbi->s_mount_opt, ERRORS_RO);
6540 + clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
6541 + set_opt(sbi->s_mount_opt, ERRORS_CONT);
6544 - set_opt (sbi->s_mount_opt, NO_UID32);
6545 + set_opt(sbi->s_mount_opt, NO_UID32);
6548 - clear_opt (sbi->s_mount_opt, CHECK);
6549 + clear_opt(sbi->s_mount_opt, CHECK);
6552 - set_opt (sbi->s_mount_opt, DEBUG);
6553 + set_opt(sbi->s_mount_opt, DEBUG);
6556 - set_opt (sbi->s_mount_opt, OLDALLOC);
6557 + set_opt(sbi->s_mount_opt, OLDALLOC);
6560 - clear_opt (sbi->s_mount_opt, OLDALLOC);
6561 + clear_opt(sbi->s_mount_opt, OLDALLOC);
6563 #ifdef CONFIG_EXT4DEV_FS_XATTR
6564 case Opt_user_xattr:
6565 - set_opt (sbi->s_mount_opt, XATTR_USER);
6566 + set_opt(sbi->s_mount_opt, XATTR_USER);
6568 case Opt_nouser_xattr:
6569 - clear_opt (sbi->s_mount_opt, XATTR_USER);
6570 + clear_opt(sbi->s_mount_opt, XATTR_USER);
6573 case Opt_user_xattr:
6574 @@ -1100,7 +1115,7 @@ static int parse_options (char *options, struct super_block *sb,
6575 "journal on remount\n");
6578 - set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
6579 + set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
6581 case Opt_journal_inum:
6583 @@ -1130,7 +1145,7 @@ static int parse_options (char *options, struct super_block *sb,
6584 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
6587 - set_opt (sbi->s_mount_opt, NOLOAD);
6588 + set_opt(sbi->s_mount_opt, NOLOAD);
6591 if (match_int(&args[0], &option))
6592 @@ -1309,15 +1324,39 @@ set_qf_format:
6593 clear_opt(sbi->s_mount_opt, NOBH);
6596 - set_opt (sbi->s_mount_opt, EXTENTS);
6597 + if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
6598 + EXT4_FEATURE_INCOMPAT_EXTENTS)) {
6599 + ext4_warning(sb, __func__,
6600 + "extents feature not enabled "
6601 + "on this filesystem, use tune2fs\n");
6604 + set_opt(sbi->s_mount_opt, EXTENTS);
6607 - clear_opt (sbi->s_mount_opt, EXTENTS);
6609 + * When e2fsprogs support resizing an already existing
6610 + * ext3 file system to greater than 2**32 we need to
6611 + * add support to block allocator to handle growing
6612 + * already existing block mapped inode so that blocks
6613 + * allocated for them fall within 2**32
6615 + last_block = ext4_blocks_count(sbi->s_es) - 1;
6616 + if (last_block > 0xffffffffULL) {
6617 + printk(KERN_ERR "EXT4-fs: Filesystem too "
6618 + "large to mount with "
6619 + "-o noextents options\n");
6622 + clear_opt(sbi->s_mount_opt, EXTENTS);
6625 set_opt(sbi->s_mount_opt, I_VERSION);
6626 sb->s_flags |= MS_I_VERSION;
6628 + case Opt_nodelalloc:
6629 + clear_opt(sbi->s_mount_opt, DELALLOC);
6632 set_opt(sbi->s_mount_opt, MBALLOC);
6634 @@ -1331,10 +1370,13 @@ set_qf_format:
6636 sbi->s_stripe = option;
6638 + case Opt_delalloc:
6639 + set_opt(sbi->s_mount_opt, DELALLOC);
6643 - "EXT4-fs: Unrecognized mount option \"%s\" "
6644 - "or missing value\n", p);
6646 + "EXT4-fs: Unrecognized mount option \"%s\" "
6647 + "or missing value\n", p);
6651 @@ -1381,31 +1423,31 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
6654 if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
6655 - printk (KERN_ERR "EXT4-fs warning: revision level too high, "
6656 - "forcing read-only mode\n");
6657 + printk(KERN_ERR "EXT4-fs warning: revision level too high, "
6658 + "forcing read-only mode\n");
6663 if (!(sbi->s_mount_state & EXT4_VALID_FS))
6664 - printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
6665 - "running e2fsck is recommended\n");
6666 + printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
6667 + "running e2fsck is recommended\n");
6668 else if ((sbi->s_mount_state & EXT4_ERROR_FS))
6669 - printk (KERN_WARNING
6670 - "EXT4-fs warning: mounting fs with errors, "
6671 - "running e2fsck is recommended\n");
6672 + printk(KERN_WARNING
6673 + "EXT4-fs warning: mounting fs with errors, "
6674 + "running e2fsck is recommended\n");
6675 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
6676 le16_to_cpu(es->s_mnt_count) >=
6677 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
6678 - printk (KERN_WARNING
6679 - "EXT4-fs warning: maximal mount count reached, "
6680 - "running e2fsck is recommended\n");
6681 + printk(KERN_WARNING
6682 + "EXT4-fs warning: maximal mount count reached, "
6683 + "running e2fsck is recommended\n");
6684 else if (le32_to_cpu(es->s_checkinterval) &&
6685 (le32_to_cpu(es->s_lastcheck) +
6686 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
6687 - printk (KERN_WARNING
6688 - "EXT4-fs warning: checktime reached, "
6689 - "running e2fsck is recommended\n");
6690 + printk(KERN_WARNING
6691 + "EXT4-fs warning: checktime reached, "
6692 + "running e2fsck is recommended\n");
6694 /* @@@ We _will_ want to clear the valid bit if we find
6695 * inconsistencies, to force a fsck at reboot. But for
6696 @@ -1443,6 +1485,53 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
6700 +static int ext4_fill_flex_info(struct super_block *sb)
6702 + struct ext4_sb_info *sbi = EXT4_SB(sb);
6703 + struct ext4_group_desc *gdp = NULL;
6704 + struct buffer_head *bh;
6705 + ext4_group_t flex_group_count;
6706 + ext4_group_t flex_group;
6707 + int groups_per_flex = 0;
6708 + __u64 block_bitmap = 0;
6711 + if (!sbi->s_es->s_log_groups_per_flex) {
6712 + sbi->s_log_groups_per_flex = 0;
6716 + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
6717 + groups_per_flex = 1 << sbi->s_log_groups_per_flex;
6719 + flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
6721 + sbi->s_flex_groups = kzalloc(flex_group_count *
6722 + sizeof(struct flex_groups), GFP_KERNEL);
6723 + if (sbi->s_flex_groups == NULL) {
6724 + printk(KERN_ERR "EXT4-fs: not enough memory for "
6725 + "%lu flex groups\n", flex_group_count);
6729 + gdp = ext4_get_group_desc(sb, 1, &bh);
6730 + block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
6732 + for (i = 0; i < sbi->s_groups_count; i++) {
6733 + gdp = ext4_get_group_desc(sb, i, &bh);
6735 + flex_group = ext4_flex_group(sbi, i);
6736 + sbi->s_flex_groups[flex_group].free_inodes +=
6737 + le16_to_cpu(gdp->bg_free_inodes_count);
6738 + sbi->s_flex_groups[flex_group].free_blocks +=
6739 + le16_to_cpu(gdp->bg_free_blocks_count);
6747 __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
6748 struct ext4_group_desc *gdp)
6750 @@ -1507,16 +1596,14 @@ static int ext4_check_descriptors(struct super_block *sb)
6751 (EXT4_BLOCKS_PER_GROUP(sb) - 1);
6753 block_bitmap = ext4_block_bitmap(sb, gdp);
6754 - if (block_bitmap < first_block || block_bitmap > last_block)
6756 + if (block_bitmap < first_block || block_bitmap > last_block) {
6757 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
6758 "Block bitmap for group %lu not in group "
6759 "(block %llu)!", i, block_bitmap);
6762 inode_bitmap = ext4_inode_bitmap(sb, gdp);
6763 - if (inode_bitmap < first_block || inode_bitmap > last_block)
6765 + if (inode_bitmap < first_block || inode_bitmap > last_block) {
6766 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
6767 "Inode bitmap for group %lu not in group "
6768 "(block %llu)!", i, inode_bitmap);
6769 @@ -1524,26 +1611,28 @@ static int ext4_check_descriptors(struct super_block *sb)
6771 inode_table = ext4_inode_table(sb, gdp);
6772 if (inode_table < first_block ||
6773 - inode_table + sbi->s_itb_per_group - 1 > last_block)
6775 + inode_table + sbi->s_itb_per_group - 1 > last_block) {
6776 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
6777 "Inode table for group %lu not in group "
6778 "(block %llu)!", i, inode_table);
6781 + spin_lock(sb_bgl_lock(sbi, i));
6782 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
6783 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
6784 "Checksum for group %lu failed (%u!=%u)\n",
6785 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
6786 gdp)), le16_to_cpu(gdp->bg_checksum));
6788 + if (!(sb->s_flags & MS_RDONLY))
6791 + spin_unlock(sb_bgl_lock(sbi, i));
6793 first_block += EXT4_BLOCKS_PER_GROUP(sb);
6796 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
6797 - sbi->s_es->s_free_inodes_count=cpu_to_le32(ext4_count_free_inodes(sb));
6798 + sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
6802 @@ -1564,8 +1653,8 @@ static int ext4_check_descriptors(struct super_block *sb)
6803 * e2fsck was run on this filesystem, and it must have already done the orphan
6804 * inode cleanup for us, so we can safely abort without any further action.
6806 -static void ext4_orphan_cleanup (struct super_block * sb,
6807 - struct ext4_super_block * es)
6808 +static void ext4_orphan_cleanup(struct super_block *sb,
6809 + struct ext4_super_block *es)
6811 unsigned int s_flags = sb->s_flags;
6812 int nr_orphans = 0, nr_truncates = 0;
6813 @@ -1642,7 +1731,7 @@ static void ext4_orphan_cleanup (struct super_block * sb,
6814 iput(inode); /* The delete magic happens here! */
6817 -#define PLURAL(x) (x), ((x)==1) ? "" : "s"
6818 +#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
6821 printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
6822 @@ -1809,12 +1898,12 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
6826 -static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6827 - __releases(kernel_sem)
6828 - __acquires(kernel_sem)
6829 +static int ext4_fill_super(struct super_block *sb, void *data, int silent)
6830 + __releases(kernel_lock)
6831 + __acquires(kernel_lock)
6834 - struct buffer_head * bh;
6835 + struct buffer_head *bh;
6836 struct ext4_super_block *es = NULL;
6837 struct ext4_sb_info *sbi;
6839 @@ -1851,11 +1940,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6843 - if (!sb_set_blocksize(sb, blocksize)) {
6844 - printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
6849 * The ext4 superblock will not be buffer aligned for other than 1kB
6850 * block sizes. We need to calculate the offset from buffer start.
6851 @@ -1868,7 +1952,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6854 if (!(bh = sb_bread(sb, logical_sb_block))) {
6855 - printk (KERN_ERR "EXT4-fs: unable to read superblock\n");
6856 + printk(KERN_ERR "EXT4-fs: unable to read superblock\n");
6860 @@ -1919,17 +2003,30 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6863 * turn on extents feature by default in ext4 filesystem
6864 - * User -o noextents to turn it off
6865 + * only if feature flag already set by mkfs or tune2fs.
6866 + * Use -o noextents to turn it off
6868 - set_opt(sbi->s_mount_opt, EXTENTS);
6869 + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
6870 + set_opt(sbi->s_mount_opt, EXTENTS);
6872 + ext4_warning(sb, __func__,
6873 + "extents feature not enabled on this filesystem, "
6874 + "use tune2fs.\n");
6876 - * turn on mballoc feature by default in ext4 filesystem
6877 - * User -o nomballoc to turn it off
6878 + * turn on mballoc code by default in ext4 filesystem
6879 + * Use -o nomballoc to turn it off
6881 set_opt(sbi->s_mount_opt, MBALLOC);
6883 - if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
6886 + * enable delayed allocation by default
6887 + * Use -o nodelalloc to turn it off
6889 + set_opt(sbi->s_mount_opt, DELALLOC);
6892 + if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum,
6896 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
6897 @@ -2004,7 +2101,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6903 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
6904 offset = do_div(logical_sb_block, blocksize);
6905 bh = sb_bread(sb, logical_sb_block);
6906 @@ -2016,8 +2113,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6907 es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
6909 if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
6911 - "EXT4-fs: Magic mismatch, very weird !\n");
6913 + "EXT4-fs: Magic mismatch, very weird !\n");
6917 @@ -2034,9 +2131,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6918 if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
6919 (!is_power_of_2(sbi->s_inode_size)) ||
6920 (sbi->s_inode_size > blocksize)) {
6922 - "EXT4-fs: unsupported inode size: %d\n",
6923 - sbi->s_inode_size);
6925 + "EXT4-fs: unsupported inode size: %d\n",
6926 + sbi->s_inode_size);
6929 if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
6930 @@ -2068,20 +2165,20 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6931 sbi->s_mount_state = le16_to_cpu(es->s_state);
6932 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
6933 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
6934 - for (i=0; i < 4; i++)
6935 + for (i = 0; i < 4; i++)
6936 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
6937 sbi->s_def_hash_version = es->s_def_hash_version;
6939 if (sbi->s_blocks_per_group > blocksize * 8) {
6941 - "EXT4-fs: #blocks per group too big: %lu\n",
6942 - sbi->s_blocks_per_group);
6944 + "EXT4-fs: #blocks per group too big: %lu\n",
6945 + sbi->s_blocks_per_group);
6948 if (sbi->s_inodes_per_group > blocksize * 8) {
6950 - "EXT4-fs: #inodes per group too big: %lu\n",
6951 - sbi->s_inodes_per_group);
6953 + "EXT4-fs: #inodes per group too big: %lu\n",
6954 + sbi->s_inodes_per_group);
6958 @@ -2115,10 +2212,10 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6959 sbi->s_groups_count = blocks_count;
6960 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
6961 EXT4_DESC_PER_BLOCK(sb);
6962 - sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
6963 + sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
6965 if (sbi->s_group_desc == NULL) {
6966 - printk (KERN_ERR "EXT4-fs: not enough memory\n");
6967 + printk(KERN_ERR "EXT4-fs: not enough memory\n");
6971 @@ -2128,16 +2225,24 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6972 block = descriptor_loc(sb, logical_sb_block, i);
6973 sbi->s_group_desc[i] = sb_bread(sb, block);
6974 if (!sbi->s_group_desc[i]) {
6975 - printk (KERN_ERR "EXT4-fs: "
6976 - "can't read group descriptor %d\n", i);
6977 + printk(KERN_ERR "EXT4-fs: "
6978 + "can't read group descriptor %d\n", i);
6983 - if (!ext4_check_descriptors (sb)) {
6984 + if (!ext4_check_descriptors(sb)) {
6985 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
6988 + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
6989 + if (!ext4_fill_flex_info(sb)) {
6991 + "EXT4-fs: unable to initialize "
6992 + "flex_bg meta info!\n");
6993 + goto failed_mount2;
6996 sbi->s_gdb_count = db_count;
6997 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
6998 spin_lock_init(&sbi->s_next_gen_lock);
6999 @@ -2202,11 +2307,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
7000 EXT4_SB(sb)->s_journal->j_failed_commit) {
7001 printk(KERN_CRIT "EXT4-fs error (device %s): "
7002 "ext4_fill_super: Journal transaction "
7003 - "%u is corrupt\n", sb->s_id,
7004 + "%u is corrupt\n", sb->s_id,
7005 EXT4_SB(sb)->s_journal->j_failed_commit);
7006 - if (test_opt (sb, ERRORS_RO)) {
7008 - "Mounting filesystem read-only\n");
7009 + if (test_opt(sb, ERRORS_RO)) {
7011 + "Mounting filesystem read-only\n");
7012 sb->s_flags |= MS_RDONLY;
7013 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
7014 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
7015 @@ -2226,9 +2331,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
7020 - "ext4: No journal on filesystem on %s\n",
7023 + "ext4: No journal on filesystem on %s\n",
7028 @@ -2312,7 +2417,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
7032 - ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY);
7033 + ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY);
7035 /* determine the minimum size of new large inodes, if present */
7036 if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
7037 @@ -2351,12 +2456,19 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
7038 ext4_orphan_cleanup(sb, es);
7039 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
7041 - printk (KERN_INFO "EXT4-fs: recovery complete.\n");
7042 + printk(KERN_INFO "EXT4-fs: recovery complete.\n");
7043 ext4_mark_recovery_complete(sb, es);
7044 - printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
7045 - test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
7046 - test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
7048 + printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
7049 + test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
7050 + test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
7053 + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
7054 + printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
7055 + "requested data journaling mode\n");
7056 + clear_opt(sbi->s_mount_opt, DELALLOC);
7057 + } else if (test_opt(sb, DELALLOC))
7058 + printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
7061 ext4_mb_init(sb, needs_recovery);
7062 @@ -2372,6 +2484,7 @@ cantfind_ext4:
7065 jbd2_journal_destroy(sbi->s_journal);
7066 + sbi->s_journal = NULL;
7068 percpu_counter_destroy(&sbi->s_freeblocks_counter);
7069 percpu_counter_destroy(&sbi->s_freeinodes_counter);
7070 @@ -2461,14 +2574,14 @@ static journal_t *ext4_get_journal(struct super_block *sb,
7071 static journal_t *ext4_get_dev_journal(struct super_block *sb,
7074 - struct buffer_head * bh;
7075 + struct buffer_head *bh;
7079 int hblock, blocksize;
7080 ext4_fsblk_t sb_block;
7081 unsigned long offset;
7082 - struct ext4_super_block * es;
7083 + struct ext4_super_block *es;
7084 struct block_device *bdev;
7086 bdev = ext4_blkdev_get(j_dev);
7087 @@ -2583,8 +2696,8 @@ static int ext4_load_journal(struct super_block *sb,
7088 "unavailable, cannot proceed.\n");
7091 - printk (KERN_INFO "EXT4-fs: write access will "
7092 - "be enabled during recovery.\n");
7093 + printk(KERN_INFO "EXT4-fs: write access will "
7094 + "be enabled during recovery.\n");
7098 @@ -2637,8 +2750,8 @@ static int ext4_load_journal(struct super_block *sb,
7102 -static int ext4_create_journal(struct super_block * sb,
7103 - struct ext4_super_block * es,
7104 +static int ext4_create_journal(struct super_block *sb,
7105 + struct ext4_super_block *es,
7106 unsigned int journal_inum)
7109 @@ -2679,9 +2792,8 @@ static int ext4_create_journal(struct super_block * sb,
7113 -static void ext4_commit_super (struct super_block * sb,
7114 - struct ext4_super_block * es,
7116 +static void ext4_commit_super(struct super_block *sb,
7117 + struct ext4_super_block *es, int sync)
7119 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
7121 @@ -2702,8 +2814,8 @@ static void ext4_commit_super (struct super_block * sb,
7122 * remounting) the filesystem readonly, then we will end up with a
7123 * consistent fs on disk. Record that fact.
7125 -static void ext4_mark_recovery_complete(struct super_block * sb,
7126 - struct ext4_super_block * es)
7127 +static void ext4_mark_recovery_complete(struct super_block *sb,
7128 + struct ext4_super_block *es)
7130 journal_t *journal = EXT4_SB(sb)->s_journal;
7132 @@ -2725,8 +2837,8 @@ static void ext4_mark_recovery_complete(struct super_block * sb,
7133 * has recorded an error from a previous lifetime, move that error to the
7134 * main filesystem now.
7136 -static void ext4_clear_journal_err(struct super_block * sb,
7137 - struct ext4_super_block * es)
7138 +static void ext4_clear_journal_err(struct super_block *sb,
7139 + struct ext4_super_block *es)
7143 @@ -2751,7 +2863,7 @@ static void ext4_clear_journal_err(struct super_block * sb,
7145 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
7146 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
7147 - ext4_commit_super (sb, es, 1);
7148 + ext4_commit_super(sb, es, 1);
7150 jbd2_journal_clear_err(journal);
7152 @@ -2784,7 +2896,7 @@ int ext4_force_commit(struct super_block *sb)
7153 * This implicitly triggers the writebehind on sync().
7156 -static void ext4_write_super (struct super_block * sb)
7157 +static void ext4_write_super(struct super_block *sb)
7159 if (mutex_trylock(&sb->s_lock) != 0)
7161 @@ -2840,13 +2952,14 @@ static void ext4_unlockfs(struct super_block *sb)
7165 -static int ext4_remount (struct super_block * sb, int * flags, char * data)
7166 +static int ext4_remount(struct super_block *sb, int *flags, char *data)
7168 - struct ext4_super_block * es;
7169 + struct ext4_super_block *es;
7170 struct ext4_sb_info *sbi = EXT4_SB(sb);
7171 ext4_fsblk_t n_blocks_count = 0;
7172 unsigned long old_sb_flags;
7173 struct ext4_mount_options old_opts;
7178 @@ -2925,6 +3038,26 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
7182 + * Make sure the group descriptor checksums
7183 + * are sane. If they aren't, refuse to
7186 + for (g = 0; g < sbi->s_groups_count; g++) {
7187 + struct ext4_group_desc *gdp =
7188 + ext4_get_group_desc(sb, g, NULL);
7190 + if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
7192 + "EXT4-fs: ext4_remount: "
7193 + "Checksum for group %lu failed (%u!=%u)\n",
7194 + g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
7195 + le16_to_cpu(gdp->bg_checksum));
7197 + goto restore_opts;
7202 * If we have an unprocessed orphan list hanging
7203 * around from a previously readonly bdev mount,
7204 * require a full umount/remount for now.
7205 @@ -2949,7 +3082,7 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
7206 sbi->s_mount_state = le16_to_cpu(es->s_state);
7207 if ((err = ext4_group_extend(sb, es, n_blocks_count)))
7209 - if (!ext4_setup_super (sb, es, 0))
7210 + if (!ext4_setup_super(sb, es, 0))
7211 sb->s_flags &= ~MS_RDONLY;
7214 @@ -2979,7 +3112,7 @@ restore_opts:
7218 -static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
7219 +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
7221 struct super_block *sb = dentry->d_sb;
7222 struct ext4_sb_info *sbi = EXT4_SB(sb);
7223 @@ -3217,12 +3350,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
7225 /* Journaling quota? */
7226 if (EXT4_SB(sb)->s_qf_names[type]) {
7227 - /* Quotafile not of fs root? */
7228 + /* Quotafile not in fs root? */
7229 if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
7231 "EXT4-fs: Quota file not on filesystem root. "
7232 "Journaled quota will not work.\n");
7237 * When we journal data on quota file, we have to flush journal to see
7238 @@ -3325,7 +3458,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
7239 err = ext4_journal_dirty_metadata(handle, bh);
7241 /* Always do at least ordered writes for quotas */
7242 - err = ext4_journal_dirty_data(handle, bh);
7243 + err = ext4_jbd2_file_inode(handle, inode);
7244 mark_buffer_dirty(bh);
7247 diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
7248 index ff08633..8954208 100644
7249 --- a/fs/ext4/xattr.c
7250 +++ b/fs/ext4/xattr.c
7251 @@ -810,7 +810,7 @@ inserted:
7252 /* We need to allocate a new block */
7253 ext4_fsblk_t goal = ext4_group_first_block_no(sb,
7254 EXT4_I(inode)->i_block_group);
7255 - ext4_fsblk_t block = ext4_new_block(handle, inode,
7256 + ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
7260 @@ -1512,7 +1512,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
7261 char *name = entry->e_name;
7264 - for (n=0; n < entry->e_name_len; n++) {
7265 + for (n = 0; n < entry->e_name_len; n++) {
7266 hash = (hash << NAME_HASH_SHIFT) ^
7267 (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
7269 diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
7270 index fff3338..ac1a52c 100644
7271 --- a/fs/ext4/xattr_trusted.c
7272 +++ b/fs/ext4/xattr_trusted.c
7277 -#define XATTR_TRUSTED_PREFIX "trusted."
7280 ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
7281 const char *name, size_t name_len)
7283 - const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
7284 + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
7285 const size_t total_len = prefix_len + name_len + 1;
7287 if (!capable(CAP_SYS_ADMIN))
7288 diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
7289 index 67be723..d91aa61 100644
7290 --- a/fs/ext4/xattr_user.c
7291 +++ b/fs/ext4/xattr_user.c
7296 -#define XATTR_USER_PREFIX "user."
7299 ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
7300 const char *name, size_t name_len)
7302 - const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
7303 + const size_t prefix_len = XATTR_USER_PREFIX_LEN;
7304 const size_t total_len = prefix_len + name_len + 1;
7306 if (!test_opt(inode->i_sb, XATTR_USER))
7307 diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
7308 index 6914598..91389c8 100644
7309 --- a/fs/jbd2/checkpoint.c
7310 +++ b/fs/jbd2/checkpoint.c
7311 @@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
7313 J_ASSERT(transaction->t_state == T_FINISHED);
7314 J_ASSERT(transaction->t_buffers == NULL);
7315 - J_ASSERT(transaction->t_sync_datalist == NULL);
7316 J_ASSERT(transaction->t_forget == NULL);
7317 J_ASSERT(transaction->t_iobuf_list == NULL);
7318 J_ASSERT(transaction->t_shadow_list == NULL);
7319 diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
7320 index a2ed72f..adf0395 100644
7321 --- a/fs/jbd2/commit.c
7322 +++ b/fs/jbd2/commit.c
7324 #include <linux/pagemap.h>
7325 #include <linux/jiffies.h>
7326 #include <linux/crc32.h>
7327 +#include <linux/writeback.h>
7328 +#include <linux/backing-dev.h>
7331 * Default IO end handler for temporary BJ_IO buffer_heads.
7332 @@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
7336 - * When an ext3-ordered file is truncated, it is possible that many pages are
7337 - * not sucessfully freed, because they are attached to a committing transaction.
7338 + * When an ext4 file is truncated, it is possible that some pages are not
7339 + * successfully freed, because they are attached to a committing transaction.
7340 * After the transaction commits, these pages are left on the LRU, with no
7341 * ->mapping, and with attached buffers. These pages are trivially reclaimable
7342 * by the VM, but their apparent absence upsets the VM accounting, and it makes
7343 @@ -80,21 +82,6 @@ nope:
7347 - * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
7348 - * held. For ranking reasons we must trylock. If we lose, schedule away and
7349 - * return 0. j_list_lock is dropped in this case.
7351 -static int inverted_lock(journal_t *journal, struct buffer_head *bh)
7353 - if (!jbd_trylock_bh_state(bh)) {
7354 - spin_unlock(&journal->j_list_lock);
7362 * Done it all: now submit the commit record. We should have
7363 * cleaned up our previous buffers by now, so if we are in abort
7364 * mode we can now just skip the rest of the journal write
7365 @@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
7366 struct buffer_head *bh;
7368 int barrier_done = 0;
7369 + struct timespec now = current_kernel_time();
7371 if (is_journal_aborted(journal))
7373 @@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
7374 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
7375 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
7376 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
7377 + tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
7378 + tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
7380 if (JBD2_HAS_COMPAT_FEATURE(journal,
7381 JBD2_FEATURE_COMPAT_CHECKSUM)) {
7382 @@ -197,159 +187,114 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
7386 - * Wait for all submitted IO to complete.
7387 + * write the filemap data using writepage() address_space_operations.
7388 + * We don't do block allocation here even for delalloc. We don't
7389 + * use writepages() because with delayed allocation we may be doing
7390 + * block allocation in writepages().
7392 -static int journal_wait_on_locked_list(journal_t *journal,
7393 - transaction_t *commit_transaction)
7394 +static int journal_submit_inode_data_buffers(struct address_space *mapping)
7397 - struct journal_head *jh;
7399 - while (commit_transaction->t_locked_list) {
7400 - struct buffer_head *bh;
7402 - jh = commit_transaction->t_locked_list->b_tprev;
7405 - if (buffer_locked(bh)) {
7406 - spin_unlock(&journal->j_list_lock);
7407 - wait_on_buffer(bh);
7408 - if (unlikely(!buffer_uptodate(bh)))
7410 - spin_lock(&journal->j_list_lock);
7412 - if (!inverted_lock(journal, bh)) {
7414 - spin_lock(&journal->j_list_lock);
7417 - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
7418 - __jbd2_journal_unfile_buffer(jh);
7419 - jbd_unlock_bh_state(bh);
7420 - jbd2_journal_remove_journal_head(bh);
7423 - jbd_unlock_bh_state(bh);
7426 - cond_resched_lock(&journal->j_list_lock);
7429 + struct writeback_control wbc = {
7430 + .sync_mode = WB_SYNC_ALL,
7431 + .nr_to_write = mapping->nrpages * 2,
7433 + .range_end = i_size_read(mapping->host),
7434 + .for_writepages = 1,
7437 + ret = generic_writepages(mapping, &wbc);
7442 -static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
7444 + * Submit all the data buffers of inode associated with the transaction to
7447 + * We are in a committing transaction. Therefore no new inode can be added to
7448 + * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
7449 + * operate on from being released while we write out pages.
7451 +static int journal_submit_data_buffers(journal_t *journal,
7452 + transaction_t *commit_transaction)
7455 + struct jbd2_inode *jinode;
7457 + struct address_space *mapping;
7459 - for (i = 0; i < bufs; i++) {
7460 - wbuf[i]->b_end_io = end_buffer_write_sync;
7461 - /* We use-up our safety reference in submit_bh() */
7462 - submit_bh(WRITE, wbuf[i]);
7463 + spin_lock(&journal->j_list_lock);
7464 + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
7465 + mapping = jinode->i_vfs_inode->i_mapping;
7466 + jinode->i_flags |= JI_COMMIT_RUNNING;
7467 + spin_unlock(&journal->j_list_lock);
7469 + * submit the inode data buffers. We use writepage
7470 + * instead of writepages. Because writepages can do
7471 + * block allocation with delalloc. We need to write
7472 + * only allocated blocks here.
7474 + err = journal_submit_inode_data_buffers(mapping);
7477 + spin_lock(&journal->j_list_lock);
7478 + J_ASSERT(jinode->i_transaction == commit_transaction);
7479 + jinode->i_flags &= ~JI_COMMIT_RUNNING;
7480 + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
7482 + spin_unlock(&journal->j_list_lock);
7487 - * Submit all the data buffers to disk
7488 + * Wait for data submitted for writeout, refile inodes to proper
7489 + * transaction if needed.
7492 -static void journal_submit_data_buffers(journal_t *journal,
7493 - transaction_t *commit_transaction)
7494 +static int journal_finish_inode_data_buffers(journal_t *journal,
7495 + transaction_t *commit_transaction)
7497 - struct journal_head *jh;
7498 - struct buffer_head *bh;
7501 - struct buffer_head **wbuf = journal->j_wbuf;
7502 + struct jbd2_inode *jinode, *next_i;
7506 - * Whenever we unlock the journal and sleep, things can get added
7507 - * onto ->t_sync_datalist, so we have to keep looping back to
7508 - * write_out_data until we *know* that the list is empty.
7510 - * Cleanup any flushed data buffers from the data list. Even in
7511 - * abort mode, we want to flush this out as soon as possible.
7515 + /* For locking, see the comment in journal_submit_data_buffers() */
7516 spin_lock(&journal->j_list_lock);
7518 - while (commit_transaction->t_sync_datalist) {
7519 - jh = commit_transaction->t_sync_datalist;
7523 - /* Get reference just to make sure buffer does not disappear
7524 - * when we are forced to drop various locks */
7526 - /* If the buffer is dirty, we need to submit IO and hence
7527 - * we need the buffer lock. We try to lock the buffer without
7528 - * blocking. If we fail, we need to drop j_list_lock and do
7529 - * blocking lock_buffer().
7531 - if (buffer_dirty(bh)) {
7532 - if (test_set_buffer_locked(bh)) {
7533 - BUFFER_TRACE(bh, "needs blocking lock");
7534 - spin_unlock(&journal->j_list_lock);
7535 - /* Write out all data to prevent deadlocks */
7536 - journal_do_submit_data(wbuf, bufs);
7539 - spin_lock(&journal->j_list_lock);
7543 - /* We have to get bh_state lock. Again out of order, sigh. */
7544 - if (!inverted_lock(journal, bh)) {
7545 - jbd_lock_bh_state(bh);
7546 - spin_lock(&journal->j_list_lock);
7548 - /* Someone already cleaned up the buffer? */
7549 - if (!buffer_jbd(bh)
7550 - || jh->b_transaction != commit_transaction
7551 - || jh->b_jlist != BJ_SyncData) {
7552 - jbd_unlock_bh_state(bh);
7554 - unlock_buffer(bh);
7555 - BUFFER_TRACE(bh, "already cleaned up");
7559 - if (locked && test_clear_buffer_dirty(bh)) {
7560 - BUFFER_TRACE(bh, "needs writeout, adding to array");
7561 - wbuf[bufs++] = bh;
7562 - __jbd2_journal_file_buffer(jh, commit_transaction,
7564 - jbd_unlock_bh_state(bh);
7565 - if (bufs == journal->j_wbufsize) {
7566 - spin_unlock(&journal->j_list_lock);
7567 - journal_do_submit_data(wbuf, bufs);
7569 - goto write_out_data;
7571 - } else if (!locked && buffer_locked(bh)) {
7572 - __jbd2_journal_file_buffer(jh, commit_transaction,
7574 - jbd_unlock_bh_state(bh);
7577 - BUFFER_TRACE(bh, "writeout complete: unfile");
7578 - __jbd2_journal_unfile_buffer(jh);
7579 - jbd_unlock_bh_state(bh);
7581 - unlock_buffer(bh);
7582 - jbd2_journal_remove_journal_head(bh);
7583 - /* Once for our safety reference, once for
7584 - * jbd2_journal_remove_journal_head() */
7587 + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
7588 + jinode->i_flags |= JI_COMMIT_RUNNING;
7589 + spin_unlock(&journal->j_list_lock);
7590 + err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
7593 + * Because AS_EIO is cleared by
7594 + * wait_on_page_writeback_range(), set it again so
7595 + * that user process can get -EIO from fsync().
7598 + &jinode->i_vfs_inode->i_mapping->flags);
7603 + spin_lock(&journal->j_list_lock);
7604 + jinode->i_flags &= ~JI_COMMIT_RUNNING;
7605 + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
7608 - if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
7609 - spin_unlock(&journal->j_list_lock);
7610 - goto write_out_data;
7611 + /* Now refile inode to proper lists */
7612 + list_for_each_entry_safe(jinode, next_i,
7613 + &commit_transaction->t_inode_list, i_list) {
7614 + list_del(&jinode->i_list);
7615 + if (jinode->i_next_transaction) {
7616 + jinode->i_transaction = jinode->i_next_transaction;
7617 + jinode->i_next_transaction = NULL;
7618 + list_add(&jinode->i_list,
7619 + &jinode->i_transaction->t_inode_list);
7621 + jinode->i_transaction = NULL;
7624 spin_unlock(&journal->j_list_lock);
7625 - journal_do_submit_data(wbuf, bufs);
7630 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
7631 @@ -524,21 +469,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
7632 * Now start flushing things to disk, in the order they appear
7633 * on the transaction lists. Data blocks go first.
7636 - journal_submit_data_buffers(journal, commit_transaction);
7639 - * Wait for all previously submitted IO to complete if commit
7640 - * record is to be written synchronously.
7642 - spin_lock(&journal->j_list_lock);
7643 - if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
7644 - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
7645 - err = journal_wait_on_locked_list(journal,
7646 - commit_transaction);
7648 - spin_unlock(&journal->j_list_lock);
7650 + err = journal_submit_data_buffers(journal, commit_transaction);
7652 jbd2_journal_abort(journal, err);
7654 @@ -547,16 +478,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
7655 jbd_debug(3, "JBD: commit phase 2\n");
7658 - * If we found any dirty or locked buffers, then we should have
7659 - * looped back up to the write_out_data label. If there weren't
7660 - * any then journal_clean_data_list should have wiped the list
7661 - * clean by now, so check that it is in fact empty.
7663 - J_ASSERT (commit_transaction->t_sync_datalist == NULL);
7665 - jbd_debug (3, "JBD: commit phase 3\n");
7668 * Way to go: we have now written out all of the data for a
7669 * transaction! Now comes the tricky part: we need to write out
7670 * metadata. Loop over the transaction's entire buffer list:
7671 @@ -574,6 +495,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
7672 J_ASSERT(commit_transaction->t_nr_buffers <=
7673 commit_transaction->t_outstanding_credits);
7678 while (commit_transaction->t_buffers) {
7679 @@ -748,13 +670,23 @@ start_journal_io:
7682 __jbd2_journal_abort_hard(journal);
7685 - spin_lock(&journal->j_list_lock);
7686 - err = journal_wait_on_locked_list(journal,
7687 - commit_transaction);
7688 - spin_unlock(&journal->j_list_lock);
7690 - __jbd2_journal_abort_hard(journal);
7692 + * This is the right place to wait for data buffers both for ASYNC
7693 + * and !ASYNC commit. If commit is ASYNC, we need to wait only after
7694 + * the commit block went to disk (which happens above). If commit is
7695 + * SYNC, we need to wait for data buffers before we start writing
7696 + * commit block, which happens below in such setting.
7698 + err = journal_finish_inode_data_buffers(journal, commit_transaction);
7700 + char b[BDEVNAME_SIZE];
7702 + printk(KERN_WARNING
7703 + "JBD2: Detected IO errors while flushing file data "
7704 + "on %s\n", bdevname(journal->j_fs_dev, b));
7708 /* Lo and behold: we have just managed to send a transaction to
7709 @@ -768,7 +700,7 @@ start_journal_io:
7710 so we incur less scheduling load.
7713 - jbd_debug(3, "JBD: commit phase 4\n");
7714 + jbd_debug(3, "JBD: commit phase 3\n");
7717 * akpm: these are BJ_IO, and j_list_lock is not needed.
7718 @@ -827,7 +759,7 @@ wait_for_iobuf:
7720 J_ASSERT (commit_transaction->t_shadow_list == NULL);
7722 - jbd_debug(3, "JBD: commit phase 5\n");
7723 + jbd_debug(3, "JBD: commit phase 4\n");
7725 /* Here we wait for the revoke record and descriptor record buffers */
7727 @@ -854,7 +786,7 @@ wait_for_iobuf:
7728 /* AKPM: bforget here */
7731 - jbd_debug(3, "JBD: commit phase 6\n");
7732 + jbd_debug(3, "JBD: commit phase 5\n");
7734 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
7735 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
7736 @@ -874,9 +806,9 @@ wait_for_iobuf:
7737 transaction can be removed from any checkpoint list it was on
7740 - jbd_debug(3, "JBD: commit phase 7\n");
7741 + jbd_debug(3, "JBD: commit phase 6\n");
7743 - J_ASSERT(commit_transaction->t_sync_datalist == NULL);
7744 + J_ASSERT(list_empty(&commit_transaction->t_inode_list));
7745 J_ASSERT(commit_transaction->t_buffers == NULL);
7746 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
7747 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
7748 @@ -997,7 +929,7 @@ restart_loop:
7750 /* Done with this transaction! */
7752 - jbd_debug(3, "JBD: commit phase 8\n");
7753 + jbd_debug(3, "JBD: commit phase 7\n");
7755 J_ASSERT(commit_transaction->t_state == T_COMMIT);
7757 diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
7758 index 2e24567..8207a01 100644
7759 --- a/fs/jbd2/journal.c
7760 +++ b/fs/jbd2/journal.c
7761 @@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
7762 EXPORT_SYMBOL(jbd2_journal_get_write_access);
7763 EXPORT_SYMBOL(jbd2_journal_get_create_access);
7764 EXPORT_SYMBOL(jbd2_journal_get_undo_access);
7765 -EXPORT_SYMBOL(jbd2_journal_dirty_data);
7766 EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
7767 EXPORT_SYMBOL(jbd2_journal_release_buffer);
7768 EXPORT_SYMBOL(jbd2_journal_forget);
7769 @@ -69,7 +68,6 @@ EXPORT_SYMBOL(jbd2_journal_set_features);
7770 EXPORT_SYMBOL(jbd2_journal_create);
7771 EXPORT_SYMBOL(jbd2_journal_load);
7772 EXPORT_SYMBOL(jbd2_journal_destroy);
7773 -EXPORT_SYMBOL(jbd2_journal_update_superblock);
7774 EXPORT_SYMBOL(jbd2_journal_abort);
7775 EXPORT_SYMBOL(jbd2_journal_errno);
7776 EXPORT_SYMBOL(jbd2_journal_ack_err);
7777 @@ -82,6 +80,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
7778 EXPORT_SYMBOL(jbd2_journal_invalidatepage);
7779 EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
7780 EXPORT_SYMBOL(jbd2_journal_force_commit);
7781 +EXPORT_SYMBOL(jbd2_journal_file_inode);
7782 +EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
7783 +EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
7784 +EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
7786 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
7787 static void __journal_abort_soft (journal_t *journal, int errno);
7788 @@ -2195,6 +2197,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
7792 + * Initialize jbd inode head
7794 +void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
7796 + jinode->i_transaction = NULL;
7797 + jinode->i_next_transaction = NULL;
7798 + jinode->i_vfs_inode = inode;
7799 + jinode->i_flags = 0;
7800 + INIT_LIST_HEAD(&jinode->i_list);
7804 + * Function to be called before we start removing inode from memory (i.e.,
7805 + * clear_inode() is a fine place to be called from). It removes inode from
7806 + * transaction's lists.
7808 +void jbd2_journal_release_jbd_inode(journal_t *journal,
7809 + struct jbd2_inode *jinode)
7816 + spin_lock(&journal->j_list_lock);
7817 + /* Is commit writing out inode - we have to wait */
7818 + if (jinode->i_flags & JI_COMMIT_RUNNING) {
7819 + wait_queue_head_t *wq;
7820 + DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
7821 + wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
7822 + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
7823 + spin_unlock(&journal->j_list_lock);
7825 + finish_wait(wq, &wait.wait);
7829 + /* Do we need to wait for data writeback? */
7830 + if (journal->j_committing_transaction == jinode->i_transaction)
7832 + if (jinode->i_transaction) {
7833 + list_del(&jinode->i_list);
7834 + jinode->i_transaction = NULL;
7836 + spin_unlock(&journal->j_list_lock);
7842 #ifdef CONFIG_JBD2_DEBUG
7843 diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
7844 index d6e006e..4f7cadb 100644
7845 --- a/fs/jbd2/transaction.c
7846 +++ b/fs/jbd2/transaction.c
7847 @@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
7848 * new transaction and we can't block without protecting against other
7849 * processes trying to touch the journal while it is in transition.
7851 - * Called under j_state_lock
7854 static transaction_t *
7855 @@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
7856 transaction->t_tid = journal->j_transaction_sequence++;
7857 transaction->t_expires = jiffies + journal->j_commit_interval;
7858 spin_lock_init(&transaction->t_handle_lock);
7859 + INIT_LIST_HEAD(&transaction->t_inode_list);
7861 /* Set up the commit timer for the new transaction. */
7862 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
7863 @@ -943,183 +943,6 @@ out:
7867 - * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
7868 - * needs to be flushed before we can commit the
7869 - * current transaction.
7870 - * @handle: transaction
7871 - * @bh: bufferhead to mark
7873 - * The buffer is placed on the transaction's data list and is marked as
7874 - * belonging to the transaction.
7876 - * Returns error number or 0 on success.
7878 - * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
7881 -int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
7883 - journal_t *journal = handle->h_transaction->t_journal;
7884 - int need_brelse = 0;
7885 - struct journal_head *jh;
7887 - if (is_handle_aborted(handle))
7890 - jh = jbd2_journal_add_journal_head(bh);
7891 - JBUFFER_TRACE(jh, "entry");
7894 - * The buffer could *already* be dirty. Writeout can start
7897 - jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
7900 - * What if the buffer is already part of a running transaction?
7902 - * There are two cases:
7903 - * 1) It is part of the current running transaction. Refile it,
7904 - * just in case we have allocated it as metadata, deallocated
7905 - * it, then reallocated it as data.
7906 - * 2) It is part of the previous, still-committing transaction.
7907 - * If all we want to do is to guarantee that the buffer will be
7908 - * written to disk before this new transaction commits, then
7909 - * being sure that the *previous* transaction has this same
7910 - * property is sufficient for us! Just leave it on its old
7913 - * In case (2), the buffer must not already exist as metadata
7914 - * --- that would violate write ordering (a transaction is free
7915 - * to write its data at any point, even before the previous
7916 - * committing transaction has committed). The caller must
7917 - * never, ever allow this to happen: there's nothing we can do
7918 - * about it in this layer.
7920 - jbd_lock_bh_state(bh);
7921 - spin_lock(&journal->j_list_lock);
7923 - /* Now that we have bh_state locked, are we really still mapped? */
7924 - if (!buffer_mapped(bh)) {
7925 - JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
7929 - if (jh->b_transaction) {
7930 - JBUFFER_TRACE(jh, "has transaction");
7931 - if (jh->b_transaction != handle->h_transaction) {
7932 - JBUFFER_TRACE(jh, "belongs to older transaction");
7933 - J_ASSERT_JH(jh, jh->b_transaction ==
7934 - journal->j_committing_transaction);
7936 - /* @@@ IS THIS TRUE ? */
7938 - * Not any more. Scenario: someone does a write()
7939 - * in data=journal mode. The buffer's transaction has
7940 - * moved into commit. Then someone does another
7941 - * write() to the file. We do the frozen data copyout
7942 - * and set b_next_transaction to point to j_running_t.
7943 - * And while we're in that state, someone does a
7944 - * writepage() in an attempt to pageout the same area
7945 - * of the file via a shared mapping. At present that
7946 - * calls jbd2_journal_dirty_data(), and we get right here.
7947 - * It may be too late to journal the data. Simply
7948 - * falling through to the next test will suffice: the
7949 - * data will be dirty and wil be checkpointed. The
7950 - * ordering comments in the next comment block still
7953 - //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
7956 - * If we're journalling data, and this buffer was
7957 - * subject to a write(), it could be metadata, forget
7958 - * or shadow against the committing transaction. Now,
7959 - * someone has dirtied the same darn page via a mapping
7960 - * and it is being writepage()'d.
7961 - * We *could* just steal the page from commit, with some
7962 - * fancy locking there. Instead, we just skip it -
7963 - * don't tie the page's buffers to the new transaction
7965 - * Implication: if we crash before the writepage() data
7966 - * is written into the filesystem, recovery will replay
7967 - * the write() data.
7969 - if (jh->b_jlist != BJ_None &&
7970 - jh->b_jlist != BJ_SyncData &&
7971 - jh->b_jlist != BJ_Locked) {
7972 - JBUFFER_TRACE(jh, "Not stealing");
7977 - * This buffer may be undergoing writeout in commit. We
7978 - * can't return from here and let the caller dirty it
7979 - * again because that can cause the write-out loop in
7980 - * commit to never terminate.
7982 - if (buffer_dirty(bh)) {
7984 - spin_unlock(&journal->j_list_lock);
7985 - jbd_unlock_bh_state(bh);
7987 - sync_dirty_buffer(bh);
7988 - jbd_lock_bh_state(bh);
7989 - spin_lock(&journal->j_list_lock);
7990 - /* Since we dropped the lock... */
7991 - if (!buffer_mapped(bh)) {
7992 - JBUFFER_TRACE(jh, "buffer got unmapped");
7995 - /* The buffer may become locked again at any
7996 - time if it is redirtied */
7999 - /* journal_clean_data_list() may have got there first */
8000 - if (jh->b_transaction != NULL) {
8001 - JBUFFER_TRACE(jh, "unfile from commit");
8002 - __jbd2_journal_temp_unlink_buffer(jh);
8003 - /* It still points to the committing
8004 - * transaction; move it to this one so
8005 - * that the refile assert checks are
8007 - jh->b_transaction = handle->h_transaction;
8009 - /* The buffer will be refiled below */
8013 - * Special case --- the buffer might actually have been
8014 - * allocated and then immediately deallocated in the previous,
8015 - * committing transaction, so might still be left on that
8016 - * transaction's metadata lists.
8018 - if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
8019 - JBUFFER_TRACE(jh, "not on correct data list: unfile");
8020 - J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
8021 - __jbd2_journal_temp_unlink_buffer(jh);
8022 - jh->b_transaction = handle->h_transaction;
8023 - JBUFFER_TRACE(jh, "file as data");
8024 - __jbd2_journal_file_buffer(jh, handle->h_transaction,
8028 - JBUFFER_TRACE(jh, "not on a transaction");
8029 - __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
8032 - spin_unlock(&journal->j_list_lock);
8033 - jbd_unlock_bh_state(bh);
8034 - if (need_brelse) {
8035 - BUFFER_TRACE(bh, "brelse");
8038 - JBUFFER_TRACE(jh, "exit");
8039 - jbd2_journal_put_journal_head(jh);
8044 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
8045 * @handle: transaction to add buffer to.
8046 * @bh: buffer to mark
8047 @@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
8048 * Remove a buffer from the appropriate transaction list.
8050 * Note that this function can *change* the value of
8051 - * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
8052 - * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
8053 - * is holding onto a copy of one of thee pointers, it could go bad.
8054 - * Generally the caller needs to re-read the pointer from the transaction_t.
8055 + * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
8056 + * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
8057 + * of these pointers, it could go bad. Generally the caller needs to re-read
8058 + * the pointer from the transaction_t.
8060 * Called under j_list_lock. The journal may not be locked.
8062 @@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
8063 switch (jh->b_jlist) {
8067 - list = &transaction->t_sync_datalist;
8070 transaction->t_nr_buffers--;
8071 J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
8072 @@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
8074 list = &transaction->t_reserved_list;
8077 - list = &transaction->t_locked_list;
8081 __blist_del_buffer(list, jh);
8082 @@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
8085 spin_lock(&journal->j_list_lock);
8086 - if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
8087 - if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
8088 - /* A written-back ordered data buffer */
8089 - JBUFFER_TRACE(jh, "release data");
8090 - __jbd2_journal_unfile_buffer(jh);
8091 - jbd2_journal_remove_journal_head(bh);
8094 - } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
8095 + if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
8096 /* written-back checkpointed metadata buffer */
8097 if (jh->b_jlist == BJ_None) {
8098 JBUFFER_TRACE(jh, "remove from checkpoint list");
8099 @@ -1656,12 +1465,43 @@ out:
8104 + * jbd2_journal_try_to_free_buffers() could race with
8105 + * jbd2_journal_commit_transaction(). The latter might still hold the
8106 + * reference count to the buffers when inspecting them on
8107 + * t_syncdata_list or t_locked_list.
8109 + * jbd2_journal_try_to_free_buffers() will call this function to
8110 + * wait for the current transaction to finish syncing data buffers, before
8111 + * try to free that buffer.
8113 + * Called with journal->j_state_lock held.
8115 +static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
8117 + transaction_t *transaction;
8120 + spin_lock(&journal->j_state_lock);
8121 + transaction = journal->j_committing_transaction;
8123 + if (!transaction) {
8124 + spin_unlock(&journal->j_state_lock);
8128 + tid = transaction->t_tid;
8129 + spin_unlock(&journal->j_state_lock);
8130 + jbd2_log_wait_commit(journal, tid);
8134 * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
8135 * @journal: journal for operation
8136 * @page: to try and free
8137 - * @unused_gfp_mask: unused
8138 + * @gfp_mask: we use the mask to detect how hard should we try to release
8139 + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
8140 + * release the buffers.
8143 * For all the buffers on this page,
8144 @@ -1690,9 +1530,11 @@ out:
8145 * journal_try_to_free_buffer() is changing its state. But that
8146 * cannot happen because we never reallocate freed data as metadata
8147 * while the data is part of a transaction. Yes?
8149 + * Return 0 on failure, 1 on success
8151 int jbd2_journal_try_to_free_buffers(journal_t *journal,
8152 - struct page *page, gfp_t unused_gfp_mask)
8153 + struct page *page, gfp_t gfp_mask)
8155 struct buffer_head *head;
8156 struct buffer_head *bh;
8157 @@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
8159 * We take our own ref against the journal_head here to avoid
8160 * having to add tons of locking around each instance of
8161 - * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
8162 + * jbd2_journal_remove_journal_head() and
8163 + * jbd2_journal_put_journal_head().
8165 jh = jbd2_journal_grab_journal_head(bh);
8167 @@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
8170 } while ((bh = bh->b_this_page) != head);
8172 ret = try_to_free_buffers(page);
8175 + * There are a number of places where jbd2_journal_try_to_free_buffers()
8176 + * could race with jbd2_journal_commit_transaction(), the latter still
8177 + * holds the reference to the buffers to free while processing them.
8178 + * try_to_free_buffers() failed to free those buffers. Some of the
8179 + * callers of releasepage() request page buffers to be dropped, otherwise
8180 + * treat the fail-to-free as errors (such as generic_file_direct_IO())
8182 + * So, if the caller of try_to_release_page() wants the synchronous
8183 + * behaviour (i.e. make sure buffers are dropped upon return),
8184 + * let's wait for the current transaction to finish flush of
8185 + * dirty data buffers, then try to free those buffers again,
8186 + * with the journal locked.
8188 + if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
8189 + jbd2_journal_wait_for_transaction_sync_data(journal);
8190 + ret = try_to_free_buffers(page);
8196 @@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
8197 if (!buffer_jbd(bh))
8198 goto zap_buffer_unlocked;
8200 + /* OK, we have data buffer in journaled mode */
8201 spin_lock(&journal->j_state_lock);
8202 jbd_lock_bh_state(bh);
8203 spin_lock(&journal->j_list_lock);
8204 @@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
8206 } else if (transaction == journal->j_committing_transaction) {
8207 JBUFFER_TRACE(jh, "on committing transaction");
8208 - if (jh->b_jlist == BJ_Locked) {
8210 - * The buffer is on the committing transaction's locked
8211 - * list. We have the buffer locked, so I/O has
8212 - * completed. So we can nail the buffer now.
8214 - may_free = __dispose_buffer(jh, transaction);
8218 * If it is committing, we simply cannot touch it. We
8219 * can remove it's next_transaction pointer from the
8220 @@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
8221 J_ASSERT_JH(jh, !jh->b_committed_data);
8222 J_ASSERT_JH(jh, !jh->b_frozen_data);
8225 - list = &transaction->t_sync_datalist;
8228 transaction->t_nr_buffers++;
8229 list = &transaction->t_buffers;
8230 @@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
8232 list = &transaction->t_reserved_list;
8235 - list = &transaction->t_locked_list;
8239 __blist_add_buffer(list, jh);
8240 @@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
8241 spin_unlock(&journal->j_list_lock);
8246 + * File inode in the inode list of the handle's transaction
8248 +int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
8250 + transaction_t *transaction = handle->h_transaction;
8251 + journal_t *journal = transaction->t_journal;
8253 + if (is_handle_aborted(handle))
8256 + jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
8257 + transaction->t_tid);
8260 + * First check whether inode isn't already on the transaction's
8261 + * lists without taking the lock. Note that this check is safe
8262 + * without the lock as we cannot race with somebody removing inode
8263 + * from the transaction. The reason is that we remove inode from the
8264 + * transaction only in journal_release_jbd_inode() and when we commit
8265 + * the transaction. We are guarded from the first case by holding
8266 + * a reference to the inode. We are safe against the second case
8267 + * because if jinode->i_transaction == transaction, commit code
8268 + * cannot touch the transaction because we hold reference to it,
8269 + * and if jinode->i_next_transaction == transaction, commit code
8270 + * will only file the inode where we want it.
8272 + if (jinode->i_transaction == transaction ||
8273 + jinode->i_next_transaction == transaction)
8276 + spin_lock(&journal->j_list_lock);
8278 + if (jinode->i_transaction == transaction ||
8279 + jinode->i_next_transaction == transaction)
8282 + /* On some different transaction's list - should be
8283 + * the committing one */
8284 + if (jinode->i_transaction) {
8285 + J_ASSERT(jinode->i_next_transaction == NULL);
8286 + J_ASSERT(jinode->i_transaction ==
8287 + journal->j_committing_transaction);
8288 + jinode->i_next_transaction = transaction;
8291 + /* Not on any transaction list... */
8292 + J_ASSERT(!jinode->i_next_transaction);
8293 + jinode->i_transaction = transaction;
8294 + list_add(&jinode->i_list, &transaction->t_inode_list);
8296 + spin_unlock(&journal->j_list_lock);
8302 + * This function must be called when inode is journaled in ordered mode
8303 + * before truncation happens. It starts writeout of truncated part in
8304 + * case it is in the committing transaction so that we stand to ordered
8305 + * mode consistency guarantees.
8307 +int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
8310 + journal_t *journal;
8311 + transaction_t *commit_trans;
8314 + if (!inode->i_transaction && !inode->i_next_transaction)
8316 + journal = inode->i_transaction->t_journal;
8317 + spin_lock(&journal->j_state_lock);
8318 + commit_trans = journal->j_committing_transaction;
8319 + spin_unlock(&journal->j_state_lock);
8320 + if (inode->i_transaction == commit_trans) {
8321 + ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
8322 + new_size, LLONG_MAX);
8324 + jbd2_journal_abort(journal, ret);
8329 diff --git a/fs/mpage.c b/fs/mpage.c
8330 index 235e4d3..dbcc7af 100644
8333 @@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
8337 -static struct bio *mpage_bio_submit(int rw, struct bio *bio)
8338 +struct bio *mpage_bio_submit(int rw, struct bio *bio)
8340 bio->bi_end_io = mpage_end_io_read;
8342 @@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
8343 submit_bio(rw, bio);
8346 +EXPORT_SYMBOL(mpage_bio_submit);
8349 mpage_alloc(struct block_device *bdev,
8350 @@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage);
8351 * written, so it can intelligently allocate a suitably-sized BIO. For now,
8352 * just allocate full-size (16-page) BIOs.
8354 -struct mpage_data {
8356 - sector_t last_block_in_bio;
8357 - get_block_t *get_block;
8358 - unsigned use_writepage;
8361 -static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
8363 +int __mpage_writepage(struct page *page, struct writeback_control *wbc,
8366 struct mpage_data *mpd = data;
8367 struct bio *bio = mpd->bio;
8368 @@ -651,6 +646,7 @@ out:
8372 +EXPORT_SYMBOL(__mpage_writepage);
8375 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
8376 diff --git a/include/linux/fs.h b/include/linux/fs.h
8377 index d8e2762..97f992a 100644
8378 --- a/include/linux/fs.h
8379 +++ b/include/linux/fs.h
8380 @@ -1740,6 +1740,8 @@ extern int wait_on_page_writeback_range(struct address_space *mapping,
8381 pgoff_t start, pgoff_t end);
8382 extern int __filemap_fdatawrite_range(struct address_space *mapping,
8383 loff_t start, loff_t end, int sync_mode);
8384 +extern int filemap_fdatawrite_range(struct address_space *mapping,
8385 + loff_t start, loff_t end);
8387 extern long do_fsync(struct file *file, int datasync);
8388 extern void sync_supers(void);
8389 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
8390 index d147f0f..3dd2090 100644
8391 --- a/include/linux/jbd2.h
8392 +++ b/include/linux/jbd2.h
8393 @@ -168,6 +168,8 @@ struct commit_header {
8394 unsigned char h_chksum_size;
8395 unsigned char h_padding[2];
8396 __be32 h_chksum[JBD2_CHECKSUM_BYTES];
8397 + __be64 h_commit_sec;
8398 + __be32 h_commit_nsec;
8402 @@ -379,6 +381,38 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
8403 bit_spin_unlock(BH_JournalHead, &bh->b_state);
8406 +/* Flags in jbd_inode->i_flags */
8407 +#define __JI_COMMIT_RUNNING 0
8408 +/* Commit of the inode data is in progress. We use this flag to protect us from
8409 + * concurrent deletion of the inode. We cannot take a reference to the inode
8410 + * for this since we cannot afford doing the last iput() on behalf of kjournald
8412 +#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
8415 + * struct jbd2_inode is the structure linking inodes in ordered mode
8416 + * present in a transaction so that we can sync them during commit.
8418 +struct jbd2_inode {
8419 + /* Which transaction does this inode belong to? Either the running
8420 + * transaction or the committing one. [j_list_lock] */
8421 + transaction_t *i_transaction;
8423 + /* Pointer to the running transaction modifying inode's data in case
8424 + * there is already a committing transaction touching it. [j_list_lock] */
8425 + transaction_t *i_next_transaction;
8427 + /* List of inodes in the i_transaction [j_list_lock] */
8428 + struct list_head i_list;
8430 + /* VFS inode this inode belongs to [constant during the lifetime
8431 + * of the structure] */
8432 + struct inode *i_vfs_inode;
8434 + /* Flags of inode [j_list_lock] */
8435 + unsigned int i_flags;
8438 struct jbd2_revoke_table_s;
8441 @@ -509,24 +543,12 @@ struct transaction_s
8442 struct journal_head *t_reserved_list;
8445 - * Doubly-linked circular list of all buffers under writeout during
8446 - * commit [j_list_lock]
8448 - struct journal_head *t_locked_list;
8451 * Doubly-linked circular list of all metadata buffers owned by this
8452 * transaction [j_list_lock]
8454 struct journal_head *t_buffers;
8457 - * Doubly-linked circular list of all data buffers still to be
8458 - * flushed before this transaction can be committed [j_list_lock]
8460 - struct journal_head *t_sync_datalist;
8463 * Doubly-linked circular list of all forget buffers (superseded
8464 * buffers which we can un-checkpoint once this transaction commits)
8466 @@ -565,6 +587,12 @@ struct transaction_s
8467 struct journal_head *t_log_list;
8470 + * List of inodes whose data we've modified in data=ordered mode.
8473 + struct list_head t_inode_list;
8476 * Protects info related to handles
8478 spinlock_t t_handle_lock;
8479 @@ -1004,7 +1032,6 @@ extern int jbd2_journal_extend (handle_t *, int nblocks);
8480 extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
8481 extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
8482 extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
8483 -extern int jbd2_journal_dirty_data (handle_t *, struct buffer_head *);
8484 extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
8485 extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *);
8486 extern int jbd2_journal_forget (handle_t *, struct buffer_head *);
8487 @@ -1044,6 +1071,10 @@ extern void jbd2_journal_ack_err (journal_t *);
8488 extern int jbd2_journal_clear_err (journal_t *);
8489 extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
8490 extern int jbd2_journal_force_commit(journal_t *);
8491 +extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
8492 +extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size);
8493 +extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
8494 +extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);
8497 * journal_head management
8498 @@ -1179,15 +1210,13 @@ static inline int jbd_space_needed(journal_t *journal)
8500 /* journaling buffer types */
8501 #define BJ_None 0 /* Not journaled */
8502 -#define BJ_SyncData 1 /* Normal data: flush before commit */
8503 -#define BJ_Metadata 2 /* Normal journaled metadata */
8504 -#define BJ_Forget 3 /* Buffer superseded by this transaction */
8505 -#define BJ_IO 4 /* Buffer is for temporary IO use */
8506 -#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */
8507 -#define BJ_LogCtl 6 /* Buffer contains log descriptors */
8508 -#define BJ_Reserved 7 /* Buffer is reserved for access by journal */
8509 -#define BJ_Locked 8 /* Locked for I/O during commit */
8511 +#define BJ_Metadata 1 /* Normal journaled metadata */
8512 +#define BJ_Forget 2 /* Buffer superseded by this transaction */
8513 +#define BJ_IO 3 /* Buffer is for temporary IO use */
8514 +#define BJ_Shadow 4 /* Buffer contents being shadowed to the log */
8515 +#define BJ_LogCtl 5 /* Buffer contains log descriptors */
8516 +#define BJ_Reserved 6 /* Buffer is reserved for access by journal */
8519 extern int jbd_blocks_per_page(struct inode *inode);
8521 diff --git a/include/linux/mpage.h b/include/linux/mpage.h
8522 index 068a0c9..5c42821 100644
8523 --- a/include/linux/mpage.h
8524 +++ b/include/linux/mpage.h
8529 +struct mpage_data {
8531 + sector_t last_block_in_bio;
8532 + get_block_t *get_block;
8533 + unsigned use_writepage;
8536 struct writeback_control;
8538 +struct bio *mpage_bio_submit(int rw, struct bio *bio);
8539 int mpage_readpages(struct address_space *mapping, struct list_head *pages,
8540 unsigned nr_pages, get_block_t get_block);
8541 int mpage_readpage(struct page *page, get_block_t get_block);
8542 +int __mpage_writepage(struct page *page, struct writeback_control *wbc,
8544 int mpage_writepages(struct address_space *mapping,
8545 struct writeback_control *wbc, get_block_t get_block);
8546 int mpage_writepage(struct page *page, get_block_t *get_block,
8547 diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
8548 index 9007ccd..2083888 100644
8549 --- a/include/linux/percpu_counter.h
8550 +++ b/include/linux/percpu_counter.h
8551 @@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
8552 void percpu_counter_destroy(struct percpu_counter *fbc);
8553 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
8554 void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
8555 -s64 __percpu_counter_sum(struct percpu_counter *fbc);
8556 +s64 __percpu_counter_sum(struct percpu_counter *fbc, int set);
8558 static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
8560 @@ -44,13 +44,19 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
8562 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
8564 - s64 ret = __percpu_counter_sum(fbc);
8565 + s64 ret = __percpu_counter_sum(fbc, 0);
8566 return ret < 0 ? 0 : ret;
8569 +static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc)
8571 + return __percpu_counter_sum(fbc, 1);
8575 static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
8577 - return __percpu_counter_sum(fbc);
8578 + return __percpu_counter_sum(fbc, 0);
8581 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
8582 diff --git a/include/linux/writeback.h b/include/linux/writeback.h
8583 index f462439..0d8573e 100644
8584 --- a/include/linux/writeback.h
8585 +++ b/include/linux/writeback.h
8586 @@ -63,6 +63,7 @@ struct writeback_control {
8587 unsigned for_writepages:1; /* This is a writepages() call */
8588 unsigned range_cyclic:1; /* range_start is cyclic */
8589 unsigned more_io:1; /* more io to be dispatched */
8590 + unsigned range_cont:1;
8594 diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
8595 index 1191744..4a8ba4b 100644
8596 --- a/lib/percpu_counter.c
8597 +++ b/lib/percpu_counter.c
8598 @@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add);
8599 * Add up all the per-cpu counts, return the result. This is a more accurate
8600 * but much slower version of percpu_counter_read_positive()
8602 -s64 __percpu_counter_sum(struct percpu_counter *fbc)
8603 +s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
8607 @@ -62,7 +62,12 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
8608 for_each_online_cpu(cpu) {
8609 s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
8617 spin_unlock(&fbc->lock);
8620 diff --git a/mm/filemap.c b/mm/filemap.c
8621 index 1e6a7d3..65d9d9e 100644
8624 @@ -236,11 +236,12 @@ int filemap_fdatawrite(struct address_space *mapping)
8626 EXPORT_SYMBOL(filemap_fdatawrite);
8628 -static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
8629 +int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
8632 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
8634 +EXPORT_SYMBOL(filemap_fdatawrite_range);
8637 * filemap_flush - mostly a non-blocking flush
8638 diff --git a/mm/page-writeback.c b/mm/page-writeback.c
8639 index 789b6ad..ded57d5 100644
8640 --- a/mm/page-writeback.c
8641 +++ b/mm/page-writeback.c
8642 @@ -956,6 +956,9 @@ retry:
8644 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
8645 mapping->writeback_index = index;
8647 + if (wbc->range_cont)
8648 + wbc->range_start = index << PAGE_CACHE_SHIFT;
8651 EXPORT_SYMBOL(write_cache_pages);